Merge commit 'refs/top-bases/feature-branch' into feature-branch

commit 8d4f19348a
Author: Brian Behlendorf
Date: 2009-07-06 13:16:34 -07:00
103 changed files with 7631 additions and 4095 deletions

View File

@@ -1 +1 @@
-http://dlc.sun.com/osol/on/downloads/b108/on-src.tar.bz2
+http://dlc.sun.com/osol/on/downloads/b117/on-src.tar.bz2

View File

@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -101,7 +101,9 @@ usage(void)
 (void) fprintf(stderr, " -C cached pool configuration\n");
 (void) fprintf(stderr, " -i intent logs\n");
 (void) fprintf(stderr, " -b block statistics\n");
-(void) fprintf(stderr, " -c checksum all data blocks\n");
+(void) fprintf(stderr, " -m metaslabs\n");
+(void) fprintf(stderr, " -c checksum all metadata (twice for "
+"all data) blocks\n");
 (void) fprintf(stderr, " -s report stats on zdb's I/O\n");
 (void) fprintf(stderr, " -S <user|all>:<cksum_alg|all> -- "
 "dump blkptr signatures\n");
@@ -124,6 +126,11 @@ usage(void)
 exit(1);
 }
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
+ */
 static void
 fatal(const char *fmt, ...)
 {
@@ -135,7 +142,7 @@ fatal(const char *fmt, ...)
 va_end(ap);
 (void) fprintf(stderr, "\n");
-abort();
+exit(1);
 }
 static void
@@ -208,7 +215,7 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 size_t nvsize = *(uint64_t *)data;
 char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
-VERIFY(0 == dmu_read(os, object, 0, nvsize, packed));
+VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
 VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
@@ -434,7 +441,7 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 alloc = 0;
 for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
 VERIFY(0 == dmu_read(os, smo->smo_object, offset,
-sizeof (entry), &entry));
+sizeof (entry), &entry, DMU_READ_PREFETCH));
 if (SM_DEBUG_DECODE(entry)) {
 (void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
 (u_longlong_t)(offset / sizeof (entry)),
@@ -465,6 +472,21 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 }
 }
+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+char maxbuf[5];
+space_map_t *sm = &msp->ms_map;
+avl_tree_t *t = sm->sm_pp_root;
+int free_pct = sm->sm_space * 100 / sm->sm_size;
+nicenum(space_map_maxsize(sm), maxbuf);
+(void) printf("\t %20s %10lu %7s %6s %4s %4d%%\n",
+"segments", avl_numnodes(t), "maxsize", maxbuf,
+"freepct", free_pct);
+}
 static void
 dump_metaslab(metaslab_t *msp)
 {
@@ -475,22 +497,28 @@ dump_metaslab(metaslab_t *msp)
 nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
-if (dump_opt['d'] <= 5) {
-(void) printf("\t%10llx %10llu %5s\n",
-(u_longlong_t)msp->ms_map.sm_start,
-(u_longlong_t)smo->smo_object,
-freebuf);
-return;
-}
 (void) printf(
-"\tvdev %llu offset %08llx spacemap %4llu free %5s\n",
+"\tvdev %5llu offset %12llx spacemap %6llu free %5s\n",
 (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
 (u_longlong_t)smo->smo_object, freebuf);
-ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
-dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+if (dump_opt['m'] > 1) {
+mutex_enter(&msp->ms_lock);
+VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
+SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+dump_metaslab_stats(msp);
+space_map_unload(&msp->ms_map);
+mutex_exit(&msp->ms_lock);
+}
+if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+mutex_enter(&msp->ms_lock);
+dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+mutex_exit(&msp->ms_lock);
+}
 }
 static void
@@ -505,14 +533,12 @@ dump_metaslabs(spa_t *spa)
 for (c = 0; c < rvd->vdev_children; c++) {
 vd = rvd->vdev_child[c];
-(void) printf("\n vdev %llu\n\n", (u_longlong_t)vd->vdev_id);
-if (dump_opt['d'] <= 5) {
-(void) printf("\t%10s %10s %5s\n",
-"offset", "spacemap", "free");
-(void) printf("\t%10s %10s %5s\n",
-"------", "--------", "----");
-}
+(void) printf("\t%-10s %-19s %-15s %-10s\n",
+"vdev", "offset", "spacemap", "free");
+(void) printf("\t%10s %19s %15s %10s\n",
+"----------", "-------------------",
+"---------------", "-------------");
 for (m = 0; m < vd->vdev_ms_count; m++)
 dump_metaslab(vd->vdev_ms[m]);
 (void) printf("\n");
@@ -916,6 +942,7 @@ dump_uidgid(objset_t *os, znode_phys_t *zp)
 /* first find the fuid object. It lives in the master node */
 VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
 8, 1, &fuid_obj) == 0);
+zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
 (void) zfs_fuid_table_load(os, fuid_obj,
 &idx_tree, &domain_tree);
 fuid_table_loaded = B_TRUE;
@@ -1019,6 +1046,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = {
 dump_packed_nvlist, /* FUID nvlist size */
 dump_zap, /* DSL dataset next clones */
 dump_zap, /* DSL scrub queue */
+dump_zap, /* ZFS user/group used */
+dump_zap, /* ZFS user/group quota */
 };
 static void
@@ -1082,6 +1111,14 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
 }
 if (verbosity >= 4) {
+(void) printf("\tdnode flags: %s%s\n",
+(dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
+"USED_BYTES " : "",
+(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
+"USERUSED_ACCOUNTED " : "");
+(void) printf("\tdnode maxblkid: %llu\n",
+(longlong_t)dn->dn_phys->dn_maxblkid);
 object_viewer[doi.doi_bonus_type](os, object, bonus, bsize);
 object_viewer[doi.doi_type](os, object, NULL, 0);
 *print_header = 1;
@@ -1136,7 +1173,7 @@ dump_dir(objset_t *os)
 uint64_t object, object_count;
 uint64_t refdbytes, usedobjs, scratch;
 char numbuf[8];
-char blkbuf[BP_SPRINTF_LEN];
+char blkbuf[BP_SPRINTF_LEN + 20];
 char osname[MAXNAMELEN];
 char *type = "UNKNOWN";
 int verbosity = dump_opt['d'];
@@ -1162,8 +1199,8 @@ dump_dir(objset_t *os)
 nicenum(refdbytes, numbuf);
 if (verbosity >= 4) {
-(void) strcpy(blkbuf, ", rootbp ");
-sprintf_blkptr(blkbuf + strlen(blkbuf),
+(void) sprintf(blkbuf + strlen(blkbuf), ", rootbp ");
+(void) sprintf_blkptr(blkbuf + strlen(blkbuf),
 BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
 } else {
 blkbuf[0] = '\0';
@@ -1198,7 +1235,12 @@ dump_dir(objset_t *os)
 }
 dump_object(os, 0, verbosity, &print_header);
-object_count = 1;
+object_count = 0;
+if (os->os->os_userused_dnode &&
+os->os->os_userused_dnode->dn_type != 0) {
+dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
+dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
+}
 object = 0;
 while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
@@ -1210,8 +1252,10 @@ dump_dir(objset_t *os)
 (void) printf("\n");
-if (error != ESRCH)
-fatal("dmu_object_next() = %d", error);
+if (error != ESRCH) {
+(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
+abort();
+}
 }
 static void
@@ -1394,7 +1438,8 @@ static space_map_ops_t zdb_space_map_ops = {
 zdb_space_map_unload,
 NULL, /* alloc */
 zdb_space_map_claim,
-NULL /* free */
+NULL, /* free */
+NULL /* maxsize */
 };
 static void
@@ -1504,13 +1549,25 @@ zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 {
 zdb_cb_t *zcb = arg;
 char blkbuf[BP_SPRINTF_LEN];
+dmu_object_type_t type;
+boolean_t is_l0_metadata;
 if (bp == NULL)
 return (0);
-zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp));
-if (dump_opt['c'] || dump_opt['S']) {
+type = BP_GET_TYPE(bp);
+zdb_count_block(spa, zcb, bp, type);
+/*
+ * if we do metadata-only checksumming there's no need to checksum
+ * indirect blocks here because it is done during traverse
+ */
+is_l0_metadata = (BP_GET_LEVEL(bp) == 0 && type < DMU_OT_NUMTYPES &&
+dmu_ot[type].ot_metadata);
+if (dump_opt['c'] > 1 || dump_opt['S'] ||
+(dump_opt['c'] && is_l0_metadata)) {
 int ioerr, size;
 void *data;
@@ -1522,7 +1579,7 @@ zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 free(data);
 /* We expect io errors on intent log */
-if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) {
+if (ioerr && type != DMU_OT_INTENT_LOG) {
 zcb->zcb_haderrors = 1;
 zcb->zcb_errors[ioerr]++;
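The -c option is now tiered: a single -c verifies checksums only on level-0 metadata blocks, since indirect blocks are already checksummed during the traversal itself, while repeating the flag extends verification to every data block. Roughly, for a pool assumed to be named tank:

    zdb -c tank    # checksum level-0 metadata blocks only
    zdb -cc tank   # checksum all data blocks as well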
@@ -1570,8 +1627,9 @@ dump_block_stats(spa_t *spa)
 int c, e;
 if (!dump_opt['S']) {
-(void) printf("\nTraversing all blocks %s%s%s%s...\n",
+(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
 (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+(dump_opt['c'] == 1) ? "metadata " : "",
 dump_opt['c'] ? "checksums " : "",
 (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
 !dump_opt['L'] ? "nothing leaked " : "");
@@ -1771,14 +1829,17 @@ dump_zpool(spa_t *spa)
 if (dump_opt['u'])
 dump_uberblock(&spa->spa_uberblock);
-if (dump_opt['d'] || dump_opt['i']) {
+if (dump_opt['d'] || dump_opt['i'] || dump_opt['m']) {
 dump_dir(dp->dp_meta_objset);
 if (dump_opt['d'] >= 3) {
 dump_bplist(dp->dp_meta_objset,
 spa->spa_sync_bplist_obj, "Deferred frees");
 dump_dtl(spa->spa_root_vdev, 0);
-dump_metaslabs(spa);
 }
+if (dump_opt['d'] >= 3 || dump_opt['m'])
+dump_metaslabs(spa);
 (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL,
 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
 }
@@ -2254,13 +2315,14 @@ main(int argc, char **argv)
 dprintf_setup(&argc, argv);
-while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:t:")) != -1) {
+while ((c = getopt(argc, argv, "udibcmsvCLS:U:lRep:t:")) != -1) {
 switch (c) {
 case 'u':
 case 'd':
 case 'i':
 case 'b':
 case 'c':
+case 'm':
 case 's':
 case 'C':
 case 'l':
@@ -2396,7 +2458,7 @@ main(int argc, char **argv)
 }
 if (error == 0)
-error = spa_import_faulted(argv[0],
+error = spa_import_verbatim(argv[0],
 exported_conf, nvl);
 nvlist_free(nvl);

View File

@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
-#pragma ident "%Z%%M% %I% %E% SMI"
 /*
  * Print intent log header and statistics.
  */
@@ -345,8 +343,10 @@ dump_intent_log(zilog_t *zilog)
 if (zh->zh_log.blk_birth == 0 || verbose < 2)
 return;
-(void) printf("\n ZIL header: claim_txg %llu, seq %llu\n",
-(u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_replay_seq);
+(void) printf("\n ZIL header: claim_txg %llu, claim_seq %llu",
+(u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+(void) printf(" replay_seq %llu, flags 0x%llx\n",
+(u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
 if (verbose >= 4)
 print_log_bp(&zh->zh_log, "\n\tfirst block: ");

View File

@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -53,11 +53,14 @@ typedef struct zfs_node {
 } zfs_node_t;
 typedef struct callback_data {
 uu_avl_t *cb_avl;
 int cb_flags;
 zfs_type_t cb_types;
 zfs_sort_column_t *cb_sortcol;
 zprop_list_t **cb_proplist;
+int cb_depth_limit;
+int cb_depth;
+uint8_t cb_props_table[ZFS_NUM_PROPS];
 } callback_data_t;
 uu_avl_pool_t *avl_pool;
@@ -98,10 +101,17 @@ zfs_callback(zfs_handle_t *zhp, void *data)
 uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
 if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
 &idx) == NULL) {
-if (cb->cb_proplist &&
-zfs_expand_proplist(zhp, cb->cb_proplist) != 0) {
-free(node);
-return (-1);
+if (cb->cb_proplist) {
+if ((*cb->cb_proplist) &&
+!(*cb->cb_proplist)->pl_all)
+zfs_prune_proplist(zhp,
+cb->cb_props_table);
+if (zfs_expand_proplist(zhp, cb->cb_proplist)
+!= 0) {
+free(node);
+return (-1);
+}
 }
 uu_avl_insert(cb->cb_avl, node, idx);
 dontclose = 1;
@@ -113,11 +123,15 @@ zfs_callback(zfs_handle_t *zhp, void *data)
 /*
  * Recurse if necessary.
  */
-if (cb->cb_flags & ZFS_ITER_RECURSE) {
+if (cb->cb_flags & ZFS_ITER_RECURSE &&
+((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+cb->cb_depth < cb->cb_depth_limit)) {
+cb->cb_depth++;
 if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
 (void) zfs_iter_filesystems(zhp, zfs_callback, data);
 if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps)
 (void) zfs_iter_snapshots(zhp, zfs_callback, data);
+cb->cb_depth--;
 }
 if (!dontclose)
@@ -325,10 +339,10 @@ zfs_sort(const void *larg, const void *rarg, void *data)
 int
 zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
-zfs_sort_column_t *sortcol, zprop_list_t **proplist,
+zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit,
 zfs_iter_f callback, void *data)
 {
-callback_data_t cb;
+callback_data_t cb = {0};
 int ret = 0;
 zfs_node_t *node;
 uu_avl_walk_t *walk;
@@ -346,6 +360,45 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
 cb.cb_flags = flags;
 cb.cb_proplist = proplist;
 cb.cb_types = types;
+cb.cb_depth_limit = limit;
+/*
+ * If cb_proplist is provided then in the zfs_handles created we
+ * retain only those properties listed in cb_proplist and sortcol.
+ * The rest are pruned. So, the caller should make sure that no other
+ * properties other than those listed in cb_proplist/sortcol are
+ * accessed.
+ *
+ * If cb_proplist is NULL then we retain all the properties. We
+ * always retain the zoned property, which some other properties
+ * need (userquota & friends), and the createtxg property, which
+ * we need to sort snapshots.
+ */
+if (cb.cb_proplist && *cb.cb_proplist) {
+zprop_list_t *p = *cb.cb_proplist;
+while (p) {
+if (p->pl_prop >= ZFS_PROP_TYPE &&
+p->pl_prop < ZFS_NUM_PROPS) {
+cb.cb_props_table[p->pl_prop] = B_TRUE;
+}
+p = p->pl_next;
+}
+while (sortcol) {
+if (sortcol->sc_prop >= ZFS_PROP_TYPE &&
+sortcol->sc_prop < ZFS_NUM_PROPS) {
+cb.cb_props_table[sortcol->sc_prop] = B_TRUE;
+}
+sortcol = sortcol->sc_next;
+}
+cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE;
+cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE;
+} else {
+(void) memset(cb.cb_props_table, B_TRUE,
+sizeof (cb.cb_props_table));
+}
 if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
 (void) fprintf(stderr,
 gettext("internal error: out of memory\n"));

View File

@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -41,9 +41,10 @@ typedef struct zfs_sort_column {
 #define ZFS_ITER_RECURSE (1 << 0)
 #define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
 #define ZFS_ITER_PROP_LISTSNAPS (1 << 2)
+#define ZFS_ITER_DEPTH_LIMIT (1 << 3)
 int zfs_for_each(int, char **, int options, zfs_type_t,
-zfs_sort_column_t *, zprop_list_t **, zfs_iter_f, void *);
+zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
 int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
 void zfs_free_sort_columns(zfs_sort_column_t *);

View File

@@ -39,12 +39,14 @@
 #include <unistd.h>
 #include <fcntl.h>
 #include <zone.h>
+#include <grp.h>
+#include <pwd.h>
 #include <sys/mkdev.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
-#include <sys/avl.h>
+#include <sys/fs/zfs.h>
 #include <libzfs.h>
 #include <libuutil.h>
@@ -56,6 +58,7 @@ libzfs_handle_t *g_zfs;
 static FILE *mnttab_file;
 static char history_str[HIS_MAX_RECORD_LEN];
+const char *pypath = "/usr/lib/zfs/pyzfs.py";
 static int zfs_do_clone(int argc, char **argv);
 static int zfs_do_create(int argc, char **argv);
@@ -75,8 +78,8 @@ static int zfs_do_unshare(int argc, char **argv);
 static int zfs_do_send(int argc, char **argv);
 static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
-static int zfs_do_allow(int argc, char **argv);
-static int zfs_do_unallow(int argc, char **argv);
+static int zfs_do_userspace(int argc, char **argv);
+static int zfs_do_python(int argc, char **argv);
 /*
  * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
@@ -116,7 +119,9 @@ typedef enum {
 HELP_UNMOUNT,
 HELP_UNSHARE,
 HELP_ALLOW,
-HELP_UNALLOW
+HELP_UNALLOW,
+HELP_USERSPACE,
+HELP_GROUPSPACE
 } zfs_help_t;
 typedef struct zfs_command {
@@ -150,6 +155,8 @@ static zfs_command_t command_table[] = {
 { "get", zfs_do_get, HELP_GET },
 { "inherit", zfs_do_inherit, HELP_INHERIT },
 { "upgrade", zfs_do_upgrade, HELP_UPGRADE },
+{ "userspace", zfs_do_userspace, HELP_USERSPACE },
+{ "groupspace", zfs_do_userspace, HELP_GROUPSPACE },
 { NULL },
 { "mount", zfs_do_mount, HELP_MOUNT },
 { "unmount", zfs_do_unmount, HELP_UNMOUNT },
@@ -159,9 +166,9 @@ static zfs_command_t command_table[] = {
 { "send", zfs_do_send, HELP_SEND },
 { "receive", zfs_do_receive, HELP_RECEIVE },
 { NULL },
-{ "allow", zfs_do_allow, HELP_ALLOW },
+{ "allow", zfs_do_python, HELP_ALLOW },
 { NULL },
-{ "unallow", zfs_do_unallow, HELP_UNALLOW },
+{ "unallow", zfs_do_python, HELP_UNALLOW },
 };
 #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
@@ -184,8 +191,8 @@ get_usage(zfs_help_t idx)
 return (gettext("\tdestroy [-rRf] "
 "<filesystem|volume|snapshot>\n"));
 case HELP_GET:
-return (gettext("\tget [-rHp] [-o field[,...]] "
-"[-s source[,...]]\n"
+return (gettext("\tget [-rHp] [-d max] "
+"[-o field[,...]] [-s source[,...]]\n"
 "\t <\"all\" | property[,...]> "
 "[filesystem|volume|snapshot] ...\n"));
 case HELP_INHERIT:
@@ -195,8 +202,8 @@ get_usage(zfs_help_t idx)
 return (gettext("\tupgrade [-v]\n"
 "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
 case HELP_LIST:
-return (gettext("\tlist [-rH] [-o property[,...]] "
-"[-t type[,...]] [-s property] ...\n"
+return (gettext("\tlist [-rH][-d max] "
+"[-o property[,...]] [-t type[,...]] [-s property] ...\n"
 "\t [-S property] ... "
 "[filesystem|volume|snapshot] ...\n"));
 case HELP_MOUNT:
@@ -232,7 +239,8 @@ get_usage(zfs_help_t idx)
 return (gettext("\tunshare [-f] "
 "<-a | filesystem|mountpoint>\n"));
 case HELP_ALLOW:
-return (gettext("\tallow [-ldug] "
+return (gettext("\tallow <filesystem|volume>\n"
+"\tallow [-ldug] "
 "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
 "\t <filesystem|volume>\n"
 "\tallow [-ld] -e <perm|@setname>[,...] "
@@ -250,6 +258,14 @@ get_usage(zfs_help_t idx)
 "<filesystem|volume>\n"
 "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
 "<filesystem|volume>\n"));
+case HELP_USERSPACE:
+return (gettext("\tuserspace [-hniHp] [-o field[,...]] "
+"[-sS field] ... [-t type[,...]]\n"
+"\t <filesystem|snapshot>\n"));
+case HELP_GROUPSPACE:
+return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] "
+"[-sS field] ... [-t type[,...]]\n"
+"\t <filesystem|snapshot>\n"));
 }
 abort();
@@ -311,7 +327,6 @@ usage(boolean_t requested)
 {
 int i;
 boolean_t show_properties = B_FALSE;
-boolean_t show_permissions = B_FALSE;
 FILE *fp = requested ? stdout : stderr;
 if (current_command == NULL) {
@@ -342,13 +357,7 @@ usage(boolean_t requested)
 strcmp(current_command->name, "list") == 0))
 show_properties = B_TRUE;
-if (current_command != NULL &&
-(strcmp(current_command->name, "allow") == 0 ||
-strcmp(current_command->name, "unallow") == 0))
-show_permissions = B_TRUE;
 if (show_properties) {
 (void) fprintf(fp,
 gettext("\nThe following properties are supported:\n"));
@@ -359,16 +368,26 @@ usage(boolean_t requested)
 (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
 ZFS_TYPE_DATASET);
+(void) fprintf(fp, "\t%-15s ", "userused@...");
+(void) fprintf(fp, " NO NO <size>\n");
+(void) fprintf(fp, "\t%-15s ", "groupused@...");
+(void) fprintf(fp, " NO NO <size>\n");
+(void) fprintf(fp, "\t%-15s ", "userquota@...");
+(void) fprintf(fp, "YES NO <size> | none\n");
+(void) fprintf(fp, "\t%-15s ", "groupquota@...");
+(void) fprintf(fp, "YES NO <size> | none\n");
 (void) fprintf(fp, gettext("\nSizes are specified in bytes "
 "with standard units such as K, M, G, etc.\n"));
 (void) fprintf(fp, gettext("\nUser-defined properties can "
 "be specified by using a name containing a colon (:).\n"));
-} else if (show_permissions) {
-(void) fprintf(fp,
-gettext("\nThe following permissions are supported:\n"));
-zfs_deleg_permissions();
+(void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ "
+"properties must be appended with\n"
+"a user or group specifier of one of these forms:\n"
+" POSIX name (eg: \"matt\")\n"
+" POSIX id (eg: \"126829\")\n"
+" SMB name@domain (eg: \"matt@sun\")\n"
+" SMB SID (eg: \"S-1-234-567-89\")\n"));
 } else {
 (void) fprintf(fp,
 gettext("\nFor the property list, run: %s\n"),
@@ -415,6 +434,27 @@ parseprop(nvlist_t *props)
 return (0);
 }
+static int
+parse_depth(char *opt, int *flags)
+{
+char *tmp;
+int depth;
+depth = (int)strtol(opt, &tmp, 0);
+if (*tmp) {
+(void) fprintf(stderr,
+gettext("%s is not an integer\n"), optarg);
+usage(B_FALSE);
+}
+if (depth < 0) {
+(void) fprintf(stderr,
+gettext("Depth can not be negative.\n"));
+usage(B_FALSE);
+}
+*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
+return (depth);
+}
 /*
  * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
  *
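parse_depth() deliberately sets ZFS_ITER_RECURSE alongside ZFS_ITER_DEPTH_LIMIT, so -d by itself implies bounded recursion. Illustrative invocations (dataset name assumed):

    zfs list -d 1 tank        # tank plus its direct children only
    zfs get -d 2 used tank    # descend at most two levels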
@@ -1063,6 +1103,17 @@ get_callback(zfs_handle_t *zhp, void *data)
 zprop_print_one_property(zfs_get_name(zhp), cbp,
 zfs_prop_to_name(pl->pl_prop),
 buf, sourcetype, source);
+} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+sourcetype = ZPROP_SRC_LOCAL;
+if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+buf, sizeof (buf), cbp->cb_literal) != 0) {
+sourcetype = ZPROP_SRC_NONE;
+(void) strlcpy(buf, "-", sizeof (buf));
+}
+zprop_print_one_property(zfs_get_name(zhp), cbp,
+pl->pl_user_prop, buf, sourcetype, source);
 } else {
 if (nvlist_lookup_nvlist(userprop,
 pl->pl_user_prop, &propval) != 0) {
@@ -1102,6 +1153,7 @@ zfs_do_get(int argc, char **argv)
 int i, c, flags = 0;
 char *value, *fields;
 int ret;
+int limit = 0;
 zprop_list_t fake_name = { 0 };
 /*
@@ -1115,11 +1167,14 @@ zfs_do_get(int argc, char **argv)
 cb.cb_type = ZFS_TYPE_DATASET;
 /* check options */
-while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) {
+while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) {
 switch (c) {
 case 'p':
 cb.cb_literal = B_TRUE;
 break;
+case 'd':
+limit = parse_depth(optarg, &flags);
+break;
 case 'r':
 flags |= ZFS_ITER_RECURSE;
 break;
@@ -1250,7 +1305,7 @@ zfs_do_get(int argc, char **argv)
 /* run for each object */
 ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL,
-&cb.cb_proplist, get_callback, &cb);
+&cb.cb_proplist, limit, get_callback, &cb);
 if (cb.cb_proplist == &fake_name)
 zprop_free_list(fake_name.pl_next);
@@ -1363,10 +1418,10 @@ zfs_do_inherit(int argc, char **argv)
 if (flags & ZFS_ITER_RECURSE) {
 ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
-NULL, NULL, inherit_recurse_cb, propname);
+NULL, NULL, 0, inherit_recurse_cb, propname);
 } else {
 ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
-NULL, NULL, inherit_cb, propname);
+NULL, NULL, 0, inherit_cb, propname);
 }
 return (ret);
@@ -1435,21 +1490,30 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data)
 {
 upgrade_cbdata_t *cb = data;
 int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+int i;
+static struct { int zplver; int spaver; } table[] = {
+{ZPL_VERSION_FUID, SPA_VERSION_FUID},
+{ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+{0, 0}
+};
-if (cb->cb_version >= ZPL_VERSION_FUID) {
-int spa_version;
-if (zfs_spa_version(zhp, &spa_version) < 0)
-return (-1);
-if (spa_version < SPA_VERSION_FUID) {
-/* can't upgrade */
-(void) printf(gettext("%s: can not be upgraded; "
-"the pool version needs to first be upgraded\nto "
-"version %d\n\n"),
-zfs_get_name(zhp), SPA_VERSION_FUID);
-cb->cb_numfailed++;
-return (0);
-}
-}
+for (i = 0; table[i].zplver; i++) {
+if (cb->cb_version >= table[i].zplver) {
+int spa_version;
+if (zfs_spa_version(zhp, &spa_version) < 0)
+return (-1);
+if (spa_version < table[i].spaver) {
+/* can't upgrade */
+(void) printf(gettext("%s: can not be "
+"upgraded; the pool version needs to first "
+"be upgraded\nto version %d\n\n"),
+zfs_get_name(zhp), table[i].spaver);
+cb->cb_numfailed++;
+return (0);
+}
+}
+}
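The rewrite turns the hard-coded FUID check into a table walk, so each future ZPL version that depends on a pool feature costs one more row. A generic sketch of the lookup pattern, with illustrative names and version numbers rather than the actual zfs constants:

    /* Each row ties a filesystem version to the pool version it needs. */
    struct verdep { int fsver; int poolver; };

    static const struct verdep deps[] = {
        { 3, 9 },     /* e.g. a FUID-style feature */
        { 4, 15 },    /* e.g. user/group space accounting */
        { 0, 0 }      /* terminator */
    };

    /* Minimum pool version required to reach a target fs version. */
    static int
    min_pool_version(int target_fsver)
    {
        int i, need = 0;

        for (i = 0; deps[i].fsver; i++) {
            if (target_fsver >= deps[i].fsver && deps[i].poolver > need)
                need = deps[i].poolver;
        }
        return (need);
    }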
@@ -1550,6 +1614,8 @@ zfs_do_upgrade(int argc, char **argv)
 (void) printf(gettext(" 2 Enhanced directory entries\n"));
 (void) printf(gettext(" 3 Case insensitive and File system "
 "unique identifer (FUID)\n"));
+(void) printf(gettext(" 4 userquota, groupquota "
+"properties\n"));
 (void) printf(gettext("\nFor more information on a particular "
 "version, including supported releases, see:\n\n"));
 (void) printf("http://www.opensolaris.org/os/community/zfs/"
@@ -1561,7 +1627,7 @@ zfs_do_upgrade(int argc, char **argv)
 if (cb.cb_version == 0)
 cb.cb_version = ZPL_VERSION;
 ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
-NULL, NULL, upgrade_set_callback, &cb);
+NULL, NULL, 0, upgrade_set_callback, &cb);
 (void) printf(gettext("%llu filesystems upgraded\n"),
 cb.cb_numupgraded);
 if (cb.cb_numsamegraded) {
@@ -1579,14 +1645,14 @@ zfs_do_upgrade(int argc, char **argv)
 flags |= ZFS_ITER_RECURSE;
 ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
-NULL, NULL, upgrade_list_callback, &cb);
+NULL, NULL, 0, upgrade_list_callback, &cb);
 found = cb.cb_foundone;
 cb.cb_foundone = B_FALSE;
 cb.cb_newer = B_TRUE;
 ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
-NULL, NULL, upgrade_list_callback, &cb);
+NULL, NULL, 0, upgrade_list_callback, &cb);
 if (!cb.cb_foundone && !found) {
 (void) printf(gettext("All filesystems are "
 }
@@ -1598,11 +1664,90 @@
 /*
- * list [-rH] [-o property[,property]...] [-t type[,type]...]
+ * zfs userspace
+ */
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+zfs_userquota_prop_t *typep = arg;
+zfs_userquota_prop_t p = *typep;
+char *name = NULL;
+char *ug, *propname;
+char namebuf[32];
+char sizebuf[32];
+if (domain == NULL || domain[0] == '\0') {
+if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) {
+struct group *g = getgrgid(rid);
+if (g)
+name = g->gr_name;
+} else {
+struct passwd *p = getpwuid(rid);
+if (p)
+name = p->pw_name;
+}
+}
+if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA)
+ug = "group";
+else
+ug = "user";
+if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED)
+propname = "used";
+else
+propname = "quota";
+if (name == NULL) {
+(void) snprintf(namebuf, sizeof (namebuf),
+"%llu", (longlong_t)rid);
+name = namebuf;
+}
+zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+(void) printf("%s %s %s%c%s %s\n", propname, ug, domain,
+domain[0] ? '-' : ' ', name, sizebuf);
+return (0);
+}
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+zfs_handle_t *zhp;
+zfs_userquota_prop_t p;
+int error;
+/*
+ * Try the python version. If the execv fails, we'll continue
+ * and do a simplistic implementation.
+ */
+(void) execv(pypath, argv-1);
+(void) printf("internal error: %s not found\n"
+"falling back on built-in implementation, "
+"some features will not work\n", pypath);
+if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL)
+return (1);
+(void) printf("PROP TYPE NAME VALUE\n");
+for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+error = zfs_userspace(zhp, p, userspace_cb, &p);
+if (error)
+break;
+}
+return (error);
+}
+/*
+ * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...]
  * [-s property [-s property]...] [-S property [-S property]...]
  * <dataset> ...
  *
  * -r Recurse over all children
+ * -d Limit recursion by depth.
  * -H Scripted mode; elide headers and separate columns by tabs
  * -o Control which fields to display.
  * -t Control which object types to display.
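When the pyzfs script is unavailable, the C fallback prints one flat line per (property, identity) pair through userspace_cb() rather than the aligned table the Python implementation renders; the output looks roughly like this (names and sizes illustrative):

    PROP TYPE NAME VALUE
    used user  matt 2.15M
    quota user  matt 100M
    used group  staff 4.50M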
@@ -1685,7 +1830,6 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
 first = B_FALSE;
 }
-right_justify = B_FALSE;
 if (pl->pl_prop != ZPROP_INVAL) {
 if (zfs_prop_get(zhp, pl->pl_prop, property,
 sizeof (property), NULL, NULL, 0, B_FALSE) != 0)
@@ -1694,6 +1838,13 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
 propstr = property;
 right_justify = zfs_prop_align_right(pl->pl_prop);
+} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+property, sizeof (property), B_FALSE) != 0)
+propstr = "-";
+else
+propstr = property;
+right_justify = B_TRUE;
 } else {
 if (nvlist_lookup_nvlist(userprops,
 pl->pl_user_prop, &propval) != 0)
@@ -1701,6 +1852,7 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
 else
 verify(nvlist_lookup_string(propval,
 ZPROP_VALUE, &propstr) == 0);
+right_justify = B_FALSE;
 }
 width = pl->pl_width;
@@ -1752,16 +1904,20 @@ zfs_do_list(int argc, char **argv)
 char *fields = NULL;
 list_cbdata_t cb = { 0 };
 char *value;
+int limit = 0;
 int ret;
 zfs_sort_column_t *sortcol = NULL;
 int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
 /* check options */
-while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) {
+while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) {
 switch (c) {
 case 'o':
 fields = optarg;
 break;
+case 'd':
+limit = parse_depth(optarg, &flags);
+break;
 case 'r':
 flags |= ZFS_ITER_RECURSE;
 break;
@@ -1852,7 +2008,7 @@ zfs_do_list(int argc, char **argv)
 cb.cb_first = B_TRUE;
 ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
-list_callback, &cb);
+limit, list_callback, &cb);
 zprop_free_list(cb.cb_proplist);
 zfs_free_sort_columns(sortcol);
@@ -2235,7 +2391,7 @@ zfs_do_set(int argc, char **argv)
 }
 ret = zfs_for_each(argc - 2, argv + 2, NULL,
-ZFS_TYPE_DATASET, NULL, NULL, set_callback, &cb);
+ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb);
 return (ret);
 }
@@ -2495,390 +2651,6 @@ zfs_do_receive(int argc, char **argv)
 return (err != 0);
 }
-typedef struct allow_cb {
-int a_permcnt;
-size_t a_treeoffset;
-} allow_cb_t;
-static void
-zfs_print_perms(avl_tree_t *tree)
-{
-zfs_perm_node_t *permnode;
-permnode = avl_first(tree);
-while (permnode != NULL) {
-(void) printf("%s", permnode->z_pname);
-permnode = AVL_NEXT(tree, permnode);
-if (permnode)
-(void) printf(",");
-else
-(void) printf("\n");
-}
-}
-/*
- * Iterate over user/groups/everyone/... and the call perm_iter
- * function to print actual permission when tree has >0 nodes.
- */
-static void
-zfs_iter_perms(avl_tree_t *tree, const char *banner, allow_cb_t *cb)
-{
-zfs_allow_node_t *item;
-avl_tree_t *ptree;
-item = avl_first(tree);
-while (item) {
-ptree = (void *)((char *)item + cb->a_treeoffset);
-if (avl_numnodes(ptree)) {
-if (cb->a_permcnt++ == 0)
-(void) printf("%s\n", banner);
-(void) printf("\t%s", item->z_key);
-/*
- * Avoid an extra space being printed
- * for "everyone" which is keyed with a null
- * string
- */
-if (item->z_key[0] != '\0')
-(void) printf(" ");
-zfs_print_perms(ptree);
-}
-item = AVL_NEXT(tree, item);
-}
-}
-#define LINES "-------------------------------------------------------------\n"
-static int
-zfs_print_allows(char *ds)
-{
-zfs_allow_t *curperms, *perms;
-zfs_handle_t *zhp;
-allow_cb_t allowcb = { 0 };
-char banner[MAXPATHLEN];
-if (ds[0] == '-')
-usage(B_FALSE);
-if (strrchr(ds, '@')) {
-(void) fprintf(stderr, gettext("Snapshots don't have 'allow'"
-" permissions\n"));
-return (1);
-}
-if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL)
-return (1);
-if (zfs_perm_get(zhp, &perms)) {
-(void) fprintf(stderr,
-gettext("Failed to retrieve 'allows' on %s\n"), ds);
-zfs_close(zhp);
-return (1);
-}
-zfs_close(zhp);
-if (perms != NULL)
-(void) printf("%s", LINES);
-for (curperms = perms; curperms; curperms = curperms->z_next) {
-(void) snprintf(banner, sizeof (banner),
-gettext("Permission sets on (%s)"), curperms->z_setpoint);
-allowcb.a_treeoffset =
-offsetof(zfs_allow_node_t, z_localdescend);
-allowcb.a_permcnt = 0;
-zfs_iter_perms(&curperms->z_sets, banner, &allowcb);
-(void) snprintf(banner, sizeof (banner),
-gettext("Create time permissions on (%s)"),
-curperms->z_setpoint);
-allowcb.a_treeoffset =
-offsetof(zfs_allow_node_t, z_localdescend);
-allowcb.a_permcnt = 0;
-zfs_iter_perms(&curperms->z_crperms, banner, &allowcb);
-(void) snprintf(banner, sizeof (banner),
-gettext("Local permissions on (%s)"), curperms->z_setpoint);
-allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local);
-allowcb.a_permcnt = 0;
-zfs_iter_perms(&curperms->z_user, banner, &allowcb);
-zfs_iter_perms(&curperms->z_group, banner, &allowcb);
-zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
-(void) snprintf(banner, sizeof (banner),
-gettext("Descendent permissions on (%s)"),
-curperms->z_setpoint);
-allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend);
-allowcb.a_permcnt = 0;
-zfs_iter_perms(&curperms->z_user, banner, &allowcb);
-zfs_iter_perms(&curperms->z_group, banner, &allowcb);
-zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
-(void) snprintf(banner, sizeof (banner),
-gettext("Local+Descendent permissions on (%s)"),
-curperms->z_setpoint);
-allowcb.a_treeoffset =
-offsetof(zfs_allow_node_t, z_localdescend);
-allowcb.a_permcnt = 0;
-zfs_iter_perms(&curperms->z_user, banner, &allowcb);
-zfs_iter_perms(&curperms->z_group, banner, &allowcb);
-zfs_iter_perms(&curperms->z_everyone, banner, &allowcb);
-(void) printf("%s", LINES);
-}
-zfs_free_allows(perms);
-return (0);
-}
-#define ALLOWOPTIONS "ldcsu:g:e"
-#define UNALLOWOPTIONS "ldcsu:g:er"
-/*
- * Validate options, and build necessary datastructure to display/remove/add
- * permissions.
- * Returns 0 - If permissions should be added/removed
- * Returns 1 - If permissions should be displayed.
- * Returns -1 - on failure
- */
-int
-parse_allow_args(int *argc, char **argv[], boolean_t unallow,
-char **ds, int *recurse, nvlist_t **zperms)
-{
-int c;
-char *options = unallow ? UNALLOWOPTIONS : ALLOWOPTIONS;
-zfs_deleg_inherit_t deleg_type = ZFS_DELEG_NONE;
-zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
-char *who = NULL;
-char *perms = NULL;
-zfs_handle_t *zhp;
-while ((c = getopt(*argc, *argv, options)) != -1) {
-switch (c) {
-case 'l':
-if (who_type == ZFS_DELEG_CREATE ||
-who_type == ZFS_DELEG_NAMED_SET)
-usage(B_FALSE);
-deleg_type |= ZFS_DELEG_PERM_LOCAL;
-break;
-case 'd':
-if (who_type == ZFS_DELEG_CREATE ||
-who_type == ZFS_DELEG_NAMED_SET)
-usage(B_FALSE);
-deleg_type |= ZFS_DELEG_PERM_DESCENDENT;
-break;
-case 'r':
-*recurse = B_TRUE;
-break;
-case 'c':
-if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-usage(B_FALSE);
-if (deleg_type)
-usage(B_FALSE);
-who_type = ZFS_DELEG_CREATE;
-break;
-case 's':
-if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-usage(B_FALSE);
-if (deleg_type)
-usage(B_FALSE);
-who_type = ZFS_DELEG_NAMED_SET;
-break;
-case 'u':
-if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-usage(B_FALSE);
-who_type = ZFS_DELEG_USER;
-who = optarg;
-break;
-case 'g':
-if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-usage(B_FALSE);
-who_type = ZFS_DELEG_GROUP;
-who = optarg;
-break;
-case 'e':
-if (who_type != ZFS_DELEG_WHO_UNKNOWN)
-usage(B_FALSE);
-who_type = ZFS_DELEG_EVERYONE;
-break;
-default:
-usage(B_FALSE);
-break;
-}
-}
-if (deleg_type == 0)
-deleg_type = ZFS_DELEG_PERM_LOCALDESCENDENT;
-*argc -= optind;
-*argv += optind;
-if (unallow == B_FALSE && *argc == 1) {
-/*
- * Only print permissions if no options were processed
- */
-if (optind == 1)
-return (1);
-else
-usage(B_FALSE);
-}
-/*
- * initialize variables for zfs_build_perms based on number
- * of arguments.
- * 3 arguments ==> zfs [un]allow joe perm,perm,perm <dataset> or
- * zfs [un]allow -s @set1 perm,perm <dataset>
- * 2 arguments ==> zfs [un]allow -c perm,perm <dataset> or
- * zfs [un]allow -u|-g <name> perm <dataset> or
- * zfs [un]allow -e perm,perm <dataset>
- * zfs unallow joe <dataset>
- * zfs unallow -s @set1 <dataset>
- * 1 argument ==> zfs [un]allow -e <dataset> or
- * zfs [un]allow -c <dataset>
- */
-switch (*argc) {
-case 3:
-perms = (*argv)[1];
-who = (*argv)[0];
-*ds = (*argv)[2];
-/*
- * advance argc/argv for do_allow cases.
- * for do_allow case make sure who have a know who type
- * and its not a permission set.
- */
-if (unallow == B_TRUE) {
-*argc -= 2;
-*argv += 2;
-} else if (who_type != ZFS_DELEG_WHO_UNKNOWN &&
-who_type != ZFS_DELEG_NAMED_SET)
-usage(B_FALSE);
-break;
-case 2:
-if (unallow == B_TRUE && (who_type == ZFS_DELEG_EVERYONE ||
-who_type == ZFS_DELEG_CREATE || who != NULL)) {
-perms = (*argv)[0];
-*ds = (*argv)[1];
-} else {
-if (unallow == B_FALSE &&
-(who_type == ZFS_DELEG_WHO_UNKNOWN ||
-who_type == ZFS_DELEG_NAMED_SET))
-usage(B_FALSE);
-else if (who_type == ZFS_DELEG_WHO_UNKNOWN ||
-who_type == ZFS_DELEG_NAMED_SET)
-who = (*argv)[0];
-else if (who_type != ZFS_DELEG_NAMED_SET)
-perms = (*argv)[0];
-*ds = (*argv)[1];
-}
-if (unallow == B_TRUE) {
-(*argc)--;
-(*argv)++;
-}
-break;
-case 1:
-if (unallow == B_FALSE)
-usage(B_FALSE);
-if (who == NULL && who_type != ZFS_DELEG_CREATE &&
-who_type != ZFS_DELEG_EVERYONE)
-usage(B_FALSE);
-*ds = (*argv)[0];
-break;
-default:
-usage(B_FALSE);
-}
-if (strrchr(*ds, '@')) {
-(void) fprintf(stderr,
-gettext("Can't set or remove 'allow' permissions "
-"on snapshots.\n"));
-return (-1);
-}
-if ((zhp = zfs_open(g_zfs, *ds, ZFS_TYPE_DATASET)) == NULL)
-return (-1);
-if ((zfs_build_perms(zhp, who, perms,
-who_type, deleg_type, zperms)) != 0) {
-zfs_close(zhp);
-return (-1);
-}
-zfs_close(zhp);
-return (0);
-}
-static int
-zfs_do_allow(int argc, char **argv)
-{
-char *ds;
-nvlist_t *zperms = NULL;
-zfs_handle_t *zhp;
-int unused;
-int ret;
-if ((ret = parse_allow_args(&argc, &argv, B_FALSE, &ds,
-&unused, &zperms)) == -1)
-return (1);
-if (ret == 1)
-return (zfs_print_allows(argv[0]));
-if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL)
-return (1);
-if (zfs_perm_set(zhp, zperms)) {
-zfs_close(zhp);
-nvlist_free(zperms);
-return (1);
-}
-nvlist_free(zperms);
-zfs_close(zhp);
-return (0);
-}
-static int
-unallow_callback(zfs_handle_t *zhp, void *data)
-{
-nvlist_t *nvp = (nvlist_t *)data;
-int error;
-error = zfs_perm_remove(zhp, nvp);
-if (error) {
-(void) fprintf(stderr, gettext("Failed to remove permissions "
-"on %s\n"), zfs_get_name(zhp));
-}
-return (error);
-}
-static int
-zfs_do_unallow(int argc, char **argv)
-{
-int recurse = B_FALSE;
-char *ds;
-int error;
-nvlist_t *zperms = NULL;
-int flags = 0;
-if (parse_allow_args(&argc, &argv, B_TRUE,
-&ds, &recurse, &zperms) == -1)
-return (1);
-if (recurse)
-flags |= ZFS_ITER_RECURSE;
-error = zfs_for_each(argc, argv, flags,
-ZFS_TYPE_FILESYSTEM|ZFS_TYPE_VOLUME, NULL,
-NULL, unallow_callback, (void *)zperms);
-if (zperms)
-nvlist_free(zperms);
-return (error);
-}
 typedef struct get_all_cbdata {
 zfs_handle_t **cb_handles;
 size_t cb_alloc;
@@ -3944,6 +3716,15 @@ zfs_do_unshare(int argc, char **argv)
 return (unshare_unmount(OP_SHARE, argc, argv));
 }
+/* ARGSUSED */
+static int
+zfs_do_python(int argc, char **argv)
+{
+(void) execv(pypath, argv-1);
+(void) printf("internal error: %s not found\n", pypath);
+return (-1);
+}
 /*
  * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is
  * 'legacy'. Otherwise, complain that use should be using 'zfs mount'.
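The argv-1 passed to execv() here (and in zfs_do_userspace() above) is deliberate: main() dispatches each handler with argv + 1, so backing up one element recovers the original argument vector, and pyzfs.py receives the complete command line with the subcommand name in its argv[1]. If /usr/lib/zfs/pyzfs.py cannot be executed, execv() simply returns and the fallback path runs.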
@@ -4197,6 +3978,7 @@ main(int argc, char **argv)
 /*
  * Run the appropriate command.
  */
+libzfs_mnttab_cache(g_zfs, B_TRUE);
 if (find_command_idx(cmdname, &i) == 0) {
 current_command = &command_table[i];
 ret = command_table[i].func(argc - 1, argv + 1);
@@ -4209,6 +3991,7 @@ main(int argc, char **argv)
 "command '%s'\n"), cmdname);
 usage(B_FALSE);
 }
+libzfs_mnttab_cache(g_zfs, B_FALSE);
 }
 (void) fclose(mnttab_file);

View File

@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
-#pragma ident "%Z%%M% %I% %E% SMI"
 /*
  * ZFS Fault Injector
  *
@@ -224,7 +222,7 @@ usage(void)
 "\t\tClear the particular record (if given a numeric ID), or\n"
 "\t\tall records if 'all' is specificed.\n"
 "\n"
-"\tzinject -d device [-e errno] [-L <nvlist|uber>] pool\n"
+"\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
 "\t\tInject a fault into a particular device or the device's\n"
 "\t\tlabel. Label injection can either be 'nvlist' or 'uber'.\n"
 "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
@@ -516,7 +514,7 @@ main(int argc, char **argv)
 return (0);
 }
-while ((c = getopt(argc, argv, ":ab:d:f:qhc:t:l:mr:e:uL:")) != -1) {
+while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
 switch (c) {
 case 'a':
 flags |= ZINJECT_FLUSH_ARC;
@@ -553,6 +551,9 @@ main(int argc, char **argv)
 return (1);
 }
 break;
+case 'F':
+record.zi_failfast = B_TRUE;
+break;
 case 'h':
 usage();
 return (0);

View File

@@ -376,12 +376,11 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props,
 }
 normnm = zpool_prop_to_name(prop);
 } else {
-if ((fprop = zfs_name_to_prop(propname)) == ZPROP_INVAL) {
-(void) fprintf(stderr, gettext("property '%s' is "
-"not a valid file system property\n"), propname);
-return (2);
+if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
+normnm = zfs_prop_to_name(fprop);
+} else {
+normnm = propname;
 }
-normnm = zfs_prop_to_name(fprop);
 }
 if (nvlist_lookup_string(proplist, normnm, &strval) == 0 &&
@@ -979,14 +978,189 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
 return (max);
 }
+typedef struct spare_cbdata {
+uint64_t cb_guid;
+zpool_handle_t *cb_zhp;
+} spare_cbdata_t;
+static boolean_t
+find_vdev(nvlist_t *nv, uint64_t search)
+{
+uint64_t guid;
+nvlist_t **child;
+uint_t c, children;
+if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+search == guid)
+return (B_TRUE);
+if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+&child, &children) == 0) {
+for (c = 0; c < children; c++)
+if (find_vdev(child[c], search))
+return (B_TRUE);
+}
+return (B_FALSE);
+}
+static int
+find_spare(zpool_handle_t *zhp, void *data)
+{
+spare_cbdata_t *cbp = data;
+nvlist_t *config, *nvroot;
+config = zpool_get_config(zhp, NULL);
+verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+&nvroot) == 0);
+if (find_vdev(nvroot, cbp->cb_guid)) {
+cbp->cb_zhp = zhp;
+return (1);
+}
+zpool_close(zhp);
+return (0);
+}
+/*
+ * Print out configuration state as requested by status_callback.
+ */
+void
+print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
+int namewidth, int depth, boolean_t isspare)
+{
+nvlist_t **child;
+uint_t c, children;
+vdev_stat_t *vs;
+char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+char *vname;
+uint64_t notpresent;
+spare_cbdata_t cb;
+char *state;
+verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+(uint64_t **)&vs, &c) == 0);
+if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+&child, &children) != 0)
+children = 0;
+state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+if (isspare) {
+/*
+ * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
+ * online drives.
+ */
+if (vs->vs_aux == VDEV_AUX_SPARED)
+state = "INUSE";
+else if (vs->vs_state == VDEV_STATE_HEALTHY)
+state = "AVAIL";
+}
+(void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth,
+name, state);
+if (!isspare) {
+zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
+(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
+}
+if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+&notpresent) == 0) {
+char *path;
+verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+(void) printf(" was %s", path);
+} else if (vs->vs_aux != 0) {
+(void) printf(" ");
+switch (vs->vs_aux) {
+case VDEV_AUX_OPEN_FAILED:
+(void) printf(gettext("cannot open"));
+break;
+case VDEV_AUX_BAD_GUID_SUM:
+(void) printf(gettext("missing device"));
+break;
+case VDEV_AUX_NO_REPLICAS:
+(void) printf(gettext("insufficient replicas"));
+break;
+case VDEV_AUX_VERSION_NEWER:
+(void) printf(gettext("newer version"));
+break;
+case VDEV_AUX_SPARED:
+verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+&cb.cb_guid) == 0);
+if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
+if (strcmp(zpool_get_name(cb.cb_zhp),
+zpool_get_name(zhp)) == 0)
+(void) printf(gettext("currently in "
+"use"));
+else
+(void) printf(gettext("in use by "
+"pool '%s'"),
+zpool_get_name(cb.cb_zhp));
+zpool_close(cb.cb_zhp);
+} else {
+(void) printf(gettext("currently in use"));
+}
+break;
+case VDEV_AUX_ERR_EXCEEDED:
+(void) printf(gettext("too many errors"));
+break;
+case VDEV_AUX_IO_FAILURE:
+(void) printf(gettext("experienced I/O failures"));
+break;
+case VDEV_AUX_BAD_LOG:
+(void) printf(gettext("bad intent log"));
+break;
+default:
+(void) printf(gettext("corrupted data"));
+break;
+}
+} else if (vs->vs_scrub_repaired != 0 && children == 0) {
+/*
+ * Report bytes resilvered/repaired on leaf devices.
+ */
+zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
+(void) printf(gettext(" %s %s"), repaired,
+(vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
+"resilvered" : "repaired");
+}
+(void) printf("\n");
+for (c = 0; c < children; c++) {
+uint64_t is_log = B_FALSE;
+/* Don't print logs here */
+(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+&is_log);
+if (is_log)
+continue;
+vname = zpool_vdev_name(g_zfs, zhp, child[c]);
+print_status_config(zhp, vname, child[c],
+namewidth, depth + 2, isspare);
+free(vname);
+}
+}
 /*
  * Print the configuration of an exported pool. Iterate over all vdevs in the
  * pool, printing out the name and status for each one.
  */
 void
-print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth,
-boolean_t print_logs)
+print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
 {
 nvlist_t **child;
 uint_t c, children;
@@ -1043,12 +1217,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth)
 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
 &is_log);
-if ((is_log && !print_logs) || (!is_log && print_logs))
+if (is_log)
 continue;
 vname = zpool_vdev_name(g_zfs, NULL, child[c]);
-print_import_config(vname, child[c],
-namewidth, depth + 2, B_FALSE);
+print_import_config(vname, child[c], namewidth, depth + 2);
 free(vname);
 }
@ -1073,6 +1246,43 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth,
} }
} }
/*
* Print log vdevs.
* Logs are recorded as top level vdevs in the main pool child array
* but with "is_log" set to 1. We use either print_status_config() or
* print_import_config() to print the top level logs then any log
* children (eg mirrored slogs) are printed recursively - which
* works because only the top level vdev is marked "is_log"
*/
static void
print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose)
{
uint_t c, children;
nvlist_t **child;
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
&children) != 0)
return;
(void) printf(gettext("\tlogs\n"));
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
char *name;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
if (!is_log)
continue;
name = zpool_vdev_name(g_zfs, zhp, child[c]);
if (verbose)
print_status_config(zhp, name, child[c], namewidth,
2, B_FALSE);
else
print_import_config(name, child[c], namewidth, 2);
free(name);
}
}
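An aside on the helper used just below: num_logs() is referenced by the callers but not shown in this diff. A minimal sketch of how such a counter can be written against the libnvpair API (illustrative only, not the code from this change; the ZPOOL_CONFIG_* keys come from <sys/fs/zfs.h>):

#include <libnvpair.h>
#include <sys/fs/zfs.h>

/* Count the top-level children whose "is_log" flag is set. */
static uint_t
count_log_vdevs(nvlist_t *nvroot)
{
	nvlist_t **child;
	uint_t c, children, nlogs = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		return (0);
	for (c = 0; c < children; c++) {
		uint64_t is_log = B_FALSE;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
		    &is_log);
		if (is_log)
			nlogs++;
	}
	return (nlogs);
}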
/*
 * Display the status for the given pool.
 */
@@ -1241,11 +1451,9 @@ show_import(nvlist_t *config)
	if (namewidth < 10)
		namewidth = 10;
-	print_import_config(name, nvroot, namewidth, 0, B_FALSE);
-	if (num_logs(nvroot) > 0) {
-		(void) printf(gettext("\tlogs\n"));
-		print_import_config(name, nvroot, namewidth, 0, B_TRUE);
-	}
+	print_import_config(name, nvroot, namewidth, 0);
+	if (num_logs(nvroot) > 0)
+		print_logs(NULL, nvroot, namewidth, B_FALSE);
	if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
		(void) printf(gettext("\n\tAdditional devices are known to "
@@ -2427,10 +2635,14 @@ zpool_do_online(int argc, char **argv)
	zpool_handle_t *zhp;
	int ret = 0;
	vdev_state_t newstate;
+	int flags = 0;
	/* check options */
-	while ((c = getopt(argc, argv, "t")) != -1) {
+	while ((c = getopt(argc, argv, "et")) != -1) {
		switch (c) {
+		case 'e':
+			flags |= ZFS_ONLINE_EXPAND;
+			break;
		case 't':
		case '?':
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -2458,7 +2670,7 @@ zpool_do_online(int argc, char **argv)
	return (1);
	for (i = 1; i < argc; i++) {
-		if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) {
+		if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
			if (newstate != VDEV_STATE_HEALTHY) {
				(void) printf(gettext("warning: device '%s' "
				    "onlined, but remains in faulted state\n"),
@@ -2715,181 +2927,6 @@ print_scrub_status(nvlist_t *nvroot)
	    (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
}
typedef struct spare_cbdata {
uint64_t cb_guid;
zpool_handle_t *cb_zhp;
} spare_cbdata_t;
static boolean_t
find_vdev(nvlist_t *nv, uint64_t search)
{
uint64_t guid;
nvlist_t **child;
uint_t c, children;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
search == guid)
return (B_TRUE);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (c = 0; c < children; c++)
if (find_vdev(child[c], search))
return (B_TRUE);
}
return (B_FALSE);
}
static int
find_spare(zpool_handle_t *zhp, void *data)
{
spare_cbdata_t *cbp = data;
nvlist_t *config, *nvroot;
config = zpool_get_config(zhp, NULL);
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
if (find_vdev(nvroot, cbp->cb_guid)) {
cbp->cb_zhp = zhp;
return (1);
}
zpool_close(zhp);
return (0);
}
/*
* Print out configuration state as requested by status_callback.
*/
void
print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
int namewidth, int depth, boolean_t isspare, boolean_t print_logs)
{
nvlist_t **child;
uint_t c, children;
vdev_stat_t *vs;
char rbuf[6], wbuf[6], cbuf[6], repaired[7];
char *vname;
uint64_t notpresent;
spare_cbdata_t cb;
char *state;
verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
(uint64_t **)&vs, &c) == 0);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
children = 0;
state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
if (isspare) {
/*
* For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
* online drives.
*/
if (vs->vs_aux == VDEV_AUX_SPARED)
state = "INUSE";
else if (vs->vs_state == VDEV_STATE_HEALTHY)
state = "AVAIL";
}
(void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth,
name, state);
if (!isspare) {
zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
}
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
&notpresent) == 0) {
char *path;
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
(void) printf(" was %s", path);
} else if (vs->vs_aux != 0) {
(void) printf(" ");
switch (vs->vs_aux) {
case VDEV_AUX_OPEN_FAILED:
(void) printf(gettext("cannot open"));
break;
case VDEV_AUX_BAD_GUID_SUM:
(void) printf(gettext("missing device"));
break;
case VDEV_AUX_NO_REPLICAS:
(void) printf(gettext("insufficient replicas"));
break;
case VDEV_AUX_VERSION_NEWER:
(void) printf(gettext("newer version"));
break;
case VDEV_AUX_SPARED:
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&cb.cb_guid) == 0);
if (zpool_iter(g_zfs, find_spare, &cb) == 1) {
if (strcmp(zpool_get_name(cb.cb_zhp),
zpool_get_name(zhp)) == 0)
(void) printf(gettext("currently in "
"use"));
else
(void) printf(gettext("in use by "
"pool '%s'"),
zpool_get_name(cb.cb_zhp));
zpool_close(cb.cb_zhp);
} else {
(void) printf(gettext("currently in use"));
}
break;
case VDEV_AUX_ERR_EXCEEDED:
(void) printf(gettext("too many errors"));
break;
case VDEV_AUX_IO_FAILURE:
(void) printf(gettext("experienced I/O failures"));
break;
case VDEV_AUX_BAD_LOG:
(void) printf(gettext("bad intent log"));
break;
default:
(void) printf(gettext("corrupted data"));
break;
}
} else if (vs->vs_scrub_repaired != 0 && children == 0) {
/*
* Report bytes resilvered/repaired on leaf devices.
*/
zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
(void) printf(gettext(" %s %s"), repaired,
(vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
"resilvered" : "repaired");
}
(void) printf("\n");
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE;
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
if ((is_log && !print_logs) || (!is_log && print_logs))
continue;
vname = zpool_vdev_name(g_zfs, zhp, child[c]);
print_status_config(zhp, vname, child[c],
namewidth, depth + 2, isspare, B_FALSE);
free(vname);
}
}
static void
print_error_log(zpool_handle_t *zhp)
{
@@ -2940,7 +2977,7 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
	for (i = 0; i < nspares; i++) {
		name = zpool_vdev_name(g_zfs, zhp, spares[i]);
		print_status_config(zhp, name, spares[i],
-		    namewidth, 2, B_TRUE, B_FALSE);
+		    namewidth, 2, B_TRUE);
		free(name);
	}
}
@@ -2960,7 +2997,7 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
	for (i = 0; i < nl2cache; i++) {
		name = zpool_vdev_name(g_zfs, zhp, l2cache[i]);
		print_status_config(zhp, name, l2cache[i],
-		    namewidth, 2, B_FALSE, B_FALSE);
+		    namewidth, 2, B_FALSE);
		free(name);
	}
}
@@ -3190,11 +3227,10 @@ status_callback(zpool_handle_t *zhp, void *data)
	(void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth,
	    "NAME", "STATE", "READ", "WRITE", "CKSUM");
	print_status_config(zhp, zpool_get_name(zhp), nvroot,
-	    namewidth, 0, B_FALSE, B_FALSE);
-	if (num_logs(nvroot) > 0)
-		print_status_config(zhp, "logs", nvroot, namewidth, 0,
-		    B_FALSE, B_TRUE);
+	    namewidth, 0, B_FALSE);
+	if (num_logs(nvroot) > 0)
+		print_logs(zhp, nvroot, namewidth, B_TRUE);
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0)
		print_l2cache(zhp, l2cache, nl2cache, namewidth);
@@ -3418,7 +3454,7 @@ zpool_do_upgrade(int argc, char **argv)
	/* check options */
-	while ((c = getopt(argc, argv, "avV:")) != -1) {
+	while ((c = getopt(argc, argv, ":avV:")) != -1) {
		switch (c) {
		case 'a':
			cb.cb_all = B_TRUE;
@@ -3435,6 +3471,11 @@ zpool_do_upgrade(int argc, char **argv)
				usage(B_FALSE);
			}
			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
		case '?':
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    optopt);
@@ -3495,8 +3536,9 @@ zpool_do_upgrade(int argc, char **argv)
	(void) printf(gettext(" 11 Improved scrub performance\n"));
	(void) printf(gettext(" 12 Snapshot properties\n"));
	(void) printf(gettext(" 13 snapused property\n"));
-	(void) printf(gettext(" 14 passthrough-x aclinherit "
-	    "support\n"));
+	(void) printf(gettext(" 14 passthrough-x aclinherit\n"));
+	(void) printf(gettext(" 15 user/group space accounting\n"));
+	(void) printf(gettext(" 16 stmf property support\n"));
	(void) printf(gettext("For more information on a particular "
	    "version, including supported releases, see:\n\n"));
	(void) printf("http://www.opensolaris.org/os/community/zfs/"
View File
@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
@@ -76,6 +76,7 @@
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
+#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
@@ -92,6 +93,7 @@
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
#include <sys/refcount.h>
#include <stdio.h>
#include <stdio_ext.h>
@@ -162,6 +164,7 @@ typedef void ztest_func_t(ztest_args_t *);
 * Note: these aren't static because we want dladdr() to work.
 */
ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_commit_callbacks;
@@ -171,6 +174,7 @@ ztest_func_t ztest_traverse;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_spa_rename;
@@ -197,6 +201,7 @@ uint64_t zopt_rarely = 60;	/* every 60 seconds */
ztest_info_t ztest_info[] = {
	{ ztest_dmu_read_write,			1,	&zopt_always	},
+	{ ztest_dmu_read_write_zcopy,		1,	&zopt_always	},
	{ ztest_dmu_write_parallel,		30,	&zopt_always	},
	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
	{ ztest_dmu_commit_callbacks,		10,	&zopt_always	},
@@ -210,6 +215,7 @@ ztest_info_t ztest_info[] = {
	{ ztest_spa_rename,			1,	&zopt_rarely	},
	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
+	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
	{ ztest_vdev_aux_add_remove,		1,	&zopt_vdevtime	},
	{ ztest_scrub,				1,	&zopt_vdevtime	},
@@ -255,9 +261,11 @@ static ztest_shared_t *ztest_shared;
static int ztest_random_fd;
static int ztest_dump_core = 1;
+static uint64_t metaslab_sz;
static boolean_t ztest_exiting;
extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
#define	ZTEST_DIROBJ		1
#define	ZTEST_MICROZAP_OBJ	2
@@ -959,7 +967,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za)
	 * of devices that have pending state changes.
	 */
	if (ztest_random(2) == 0)
-		(void) vdev_online(spa, guid, B_FALSE, NULL);
+		(void) vdev_online(spa, guid, 0, NULL);
	error = spa_vdev_remove(spa, guid, B_FALSE);
	if (error != 0 && error != EBUSY)
@@ -1037,7 +1045,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
	}
	oldguid = oldvd->vdev_guid;
-	oldsize = vdev_get_rsize(oldvd);
+	oldsize = vdev_get_min_asize(oldvd);
	oldvd_is_log = oldvd->vdev_top->vdev_islog;
	(void) strcpy(oldpath, oldvd->vdev_path);
	pvd = oldvd->vdev_parent;
@@ -1073,7 +1081,7 @@ ztest_vdev_attach_detach(ztest_args_t *za)
	}
	if (newvd) {
-		newsize = vdev_get_rsize(newvd);
+		newsize = vdev_get_min_asize(newvd);
	} else {
		/*
		 * Make newsize a little bigger or smaller than oldsize.
@@ -1148,6 +1156,95 @@ ztest_vdev_attach_detach(ztest_args_t *za)
	(void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock);
}
/*
* Callback function which expands the physical size of the vdev.
*/
vdev_t *
grow_vdev(vdev_t *vd, void *arg)
{
spa_t *spa = vd->vdev_spa;
size_t *newsize = arg;
size_t fsize;
int fd;
ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
return (vd);
fsize = lseek(fd, 0, SEEK_END);
(void) ftruncate(fd, *newsize);
if (zopt_verbose >= 6) {
(void) printf("%s grew from %lu to %lu bytes\n",
vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
}
(void) close(fd);
return (NULL);
}
/*
* Callback function which expands a given vdev by calling vdev_online().
*/
/* ARGSUSED */
vdev_t *
online_vdev(vdev_t *vd, void *arg)
{
spa_t *spa = vd->vdev_spa;
vdev_t *tvd = vd->vdev_top;
vdev_t *pvd = vd->vdev_parent;
uint64_t guid = vd->vdev_guid;
ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
/* Calling vdev_online will initialize the new metaslabs */
spa_config_exit(spa, SCL_STATE, spa);
(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
/*
* Since we dropped the lock we need to ensure that we're
* still talking to the original vdev. It's possible this
* vdev may have been detached/replaced while we were
* trying to online it.
*/
if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) {
if (zopt_verbose >= 6) {
(void) printf("vdev %p has disappeared, was "
"guid %llu\n", (void *)vd, (u_longlong_t)guid);
}
return (vd);
}
return (NULL);
}
/*
* Traverse the vdev tree calling the supplied function.
* We continue to walk the tree until we either have walked all
* children or we receive a non-NULL return from the callback.
* If a NULL callback is passed, then we just return back the first
* leaf vdev we encounter.
*/
vdev_t *
vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
{
if (vd->vdev_ops->vdev_op_leaf) {
if (func == NULL)
return (vd);
else
return (func(vd, arg));
}
for (uint_t c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
return (cvd);
}
return (NULL);
}
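The walker above follows the common "first non-NULL result wins" pattern. The toy program below shows the same contract in plain, standalone C (it deliberately avoids the ZFS vdev types): the callback returns NULL to keep walking, or a node to stop the traversal early.

#include <stdio.h>

typedef struct node {
	int id;
	int nchildren;
	struct node **child;
} node_t;

static node_t *
walk(node_t *n, node_t *(*func)(node_t *, void *), void *arg)
{
	if (n->nchildren == 0)		/* leaf: apply the callback */
		return (func == NULL ? n : func(n, arg));

	for (int c = 0; c < n->nchildren; c++) {
		node_t *hit = walk(n->child[c], func, arg);
		if (hit != NULL)	/* early exit on the first match */
			return (hit);
	}
	return (NULL);
}

static node_t *
match_id(node_t *n, void *arg)
{
	return (n->id == *(int *)arg ? n : NULL);
}

int
main(void)
{
	node_t a = { 1, 0, NULL }, b = { 2, 0, NULL };
	node_t *kids[] = { &a, &b };
	node_t root = { 0, 2, kids };
	int want = 2;

	node_t *hit = walk(&root, match_id, &want);
	(void) printf("found node %d\n", hit ? hit->id : -1);
	return (0);
}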
/*
 * Verify that dynamic LUN growth works as expected.
 */
@@ -1155,44 +1252,108 @@ void
ztest_vdev_LUN_growth(ztest_args_t *za)
{
	spa_t *spa = za->za_spa;
-	char dev_name[MAXPATHLEN];
-	uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
-	uint64_t vdev;
-	size_t fsize;
-	int fd;
+	vdev_t *vd, *tvd = NULL;
+	size_t psize, newsize;
+	uint64_t spa_newsize, spa_cursize, ms_count;
-	(void) pthread_mutex_lock(&ztest_shared->zs_vdev_lock);
+	(void) mutex_lock(&ztest_shared->zs_vdev_lock);
+	mutex_enter(&spa_namespace_lock);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
-	/*
-	 * Pick a random leaf vdev.
-	 */
-	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
-	vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves);
-	spa_config_exit(spa, SCL_VDEV, FTAG);
-	(void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
-	if ((fd = open(dev_name, O_RDWR)) != -1) {
-		/*
-		 * Determine the size.
-		 */
-		fsize = lseek(fd, 0, SEEK_END);
-		/*
-		 * If it's less than 2x the original size, grow by around 3%.
-		 */
-		if (fsize < 2 * zopt_vdev_size) {
-			size_t newsize = fsize + ztest_random(fsize / 32);
-			(void) ftruncate(fd, newsize);
-			if (zopt_verbose >= 6) {
-				(void) printf("%s grew from %lu to %lu bytes\n",
-				    dev_name, (ulong_t)fsize, (ulong_t)newsize);
-			}
-		}
-		(void) close(fd);
-	}
-	(void) pthread_mutex_unlock(&ztest_shared->zs_vdev_lock);
+	while (tvd == NULL || tvd->vdev_islog) {
+		uint64_t vdev;
+
+		vdev = ztest_random(spa->spa_root_vdev->vdev_children);
+		tvd = spa->spa_root_vdev->vdev_child[vdev];
+	}
+
+	/*
+	 * Determine the size of the first leaf vdev associated with
+	 * our top-level device.
+	 */
+	vd = vdev_walk_tree(tvd, NULL, NULL);
+	ASSERT3P(vd, !=, NULL);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	psize = vd->vdev_psize;
+
+	/*
+	 * We only try to expand the vdev if it's less than 4x its
+	 * original size and it has a valid psize.
+	 */
+	if (psize == 0 || psize >= 4 * zopt_vdev_size) {
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&spa_namespace_lock);
+		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		return;
+	}
+	ASSERT(psize > 0);
+	newsize = psize + psize / 8;
+	ASSERT3U(newsize, >, psize);
+
+	if (zopt_verbose >= 6) {
+		(void) printf("Expanding vdev %s from %lu to %lu\n",
+		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+	}
+
+	spa_cursize = spa_get_space(spa);
+	ms_count = tvd->vdev_ms_count;
+
+	/*
+	 * Growing the vdev is a two step process:
+	 *	1). expand the physical size (i.e. relabel)
+	 *	2). online the vdev to create the new metaslabs
+	 */
+	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
+		if (zopt_verbose >= 5) {
+			(void) printf("Could not expand LUN because "
+			    "some vdevs were not healthy\n");
+		}
+		(void) spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&spa_namespace_lock);
+		(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
+		return;
+	}
+
+	(void) spa_config_exit(spa, SCL_STATE, spa);
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Expanding the LUN will update the config asynchronously,
+	 * thus we must wait for the async thread to complete any
+	 * pending tasks before proceeding.
+	 */
+	mutex_enter(&spa->spa_async_lock);
+	while (spa->spa_async_thread != NULL || spa->spa_async_tasks)
+		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+	mutex_exit(&spa->spa_async_lock);
+
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+	spa_newsize = spa_get_space(spa);
+
+	/*
+	 * Make sure we were able to grow the pool.
+	 */
+	if (ms_count >= tvd->vdev_ms_count ||
+	    spa_cursize >= spa_newsize) {
+		(void) printf("Top-level vdev metaslab count: "
+		    "before %llu, after %llu\n",
+		    (u_longlong_t)ms_count,
+		    (u_longlong_t)tvd->vdev_ms_count);
+		fatal(0, "LUN expansion failed: before %llu, "
+		    "after %llu\n", spa_cursize, spa_newsize);
+	} else if (zopt_verbose >= 5) {
+		char oldnumbuf[6], newnumbuf[6];
+
+		nicenum(spa_cursize, oldnumbuf);
+		nicenum(spa_newsize, newnumbuf);
+		(void) printf("%s grew from %s to %s\n",
+		    spa->spa_name, oldnumbuf, newnumbuf);
+	}
+	spa_config_exit(spa, SCL_STATE, spa);
+	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
}
/* ARGSUSED */
@@ -1438,7 +1599,8 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
	error = dmu_objset_destroy(snapname);
	if (error != 0 && error != ENOENT)
		fatal(0, "dmu_objset_destroy() = %d", error);
-	error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, FALSE);
+	error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
+	    NULL, FALSE);
	if (error == ENOSPC)
		ztest_record_enospc("dmu_take_snapshot");
	else if (error != 0 && error != EEXIST)
@@ -1446,6 +1608,148 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
	(void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock);
}
/*
* Cleanup non-standard snapshots and clones.
*/
void
ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
{
char snap1name[100];
char clone1name[100];
char snap2name[100];
char clone2name[100];
char snap3name[100];
int error;
(void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
(void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
(void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
error = dmu_objset_destroy(clone2name);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
error = dmu_objset_destroy(snap3name);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
error = dmu_objset_destroy(snap2name);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
error = dmu_objset_destroy(clone1name);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
error = dmu_objset_destroy(snap1name);
if (error && error != ENOENT)
fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
}
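For reference, the snapshot/clone naming scheme the cleanup and promote tests share, reduced to a standalone sketch ("tank/fs" and the id are example values): s1 is a snapshot of the dataset, c1 is a clone of s1 under the same parent, and s2/s3 are snapshots taken on the clone.

#include <stdio.h>

int
main(void)
{
	char snap1[100], clone1[100], snap2[100];
	const char *osname = "tank/fs";		/* example dataset name */
	unsigned long long id = 42;		/* example instance id */

	(void) snprintf(snap1, sizeof (snap1), "%s@s1_%llu", osname, id);
	(void) snprintf(clone1, sizeof (clone1), "%s/c1_%llu", osname, id);
	(void) snprintf(snap2, sizeof (snap2), "%s@s2_%llu", clone1, id);
	(void) printf("%s\n%s\n%s\n", snap1, clone1, snap2);
	return (0);
}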
/*
* Verify dsl_dataset_promote handles EBUSY
*/
void
ztest_dsl_dataset_promote_busy(ztest_args_t *za)
{
int error;
objset_t *os = za->za_os;
objset_t *clone;
dsl_dataset_t *ds;
char snap1name[100];
char clone1name[100];
char snap2name[100];
char clone2name[100];
char snap3name[100];
char osname[MAXNAMELEN];
uint64_t curval = za->za_instance;
(void) rw_rdlock(&ztest_shared->zs_name_lock);
dmu_objset_name(os, osname);
ztest_dsl_dataset_cleanup(osname, curval);
(void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
(void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
(void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
(void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
(void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
NULL, FALSE);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_take_snapshot");
goto out;
}
fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
}
error = dmu_objset_open(snap1name, DMU_OST_OTHER,
DS_MODE_USER | DS_MODE_READONLY, &clone);
if (error)
fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0,
NULL, NULL);
dmu_objset_close(clone);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_objset_create");
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
}
error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
NULL, FALSE);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_take_snapshot");
goto out;
}
fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
}
error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
NULL, FALSE);
if (error && error != EEXIST) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_take_snapshot");
goto out;
}
fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
}
error = dmu_objset_open(snap3name, DMU_OST_OTHER,
DS_MODE_USER | DS_MODE_READONLY, &clone);
if (error)
fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0,
NULL, NULL);
dmu_objset_close(clone);
if (error) {
if (error == ENOSPC) {
ztest_record_enospc("dmu_objset_create");
goto out;
}
fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
}
error = dsl_dataset_own(snap1name, DS_MODE_READONLY, FTAG, &ds);
if (error)
fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
error = dsl_dataset_promote(clone2name);
if (error != EBUSY)
fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
error);
dsl_dataset_disown(ds, FTAG);
out:
ztest_dsl_dataset_cleanup(osname, curval);
(void) rw_unlock(&ztest_shared->zs_name_lock);
}
/*
 * Verify that dmu_object_{alloc,free} work as expected.
 */
@@ -1469,7 +1773,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
	 * Create a batch object if necessary, and record it in the directory.
	 */
	VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &batchobj));
+	    sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
	if (batchobj == 0) {
		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
@@ -1494,7 +1798,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
	 */
	for (b = 0; b < batchsize; b++) {
		VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
-		    sizeof (uint64_t), &object));
+		    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
		if (object == 0)
			continue;
		/*
@@ -1529,7 +1833,7 @@ ztest_dmu_object_alloc_free(ztest_args_t *za)
	 * We expect the word at endoff to be our object number.
	 */
	VERIFY(0 == dmu_read(os, object, endoff,
-	    sizeof (uint64_t), &temp));
+	    sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
	if (temp != object) {
		fatal(0, "bad data in %s, got %llu, expected %llu",
@@ -1714,7 +2018,7 @@ ztest_dmu_read_write(ztest_args_t *za)
	 * Read the directory info.  If it's the first time, set things up.
	 */
	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (dd), &dd));
+	    sizeof (dd), &dd, DMU_READ_PREFETCH));
	if (dd.dd_chunk == 0) {
		ASSERT(dd.dd_packobj == 0);
		ASSERT(dd.dd_bigobj == 0);
@@ -1776,9 +2080,11 @@ ztest_dmu_read_write(ztest_args_t *za)
	/*
	 * Read the current contents of our objects.
	 */
-	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf);
+	error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
+	    DMU_READ_PREFETCH);
	ASSERT3U(error, ==, 0);
-	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf);
+	error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
+	    DMU_READ_PREFETCH);
	ASSERT3U(error, ==, 0);
	/*
@@ -1884,9 +2190,9 @@ ztest_dmu_read_write(ztest_args_t *za)
		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
		VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
-		    packsize, packcheck));
+		    packsize, packcheck, DMU_READ_PREFETCH));
		VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
-		    bigsize, bigcheck));
+		    bigsize, bigcheck, DMU_READ_PREFETCH));
		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
@@ -1899,6 +2205,314 @@ ztest_dmu_read_write(ztest_args_t *za)
	umem_free(bigbuf, bigsize);
}
void
compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
{
uint64_t i;
bufwad_t *pack;
bufwad_t *bigH;
bufwad_t *bigT;
/*
* For each index from n to n + s, verify that the existing bufwad
* in packobj matches the bufwads at the head and tail of the
* corresponding chunk in bigobj. Then update all three bufwads
* with the new values we want to write out.
*/
for (i = 0; i < s; i++) {
/* LINTED */
pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
/* LINTED */
bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
/* LINTED */
bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
if (pack->bw_txg > txg)
fatal(0, "future leak: got %llx, open txg is %llx",
pack->bw_txg, txg);
if (pack->bw_data != 0 && pack->bw_index != n + i)
fatal(0, "wrong index: got %llx, wanted %llx+%llx",
pack->bw_index, n, i);
if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
pack->bw_index = n + i;
pack->bw_txg = txg;
pack->bw_data = 1 + ztest_random(-2ULL);
*bigH = *pack;
*bigT = *pack;
}
}
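A worked example of the layout that loop verifies, assuming the three-field bufwad_t that ztest defines earlier (three uint64_t words) and an example 128K chunk size:

#include <stdio.h>
#include <stdint.h>

typedef struct bufwad {		/* mirrors ztest's bufwad_t: 3 x uint64_t */
	uint64_t bw_index;
	uint64_t bw_txg;
	uint64_t bw_data;
} bufwad_t;

int
main(void)
{
	uint64_t chunk = 131072;	/* example: bigobj blocksize of 128K */
	uint64_t n = 5;			/* example index */

	/* The same bufwad must appear at all three of these offsets. */
	(void) printf("packobj offset: %llu\n",
	    (unsigned long long)(n * sizeof (bufwad_t)));
	(void) printf("bigobj head:    %llu\n",
	    (unsigned long long)(n * chunk));
	(void) printf("bigobj tail:    %llu\n",
	    (unsigned long long)((n + 1) * chunk - sizeof (bufwad_t)));
	return (0);
}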
void
ztest_dmu_read_write_zcopy(ztest_args_t *za)
{
objset_t *os = za->za_os;
dmu_read_write_dir_t dd;
dmu_tx_t *tx;
uint64_t i;
int error;
uint64_t n, s, txg;
bufwad_t *packbuf, *bigbuf;
uint64_t packoff, packsize, bigoff, bigsize;
uint64_t regions = 997;
uint64_t stride = 123456789ULL;
uint64_t width = 9;
dmu_buf_t *bonus_db;
arc_buf_t **bigbuf_arcbufs;
dmu_object_info_t *doi = &za->za_doi;
/*
* This test uses two objects, packobj and bigobj, that are always
* updated together (i.e. in the same tx) so that their contents are
* in sync and can be compared. Their contents relate to each other
* in a simple way: packobj is a dense array of 'bufwad' structures,
* while bigobj is a sparse array of the same bufwads. Specifically,
* for any index n, there are three bufwads that should be identical:
*
* packobj, at offset n * sizeof (bufwad_t)
* bigobj, at the head of the nth chunk
* bigobj, at the tail of the nth chunk
*
* The chunk size is set equal to bigobj block size so that
* dmu_assign_arcbuf() can be tested for object updates.
*/
/*
* Read the directory info. If it's the first time, set things up.
*/
VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
sizeof (dd), &dd, DMU_READ_PREFETCH));
if (dd.dd_chunk == 0) {
ASSERT(dd.dd_packobj == 0);
ASSERT(dd.dd_bigobj == 0);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
ztest_record_enospc("create r/w directory");
dmu_tx_abort(tx);
return;
}
dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
DMU_OT_NONE, 0, tx);
dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
DMU_OT_NONE, 0, tx);
ztest_set_random_blocksize(os, dd.dd_packobj, tx);
ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
ASSERT(ISP2(doi->doi_data_block_size));
dd.dd_chunk = doi->doi_data_block_size;
dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
tx);
dmu_tx_commit(tx);
} else {
VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
VERIFY(ISP2(doi->doi_data_block_size));
VERIFY(dd.dd_chunk == doi->doi_data_block_size);
VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
}
/*
* Pick a random index and compute the offsets into packobj and bigobj.
*/
n = ztest_random(regions) * stride + ztest_random(width);
s = 1 + ztest_random(width - 1);
packoff = n * sizeof (bufwad_t);
packsize = s * sizeof (bufwad_t);
bigoff = n * dd.dd_chunk;
bigsize = s * dd.dd_chunk;
packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
/*
* Iteration 0 test zcopy for DB_UNCACHED dbufs.
* Iteration 1 test zcopy to already referenced dbufs.
* Iteration 2 test zcopy to dirty dbuf in the same txg.
* Iteration 3 test zcopy to dbuf dirty in previous txg.
* Iteration 4 test zcopy when dbuf is no longer dirty.
* Iteration 5 test zcopy when it can't be done.
* Iteration 6 one more zcopy write.
*/
for (i = 0; i < 7; i++) {
uint64_t j;
uint64_t off;
/*
* In iteration 5 (i == 5) use arcbufs
* that don't match bigobj blksz to test
* dmu_assign_arcbuf() when it can't directly
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db,
dd.dd_chunk);
} else {
bigbuf_arcbufs[2 * j] =
dmu_request_arcbuf(bonus_db,
dd.dd_chunk / 2);
bigbuf_arcbufs[2 * j + 1] =
dmu_request_arcbuf(bonus_db,
dd.dd_chunk / 2);
}
}
/*
* Get a tx for the mods to both packobj and bigobj.
*/
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
if (ztest_random(100) == 0) {
error = -1;
} else {
error = dmu_tx_assign(tx, TXG_WAIT);
}
if (error) {
if (error != -1) {
ztest_record_enospc("dmu r/w range");
}
dmu_tx_abort(tx);
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
if (i != 5) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
bigbuf_arcbufs[2 * j]);
dmu_return_arcbuf(
bigbuf_arcbufs[2 * j + 1]);
}
}
umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
dmu_buf_rele(bonus_db, FTAG);
return;
}
txg = dmu_tx_get_txg(tx);
/*
* 50% of the time don't read objects in the 1st iteration to
* test dmu_assign_arcbuf() for the case when there're no
* existing dbufs for the specified offsets.
*/
if (i != 0 || ztest_random(2) != 0) {
error = dmu_read(os, dd.dd_packobj, packoff,
packsize, packbuf, DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
bigbuf, DMU_READ_PREFETCH);
ASSERT3U(error, ==, 0);
}
compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
n, dd, txg);
/*
* We've verified all the old bufwads, and made new ones.
* Now write them out.
*/
dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
if (zopt_verbose >= 6) {
(void) printf("writing offset %llx size %llx"
" txg %llx\n",
(u_longlong_t)bigoff,
(u_longlong_t)bigsize,
(u_longlong_t)txg);
}
for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
dmu_buf_t *dbt;
if (i != 5) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
} else {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[2 * j]->b_data,
dd.dd_chunk / 2);
bcopy((caddr_t)bigbuf + (off - bigoff) +
dd.dd_chunk / 2,
bigbuf_arcbufs[2 * j + 1]->b_data,
dd.dd_chunk / 2);
}
if (i == 1) {
VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
FTAG, &dbt) == 0);
}
if (i != 5) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[2 * j], tx);
dmu_assign_arcbuf(bonus_db,
off + dd.dd_chunk / 2,
bigbuf_arcbufs[2 * j + 1], tx);
}
if (i == 1) {
dmu_buf_rele(dbt, FTAG);
}
}
dmu_tx_commit(tx);
/*
* Sanity check the stuff we just wrote.
*/
{
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
umem_free(packcheck, packsize);
umem_free(bigcheck, bigsize);
}
if (i == 2) {
txg_wait_open(dmu_objset_pool(os), 0);
} else if (i == 3) {
txg_wait_synced(dmu_objset_pool(os), 0);
}
}
dmu_buf_rele(bonus_db, FTAG);
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
}
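The zero-copy pattern this function exercises, distilled into a sketch against the DMU calls used above (it assumes the in-tree DMU headers and an already-assigned transaction, so it is illustrative rather than compilable on its own):

/*
 * Borrow an ARC buffer sized to the object's block size, fill it in
 * place, and donate it to the dbuf -- no second copy via dmu_write().
 */
static void
zcopy_write(dmu_buf_t *bonus_db, uint64_t off, const void *src,
    uint64_t blksz, dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(bonus_db, blksz);

	bcopy(src, abuf->b_data, blksz);
	dmu_assign_arcbuf(bonus_db, off, abuf, tx);
}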
void
ztest_dmu_check_future_leak(ztest_args_t *za)
{
@@ -1948,6 +2562,8 @@ ztest_dmu_write_parallel(ztest_args_t *za)
	uint64_t blkoff;
	zbookmark_t zb;
	dmu_tx_t *tx = dmu_tx_create(os);
+	dmu_buf_t *bonus_db;
+	arc_buf_t *abuf = NULL;
	dmu_objset_name(os, osname);
@@ -1976,6 +2592,12 @@ ztest_dmu_write_parallel(ztest_args_t *za)
		}
	}
+	if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
+	    ztest_random(8) == 0) {
+		VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
+		abuf = dmu_request_arcbuf(bonus_db, bs);
+	}
	txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
@@ -1986,6 +2608,10 @@ ztest_dmu_write_parallel(ztest_args_t *za)
			ztest_record_enospc("dmu write parallel");
		}
		dmu_tx_abort(tx);
+		if (abuf != NULL) {
+			dmu_return_arcbuf(abuf);
+			dmu_buf_rele(bonus_db, FTAG);
+		}
		return;
	}
	txg = dmu_tx_get_txg(tx);
@@ -2040,8 +2666,12 @@ ztest_dmu_write_parallel(ztest_args_t *za)
		za->za_dbuf = NULL;
	} else if (do_free) {
		VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
-	} else {
+	} else if (abuf == NULL) {
		dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
+	} else {
+		bcopy(wbt, abuf->b_data, btsize);
+		dmu_assign_arcbuf(bonus_db, off, abuf, tx);
+		dmu_buf_rele(bonus_db, FTAG);
	}
	(void) pthread_mutex_unlock(lp);
@@ -2077,16 +2707,20 @@ ztest_dmu_write_parallel(ztest_args_t *za)
	dmu_buf_rele(db, FTAG);
	za->za_dbuf = NULL;
-	(void) pthread_mutex_unlock(lp);
-	if (error)
+	if (error) {
+		(void) mutex_unlock(lp);
		return;
+	}
-	if (blk.blk_birth == 0)	/* concurrent free */
+	if (blk.blk_birth == 0) {	/* concurrent free */
+		(void) mutex_unlock(lp);
		return;
+	}
	txg_suspend(dmu_objset_pool(os));
+	(void) mutex_unlock(lp);
	ASSERT(blk.blk_fill == 1);
	ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
	ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
@@ -2159,7 +2793,7 @@ ztest_zap(ztest_args_t *za)
	 * Create a new object if necessary, and record it in the directory.
	 */
	VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
-	    sizeof (uint64_t), &object));
+	    sizeof (uint64_t), &object, DMU_READ_PREFETCH));
	if (object == 0) {
		tx = dmu_tx_create(os);
@@ -3011,7 +3645,7 @@ ztest_verify_blocks(char *pool)
	isa = strdup(isa);
	/* LINTED */
	(void) sprintf(bin,
-	    "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s",
+	    "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s",
	    isalen,
	    isa,
	    zopt_verbose >= 3 ? "s" : "",
@@ -3158,7 +3792,7 @@ ztest_resume(void *arg)
			spa_vdev_state_enter(spa);
			vdev_clear(spa, NULL);
			(void) spa_vdev_state_exit(spa, NULL, 0);
-			zio_resume(spa);
+			(void) zio_resume(spa);
		}
	return (NULL);
}
@@ -3434,6 +4068,10 @@ ztest_run(char *pool)
		(void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
		if (zopt_verbose >= 3)
			(void) printf("Destroying %s to free up space\n", name);
+
+		/* Cleanup any non-standard clones and snapshots */
+		ztest_dsl_dataset_cleanup(name, za[d].za_instance);
+
		(void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
		    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
		(void) pthread_rwlock_unlock(&ztest_shared->zs_name_lock);
@@ -3520,6 +4158,8 @@ ztest_init(char *pool)
	if (error)
		fatal(0, "spa_open() = %d", error);
+
+	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
	if (zopt_verbose >= 3)
		show_pool_stats(spa);
@@ -3611,6 +4251,9 @@ main(int argc, char **argv)
		zi->zi_call_time = 0;
	}
+	/* Set the allocation switch size */
+	metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;
+
	pid = fork();
	if (pid == -1)
View File
@@ -116,6 +116,7 @@ enum {
	EZFS_VDEVNOTSUP,	/* unsupported vdev type */
	EZFS_NOTSUP,		/* ops not supported on this dataset */
	EZFS_ACTIVE_SPARE,	/* pool has active shared spare devices */
+	EZFS_UNPLAYED_LOGS,	/* log device has unplayed logs */
	EZFS_UNKNOWN
};
@@ -178,6 +179,7 @@ extern const char *libzfs_error_action(libzfs_handle_t *);
extern const char *libzfs_error_description(libzfs_handle_t *);
extern void libzfs_mnttab_init(libzfs_handle_t *);
extern void libzfs_mnttab_fini(libzfs_handle_t *);
+extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
extern int libzfs_mnttab_find(libzfs_handle_t *, const char *,
    struct mnttab *);
extern void libzfs_mnttab_add(libzfs_handle_t *, const char *,
@@ -229,6 +231,8 @@ extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
    boolean_t *, boolean_t *);
+extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
+    boolean_t *, boolean_t *, boolean_t *);
extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
/*
@@ -335,7 +339,8 @@ extern int zpool_stage_history(libzfs_handle_t *, const char *);
extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
    size_t len);
extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
-extern int zpool_get_physpath(zpool_handle_t *, char *);
+extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
/*
 * Basic handle manipulations.  These functions do not create or destroy the
 * underlying datasets, only the references to them.
@@ -368,6 +373,10 @@ extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
    zprop_source_t *, char *, size_t, boolean_t);
extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
    zprop_source_t *, char *, size_t);
+extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
+    uint64_t *propvalue);
+extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
+    char *propbuf, int proplen, boolean_t literal);
extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
extern int zfs_prop_inherit(zfs_handle_t *, const char *);
extern const char *zfs_prop_values(zfs_prop_t);
@@ -384,6 +393,7 @@ typedef struct zprop_list {
} zprop_list_t;
extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **);
+extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
#define	ZFS_MOUNTPOINT_NONE	"none"
#define	ZFS_MOUNTPOINT_LEGACY	"legacy"
@@ -454,6 +464,12 @@ extern int zfs_send(zfs_handle_t *, const char *, const char *,
    boolean_t, boolean_t, boolean_t, boolean_t, int);
extern int zfs_promote(zfs_handle_t *);
+
+typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
+    uid_t rid, uint64_t space);
+
+extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
+    zfs_userspace_cb_t func, void *arg);
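A hedged usage sketch for this new iterator, as a brief aside (it assumes <libzfs.h>, <stdio.h>, and an already-open dataset handle zhp; ZFS_PROP_USERUSED is one of the zfs_userquota_prop_t values):

static int
print_user_space(void *arg, const char *domain, uid_t rid, uint64_t space)
{
	(void) arg;
	/* One callback invocation per user/group entry. */
	(void) printf("%s/%u: %llu bytes\n", domain, (unsigned int)rid,
	    (unsigned long long)space);
	return (0);
}

/* ... later: (void) zfs_userspace(zhp, ZFS_PROP_USERUSED, print_user_space, NULL); */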
typedef struct recvflags {
	/* print informational messages (ie, -v was specified) */
	int verbose : 1;
@@ -491,17 +507,6 @@ extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *,
    zfs_type_t);
extern int zfs_spa_version(zfs_handle_t *, int *);
-/*
- * dataset permission functions.
- */
-extern int zfs_perm_set(zfs_handle_t *, nvlist_t *);
-extern int zfs_perm_remove(zfs_handle_t *, nvlist_t *);
-extern int zfs_build_perms(zfs_handle_t *, char *, char *,
-    zfs_deleg_who_type_t, zfs_deleg_inherit_t, nvlist_t **nvlist_t);
-extern int zfs_perm_get(zfs_handle_t *, zfs_allow_t **);
-extern void zfs_free_allows(zfs_allow_t *);
-extern void zfs_deleg_permissions(void);
/*
 * Mount support functions.
 */
@@ -536,7 +541,7 @@ extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
extern int zfs_share_iscsi(zfs_handle_t *);
extern int zfs_unshare_iscsi(zfs_handle_t *);
extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *);
-extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *,
+extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
    void *, void *, int, zfs_share_op_t);
/*
@@ -574,6 +579,15 @@ extern int zpool_remove_zvol_links(zpool_handle_t *);
/* is this zvol valid for use as a dump device? */
extern int zvol_check_dump_config(char *);
+
+/*
+ * Management interfaces for SMB ACL files
+ */
+int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *);
+int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *);
+int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *);
+int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
/*
 * Enable and disable datasets within a pool by mounting/unmounting and
 * sharing/unsharing them.
View File
@@ -20,7 +20,7 @@
 */
/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
@@ -63,6 +63,7 @@ struct libzfs_handle {
	int libzfs_printerr;
	void *libzfs_sharehdl; /* libshare handle */
	uint_t libzfs_shareflags;
+	boolean_t libzfs_mnttab_enable;
	avl_tree_t libzfs_mnttab_cache;
};
#define	ZFSSHARE_MISS	0x01	/* Didn't find entry in cache */
@@ -78,6 +79,7 @@ struct zfs_handle {
	nvlist_t *zfs_user_props;
	boolean_t zfs_mntcheck;
	char *zfs_mntopts;
+	uint8_t *zfs_props_table;
};
/*
@@ -185,7 +187,7 @@ extern int zfs_init_libshare(libzfs_handle_t *, int);
extern void zfs_uninit_libshare(libzfs_handle_t *);
extern int zfs_parse_options(char *, zfs_share_proto_t);
-extern int zfs_unshare_proto(zfs_handle_t *zhp,
+extern int zfs_unshare_proto(zfs_handle_t *,
    const char *, zfs_share_proto_t *);
#ifdef __cplusplus
}
View File
@@ -20,7 +20,7 @@
 */
/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Portions Copyright 2007 Ramprakash Jelari
@@ -218,6 +218,7 @@ changelist_postfix(prop_changelist_t *clp)
		boolean_t sharenfs;
		boolean_t sharesmb;
+		boolean_t mounted;
		/*
		 * If we are in the global zone, but this dataset is exported
@@ -272,20 +273,29 @@ changelist_postfix(prop_changelist_t *clp)
		    shareopts, sizeof (shareopts), NULL, NULL, 0,
		    B_FALSE) == 0) && (strcmp(shareopts, "off") != 0));
-		if ((cn->cn_mounted || clp->cl_waslegacy || sharenfs ||
-		    sharesmb) && !zfs_is_mounted(cn->cn_handle, NULL) &&
-		    zfs_mount(cn->cn_handle, NULL, 0) != 0)
-			errors++;
+		mounted = zfs_is_mounted(cn->cn_handle, NULL);
+
+		if (!mounted && (cn->cn_mounted ||
+		    ((sharenfs || sharesmb || clp->cl_waslegacy) &&
+		    (zfs_prop_get_int(cn->cn_handle,
+		    ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) {
+			if (zfs_mount(cn->cn_handle, NULL, 0) != 0)
+				errors++;
+			else
+				mounted = TRUE;
+		}
		/*
-		 * We always re-share even if the filesystem is currently
-		 * shared, so that we can adopt any new options.
+		 * If the file system is mounted we always re-share even
+		 * if the filesystem is currently shared, so that we can
+		 * adopt any new options.
		 */
-		if (sharenfs)
+		if (sharenfs && mounted)
			errors += zfs_share_nfs(cn->cn_handle);
		else if (cn->cn_shared || clp->cl_waslegacy)
			errors += zfs_unshare_nfs(cn->cn_handle, NULL);
-		if (sharesmb)
+		if (sharesmb && mounted)
			errors += zfs_share_smb(cn->cn_handle);
		else if (cn->cn_shared || clp->cl_waslegacy)
			errors += zfs_unshare_smb(cn->cn_handle, NULL);
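The remount predicate above, distilled into a small helper for readability (a sketch, not code from this change; canmount_on stands for zfs_prop_get_int(..., ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON):

static boolean_t
should_mount(boolean_t was_mounted, boolean_t sharenfs, boolean_t sharesmb,
    boolean_t waslegacy, boolean_t canmount_on)
{
	/* Mount if it was mounted before, or if sharing requires it. */
	return (was_mounted ||
	    ((sharenfs || sharesmb || waslegacy) && canmount_on));
}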
@@ -621,8 +631,6 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,
		clp->cl_prop = ZFS_PROP_MOUNTPOINT;
	} else if (prop == ZFS_PROP_VOLSIZE) {
		clp->cl_prop = ZFS_PROP_MOUNTPOINT;
-	} else if (prop == ZFS_PROP_VERSION) {
-		clp->cl_prop = ZFS_PROP_MOUNTPOINT;
	} else {
		clp->cl_prop = prop;
	}
File diff suppressed because it is too large.

View File
@@ -19,12 +19,10 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
/*
 * Iterate over all children of the current object.  This includes the normal
 * dataset hierarchy, but also arbitrary hierarchies due to clones.  We want to
@@ -399,13 +397,6 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset)
	for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
	    ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
	    (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
-		/*
-		 * Ignore private dataset names.
-		 */
-		if (dataset_name_hidden(zc.zc_name))
-			continue;
		/*
		 * Get statistics for this dataset, to determine the type of the
		 * dataset and clone statistics.  If this fails, the dataset has
View File
@@ -42,6 +42,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zio.h>
#include <strings.h>
+#include <dlfcn.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -55,6 +56,10 @@ static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
#define	BOOTCMD	"installboot(1M)"
#endif
+#define	DISK_ROOT	"/dev/dsk"
+#define	RDISK_ROOT	"/dev/rdsk"
+#define	BACKUP_SLICE	"s2"
/*
 * ====================================================================
 *            zpool property functions
@@ -627,6 +632,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
}
/*
* Don't start the slice at the default block of 34; many storage
* devices will use a stripe width of 128k, so start there instead.
*/
#define NEW_START_BLOCK 256
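A quick check of that constant in plain C: 256 sectors of 512 bytes is exactly the 128K stripe width the comment cites.

#include <assert.h>

int
main(void)
{
	assert(256 * 512 == 128 * 1024);	/* NEW_START_BLOCK alignment */
	return (0);
}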
/*
 * Validate the given pool name, optionally putting an extended error message in
 * 'buf'.
@ -1369,46 +1380,90 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
} }
/* /*
* Find a vdev that matches the search criteria specified. We use the
* the nvpair name to determine how we should look for the device.
* 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
* spare; but FALSE if its an INUSE spare. * spare; but FALSE if its an INUSE spare.
*/ */
static nvlist_t * static nvlist_t *
vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) boolean_t *l2cache, boolean_t *log)
{ {
uint_t c, children; uint_t c, children;
nvlist_t **child; nvlist_t **child;
uint64_t theguid, present;
char *path;
uint64_t wholedisk = 0;
nvlist_t *ret; nvlist_t *ret;
uint64_t is_log; uint64_t is_log;
char *srchkey;
nvpair_t *pair = nvlist_next_nvpair(search, NULL);
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0); /* Nothing to look for */
if (search == NULL || pair == NULL)
return (NULL);
if (search == NULL && /* Obtain the key we will use to search */
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &present) == 0) { srchkey = nvpair_name(pair);
/*
* If the device has never been present since import, the only switch (nvpair_type(pair)) {
* reliable way to match the vdev is by GUID. case DATA_TYPE_UINT64: {
*/ uint64_t srchval, theguid, present;
if (theguid == guid)
return (nv); verify(nvpair_value_uint64(pair, &srchval) == 0);
} else if (search != NULL && if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &present) == 0) {
&wholedisk); /*
if (wholedisk) { * If the device has never been present since
/* * import, the only reliable way to match the
* For whole disks, the internal path has 's0', but the * vdev is by GUID.
* path passed in by the user doesn't. */
*/ verify(nvlist_lookup_uint64(nv,
if (strlen(search) == strlen(path) - 2 && ZPOOL_CONFIG_GUID, &theguid) == 0);
strncmp(search, path, strlen(search)) == 0) if (theguid == srchval)
return (nv); return (nv);
} else if (strcmp(search, path) == 0) { }
return (nv);
} }
break;
}
case DATA_TYPE_STRING: {
char *srchval, *val;
verify(nvpair_value_string(pair, &srchval) == 0);
if (nvlist_lookup_string(nv, srchkey, &val) != 0)
break;
/*
* Search for the requested value. We special case the search
* for ZPOOL_CONFIG_PATH when it's a wholedisk; all other
* searches are simple string compares.
*/
if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) {
uint64_t wholedisk = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
if (wholedisk) {
/*
* For whole disks, the internal path has 's0',
* but the path passed in by the user doesn't.
*/
if (strlen(srchval) == strlen(val) - 2 &&
strncmp(srchval, val, strlen(srchval)) == 0)
return (nv);
break;
}
}
/*
* Common case
*/
if (strcmp(srchval, val) == 0)
return (nv);
break;
}
default:
break;
} }
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
@ -1416,7 +1471,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
return (NULL); return (NULL);
for (c = 0; c < children; c++) { for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search, guid, if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) { avail_spare, l2cache, NULL)) != NULL) {
/* /*
* The 'is_log' value is only set for the toplevel * The 'is_log' value is only set for the toplevel
@ -1437,7 +1492,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) { &child, &children) == 0) {
for (c = 0; c < children; c++) { for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search, guid, if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) { avail_spare, l2cache, NULL)) != NULL) {
*avail_spare = B_TRUE; *avail_spare = B_TRUE;
return (ret); return (ret);
@ -1448,7 +1503,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0) { &child, &children) == 0) {
for (c = 0; c < children; c++) { for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search, guid, if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) { avail_spare, l2cache, NULL)) != NULL) {
*l2cache = B_TRUE; *l2cache = B_TRUE;
return (ret); return (ret);
@ -1459,24 +1514,48 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
return (NULL); return (NULL);
} }
/*
* Given a physical path (minus the "/devices" prefix), find the
* associated vdev.
*/
nvlist_t *
zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
{
nvlist_t *search, *nvroot, *ret;
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
*avail_spare = B_FALSE;
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
nvlist_free(search);
return (ret);
}
nvlist_t * nvlist_t *
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log) boolean_t *l2cache, boolean_t *log)
{ {
char buf[MAXPATHLEN]; char buf[MAXPATHLEN];
const char *search;
char *end; char *end;
nvlist_t *nvroot; nvlist_t *nvroot, *search, *ret;
uint64_t guid; uint64_t guid;
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
guid = strtoull(path, &end, 10); guid = strtoull(path, &end, 10);
if (guid != 0 && *end == '\0') { if (guid != 0 && *end == '\0') {
search = NULL; verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
} else if (path[0] != '/') { } else if (path[0] != '/') {
(void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
search = buf; verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
} else { } else {
search = path; verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
} }
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
@ -1486,8 +1565,10 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
*l2cache = B_FALSE; *l2cache = B_FALSE;
if (log != NULL) if (log != NULL)
*log = B_FALSE; *log = B_FALSE;
return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare, ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
l2cache, log)); nvlist_free(search);
return (ret);
} }
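/*
 * Usage sketch: the helper below is hypothetical, not part of this
 * change, but it shows the contract the rewritten lookup keeps. One
 * string argument resolves to a vdev whether it is a decimal GUID, a
 * short device name (expanded to /dev/dsk/...), or a full path;
 * zpool_find_vdev() now builds the appropriate search nvlist itself.
 */
static boolean_t
vdev_exists(zpool_handle_t *zhp, const char *name)
{
	boolean_t spare = B_FALSE, l2cache = B_FALSE, islog = B_FALSE;

	return (zpool_find_vdev(zhp, name, &spare, &l2cache,
	    &islog) != NULL);
}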
static int static int
@ -1504,80 +1585,141 @@ vdev_online(nvlist_t *nv)
} }
/* /*
* Get phys_path for a root pool * Helper function for zpool_get_physpaths().
* Return 0 on success; non-zero on failure.
*/ */
int static int
zpool_get_physpath(zpool_handle_t *zhp, char *physpath) vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
size_t *bytes_written)
{ {
nvlist_t *vdev_root; size_t bytes_left, pos, rsz;
nvlist_t **child; char *tmppath;
uint_t count; const char *format;
int i;
/* if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
* Make sure this is a root pool, as phys_path doesn't mean &tmppath) != 0)
* anything to a non-root pool. return (EZFS_NODEVICE);
*/
if (!pool_is_bootable(zhp))
return (-1);
verify(nvlist_lookup_nvlist(zhp->zpool_config, pos = *bytes_written;
ZPOOL_CONFIG_VDEV_TREE, &vdev_root) == 0); bytes_left = physpath_size - pos;
format = (pos == 0) ? "%s" : " %s";
if (nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
&child, &count) != 0) *bytes_written += rsz;
return (-2);
for (i = 0; i < count; i++) { if (rsz >= bytes_left) {
nvlist_t **child2; /* if physpath was not copied properly, clear it */
uint_t count2; if (bytes_left != 0) {
char *type; physpath[pos] = 0;
char *tmppath; }
int j; return (EZFS_NOSPC);
}
return (0);
}
if (nvlist_lookup_string(child[i], ZPOOL_CONFIG_TYPE, &type) static int
!= 0) vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
return (-3); size_t *rsz, boolean_t is_spare)
{
char *type;
int ret;
if (strcmp(type, VDEV_TYPE_DISK) == 0) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
if (!vdev_online(child[i])) return (EZFS_INVALCONFIG);
return (-8);
verify(nvlist_lookup_string(child[i],
ZPOOL_CONFIG_PHYS_PATH, &tmppath) == 0);
(void) strncpy(physpath, tmppath, strlen(tmppath));
} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) {
if (nvlist_lookup_nvlist_array(child[i],
ZPOOL_CONFIG_CHILDREN, &child2, &count2) != 0)
return (-4);
for (j = 0; j < count2; j++) { if (strcmp(type, VDEV_TYPE_DISK) == 0) {
if (!vdev_online(child2[j])) /*
return (-8); * An active spare device has ZPOOL_CONFIG_IS_SPARE set.
if (nvlist_lookup_string(child2[j], * For a spare vdev, we only want to boot from the active
ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0) * spare device.
return (-5); */
if (is_spare) {
uint64_t spare = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
&spare);
if (!spare)
return (EZFS_INVALCONFIG);
}
if ((strlen(physpath) + strlen(tmppath)) > if (vdev_online(nv)) {
MAXNAMELEN) if ((ret = vdev_get_one_physpath(nv, physpath,
return (-6); phypath_size, rsz)) != 0)
return (ret);
}
} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
(is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
nvlist_t **child;
uint_t count;
int i, ret;
if (strlen(physpath) == 0) { if (nvlist_lookup_nvlist_array(nv,
(void) strncpy(physpath, tmppath, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
strlen(tmppath)); return (EZFS_INVALCONFIG);
} else {
(void) strcat(physpath, " "); for (i = 0; i < count; i++) {
(void) strcat(physpath, tmppath); ret = vdev_get_physpaths(child[i], physpath,
} phypath_size, rsz, is_spare);
} if (ret == EZFS_NOSPC)
} else { return (ret);
return (-7);
} }
} }
return (EZFS_POOL_INVALARG);
}
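/*
 * A standalone sketch of the append contract implemented by
 * vdev_get_one_physpath() above: the first path is copied bare, later
 * paths get a leading space, and a partial copy is truncated away on
 * overflow. The device paths are made up for illustration.
 */
#include <stdio.h>

static int
append_path(char *buf, size_t bufsize, size_t *written, const char *path)
{
	size_t pos = *written;
	size_t rsz;

	rsz = snprintf(buf + pos, bufsize - pos, (pos == 0) ? "%s" : " %s",
	    path);
	*written += rsz;
	if (rsz >= bufsize - pos) {
		if (bufsize - pos != 0)
			buf[pos] = '\0';	/* clear the partial copy */
		return (-1);
	}
	return (0);
}

int
main(void)
{
	char physpath[64];
	size_t w = 0;

	(void) append_path(physpath, sizeof (physpath), &w, "/pci@0/disk@0");
	(void) append_path(physpath, sizeof (physpath), &w, "/pci@0/disk@1");
	(void) printf("%s\n", physpath);	/* /pci@0/disk@0 /pci@0/disk@1 */
	return (0);
}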
/*
* Get phys_path for a root pool config.
* Return 0 on success; non-zero on failure.
*/
static int
zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
{
size_t rsz;
nvlist_t *vdev_root;
nvlist_t **child;
uint_t count;
char *type;
rsz = 0;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&vdev_root) != 0)
return (EZFS_INVALCONFIG);
if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
&child, &count) != 0)
return (EZFS_INVALCONFIG);
/*
* A root pool cannot have EFI labeled disks and can only have
* a single top-level vdev.
*/
if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 ||
pool_uses_efi(vdev_root))
return (EZFS_POOL_INVALARG);
(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
B_FALSE);
/* No online devices */
if (rsz == 0)
return (EZFS_NODEVICE);
return (0); return (0);
} }
/*
* Get phys_path for a root pool
* Return 0 on success; non-zero on failure.
*/
int
zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
{
return (zpool_get_config_physpath(zhp->zpool_config, physpath,
phypath_size));
}
/* /*
* Returns TRUE if the given guid corresponds to the given type. * Returns TRUE if the given guid corresponds to the given type.
* This is used to check for hot spares (INUSE or not), and level 2 cache * This is used to check for hot spares (INUSE or not), and level 2 cache
@ -1606,6 +1748,45 @@ is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type)
return (B_FALSE); return (B_FALSE);
} }
/*
* If the device has been dynamically expanded then we need to relabel
* the disk to use the new unallocated space.
*/
static int
zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
{
char path[MAXPATHLEN];
char errbuf[1024];
int fd, error;
int (*_efi_use_whole_disk)(int);
if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT,
"efi_use_whole_disk")) == NULL)
return (-1);
(void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name);
if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s': unable to open device"), name);
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
}
/*
* We may encounter an error if the device does not have any
* unallocated space left. If so, we simply ignore that error
* and continue on.
*/
error = _efi_use_whole_disk(fd);
(void) close(fd);
if (error && error != VT_ENOSPC) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s': unable to read disk capacity"), name);
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
}
return (0);
}
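/*
 * The dlsym() lookup above is an optional-dependency pattern: the EFI
 * relabel support is resolved at run time so libzfs keeps working when
 * libefi is not present. A generic sketch of the same pattern (the
 * wrapper name and error handling are illustrative; <dlfcn.h> is
 * already included in this file):
 */
static int
try_use_whole_disk(int fd)
{
	int (*fn)(int);

	if ((fn = (int (*)(int))dlsym(RTLD_DEFAULT,
	    "efi_use_whole_disk")) == NULL)
		return (-1);	/* symbol not present; caller decides */
	return (fn(fd));
}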
/* /*
* Bring the specified vdev online. The 'flags' parameter is a set of the * Bring the specified vdev online. The 'flags' parameter is a set of the
* ZFS_ONLINE_* flags. * ZFS_ONLINE_* flags.
@ -1617,15 +1798,20 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
zfs_cmd_t zc = { 0 }; zfs_cmd_t zc = { 0 };
char msg[1024]; char msg[1024];
nvlist_t *tgt; nvlist_t *tgt;
boolean_t avail_spare, l2cache; boolean_t avail_spare, l2cache, islog;
libzfs_handle_t *hdl = zhp->zpool_hdl; libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg), if (flags & ZFS_ONLINE_EXPAND) {
dgettext(TEXT_DOMAIN, "cannot online %s"), path); (void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
} else {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot online %s"), path);
}
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
NULL)) == NULL) &islog)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg)); return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
@ -1634,6 +1820,31 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
return (zfs_error(hdl, EZFS_ISSPARE, msg)); return (zfs_error(hdl, EZFS_ISSPARE, msg));
if (flags & ZFS_ONLINE_EXPAND ||
zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
char *pathname = NULL;
uint64_t wholedisk = 0;
(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
&pathname) == 0);
/*
* XXX - L2ARC 1.0 devices can't support expansion.
*/
if (l2cache) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot expand cache devices"));
return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
}
if (wholedisk) {
pathname += strlen(DISK_ROOT) + 1;
(void) zpool_relabel_disk(zhp->zpool_hdl, pathname);
}
}
zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_cookie = VDEV_STATE_ONLINE;
zc.zc_obj = flags; zc.zc_obj = flags;
@ -1684,6 +1895,12 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
*/ */
return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
case EEXIST:
/*
* The log device has unplayed logs
*/
return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
default: default:
return (zpool_standard_error(hdl, errno, msg)); return (zpool_standard_error(hdl, errno, msg));
} }
@ -1888,6 +2105,14 @@ zpool_vdev_attach(zpool_handle_t *zhp,
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please " (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please "
"be sure to invoke %s to make '%s' bootable.\n"), "be sure to invoke %s to make '%s' bootable.\n"),
BOOTCMD, new_disk); BOOTCMD, new_disk);
/*
* XXX need a better way to prevent user from
* booting up a half-baked vdev.
*/
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
"sure to wait until resilver is done "
"before rebooting.\n"));
} }
return (0); return (0);
} }
@ -2803,14 +3028,6 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
free(mntpnt); free(mntpnt);
} }
#define RDISK_ROOT "/dev/rdsk"
#define BACKUP_SLICE "s2"
/*
* Don't start the slice at the default block of 34; many storage
* devices will use a stripe width of 128k, so start there instead.
*/
#define NEW_START_BLOCK 256
/* /*
* Read the EFI label from the config; if a label does not exist then * Read the EFI label from the config; if a label does not exist then
* pass back the error to the caller. If the caller has passed a non-NULL * pass back the error to the caller. If the caller has passed a non-NULL

View File

@ -237,6 +237,8 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
zfs_prop_t prop = zfs_name_to_prop(propname); zfs_prop_t prop = zfs_name_to_prop(propname);
nvlist_t *propnv; nvlist_t *propnv;
assert(zfs_prop_user(propname) || prop != ZPROP_INVAL);
if (!zfs_prop_user(propname) && zfs_prop_readonly(prop)) if (!zfs_prop_user(propname) && zfs_prop_readonly(prop))
continue; continue;
@ -594,12 +596,18 @@ dump_filesystem(zfs_handle_t *zhp, void *arg)
zhp->zfs_name, sdd->fromsnap); zhp->zfs_name, sdd->fromsnap);
sdd->err = B_TRUE; sdd->err = B_TRUE;
} else if (!sdd->seento) { } else if (!sdd->seento) {
(void) fprintf(stderr, if (sdd->fromsnap) {
"WARNING: could not send %s@%s:\n" (void) fprintf(stderr,
"incremental source (%s@%s) " "WARNING: could not send %s@%s:\n"
"is not earlier than it\n", "incremental source (%s@%s) "
zhp->zfs_name, sdd->tosnap, "is not earlier than it\n",
zhp->zfs_name, sdd->fromsnap); zhp->zfs_name, sdd->tosnap,
zhp->zfs_name, sdd->fromsnap);
} else {
(void) fprintf(stderr, "WARNING: "
"could not send %s@%s: does not exist\n",
zhp->zfs_name, sdd->tosnap);
}
sdd->err = B_TRUE; sdd->err = B_TRUE;
} }
} else { } else {

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -210,6 +210,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_ACTIVE_SPARE: case EZFS_ACTIVE_SPARE:
return (dgettext(TEXT_DOMAIN, "pool has active shared spare " return (dgettext(TEXT_DOMAIN, "pool has active shared spare "
"device")); "device"));
case EZFS_UNPLAYED_LOGS:
return (dgettext(TEXT_DOMAIN, "log device has unplayed intent "
"logs"));
case EZFS_UNKNOWN: case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error")); return (dgettext(TEXT_DOMAIN, "unknown error"));
default: default:
@ -364,6 +367,11 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ENOTSUP: case ENOTSUP:
zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); zfs_verror(hdl, EZFS_BADVERSION, fmt, ap);
break; break;
case EAGAIN:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool I/O is currently suspended"));
zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
break;
default: default:
zfs_error_aux(hdl, strerror(errno)); zfs_error_aux(hdl, strerror(errno));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
@ -437,6 +445,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case EDQUOT: case EDQUOT:
zfs_verror(hdl, EZFS_NOSPC, fmt, ap); zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
return (-1); return (-1);
case EAGAIN:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool I/O is currently suspended"));
zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
break;
default: default:
zfs_error_aux(hdl, strerror(error)); zfs_error_aux(hdl, strerror(error));
@ -575,6 +588,7 @@ libzfs_init(void)
zfs_prop_init(); zfs_prop_init();
zpool_prop_init(); zpool_prop_init();
libzfs_mnttab_init(hdl);
return (hdl); return (hdl);
} }
@ -592,6 +606,7 @@ libzfs_fini(libzfs_handle_t *hdl)
(void) free(hdl->libzfs_log_str); (void) free(hdl->libzfs_log_str);
zpool_free_handles(hdl); zpool_free_handles(hdl);
namespace_clear(hdl); namespace_clear(hdl);
libzfs_mnttab_fini(hdl);
free(hdl); free(hdl);
} }
@ -1209,7 +1224,7 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp,
* dataset property, * dataset property,
*/ */
if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL ||
!zfs_prop_user(propname))) { (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property '%s'"), propname); "invalid property '%s'"), propname);
return (zfs_error(hdl, EZFS_BADPROP, return (zfs_error(hdl, EZFS_BADPROP,

View File

@ -58,6 +58,7 @@ extern "C" {
#include <atomic.h> #include <atomic.h>
#include <dirent.h> #include <dirent.h>
#include <time.h> #include <time.h>
#include <libsysevent.h>
#include <sys/note.h> #include <sys/note.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/cred.h> #include <sys/cred.h>
@ -72,6 +73,7 @@ extern "C" {
#include <sys/kstat.h> #include <sys/kstat.h>
#include <sys/u8_textprep.h> #include <sys/u8_textprep.h>
#include <sys/sysevent/eventdefs.h> #include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
/* /*
* Debugging * Debugging
@ -317,6 +319,7 @@ typedef void (task_func_t)(void *);
#define TASKQ_PREPOPULATE 0x0001 #define TASKQ_PREPOPULATE 0x0001
#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale number of threads by % of online CPUs */
#define TQ_SLEEP KM_SLEEP /* Can block for memory */ #define TQ_SLEEP KM_SLEEP /* Can block for memory */
#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */
@ -542,6 +545,10 @@ typedef struct ksiddomain {
ksiddomain_t *ksid_lookupdomain(const char *); ksiddomain_t *ksid_lookupdomain(const char *);
void ksiddomain_rele(ksiddomain_t *); void ksiddomain_rele(ksiddomain_t *);
#define DDI_SLEEP KM_SLEEP
#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -174,6 +174,19 @@ taskq_create(const char *name, int nthreads, pri_t pri,
taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP);
int t; int t;
if (flags & TASKQ_THREADS_CPU_PCT) {
int pct;
ASSERT3S(nthreads, >=, 0);
ASSERT3S(nthreads, <=, 100);
pct = MIN(nthreads, 100);
pct = MAX(pct, 0);
nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100;
nthreads = MAX(nthreads, 1); /* need at least 1 thread */
} else {
ASSERT3S(nthreads, >=, 1);
}
rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
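/*
 * Worked example of the TASKQ_THREADS_CPU_PCT math above, as a
 * standalone program: on an 8-CPU machine, passing nthreads = 75 with
 * the flag set yields (8 * 75) / 100 = 6 threads, clamped to at least 1.
 */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int pct = 75;	/* the 'nthreads' argument, reinterpreted as a percentage */
	long ncpu = sysconf(_SC_NPROCESSORS_ONLN);
	long nthreads;

	if (pct > 100)
		pct = 100;
	if (pct < 0)
		pct = 0;
	nthreads = (ncpu * pct) / 100;
	if (nthreads < 1)
		nthreads = 1;	/* need at least 1 thread */
	(void) printf("%ld taskq threads on %ld CPUs\n", nthreads, ncpu);
	return (0);
}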

View File

@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_FM_FS_ZFS_H #ifndef _SYS_FM_FS_ZFS_H
#define _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -57,6 +55,7 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -48,6 +48,10 @@ typedef enum {
#define ZFS_TYPE_DATASET \ #define ZFS_TYPE_DATASET \
(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
#define ZAP_MAXNAMELEN 256
#define ZAP_MAXVALUELEN (1024 * 8)
#define ZAP_OLDMAXVALUELEN 1024
/* /*
* Dataset properties are identified by these constants and must be added to * Dataset properties are identified by these constants and must be added to
* the end of this list to ensure that external consumers are not affected * the end of this list to ensure that external consumers are not affected
@ -105,9 +109,21 @@ typedef enum {
ZFS_PROP_USEDDS, ZFS_PROP_USEDDS,
ZFS_PROP_USEDCHILD, ZFS_PROP_USEDCHILD,
ZFS_PROP_USEDREFRESERV, ZFS_PROP_USEDREFRESERV,
ZFS_PROP_USERACCOUNTING, /* not exposed to the user */
ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */
ZFS_NUM_PROPS ZFS_NUM_PROPS
} zfs_prop_t; } zfs_prop_t;
typedef enum {
ZFS_PROP_USERUSED,
ZFS_PROP_USERQUOTA,
ZFS_PROP_GROUPUSED,
ZFS_PROP_GROUPQUOTA,
ZFS_NUM_USERQUOTA_PROPS
} zfs_userquota_prop_t;
extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
/* /*
* Pool properties are identified by these constants and must be added to the * Pool properties are identified by these constants and must be added to the
* end of this list to ensure that external consumers are not affected * end of this list to ensure that external consumers are not affected
@ -130,6 +146,7 @@ typedef enum {
ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_CACHEFILE,
ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_FAILUREMODE,
ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_LISTSNAPS,
ZPOOL_PROP_AUTOEXPAND,
ZPOOL_NUM_PROPS ZPOOL_NUM_PROPS
} zpool_prop_t; } zpool_prop_t;
@ -169,6 +186,7 @@ boolean_t zfs_prop_setonce(zfs_prop_t);
const char *zfs_prop_to_name(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t);
zfs_prop_t zfs_name_to_prop(const char *); zfs_prop_t zfs_name_to_prop(const char *);
boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_user(const char *);
boolean_t zfs_prop_userquota(const char *name);
int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
boolean_t zfs_prop_valid_for_type(int, zfs_type_t); boolean_t zfs_prop_valid_for_type(int, zfs_type_t);
@ -213,6 +231,9 @@ typedef enum {
#define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GID "gid"
#define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_DELEG_PERM_GROUPS "groups"
#define ZFS_SMB_ACL_SRC "src"
#define ZFS_SMB_ACL_TARGET "target"
typedef enum { typedef enum {
ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_OFF = 0,
ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_ON = 1,
@ -226,6 +247,13 @@ typedef enum zfs_share_op {
ZFS_UNSHARE_SMB = 3 ZFS_UNSHARE_SMB = 3
} zfs_share_op_t; } zfs_share_op_t;
typedef enum zfs_smb_acl_op {
ZFS_SMB_ACL_ADD,
ZFS_SMB_ACL_REMOVE,
ZFS_SMB_ACL_RENAME,
ZFS_SMB_ACL_PURGE
} zfs_smb_acl_op_t;
typedef enum zfs_cache_type { typedef enum zfs_cache_type {
ZFS_CACHE_NONE = 0, ZFS_CACHE_NONE = 0,
ZFS_CACHE_METADATA = 1, ZFS_CACHE_METADATA = 1,
@ -250,13 +278,16 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_12 12ULL #define SPA_VERSION_12 12ULL
#define SPA_VERSION_13 13ULL #define SPA_VERSION_13 13ULL
#define SPA_VERSION_14 14ULL #define SPA_VERSION_14 14ULL
#define SPA_VERSION_15 15ULL
#define SPA_VERSION_16 16ULL
/* /*
* When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
* format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes. * and do the appropriate changes. Also bump the version number in
* usr/src/grub/capability.
*/ */
#define SPA_VERSION SPA_VERSION_14 #define SPA_VERSION SPA_VERSION_16
#define SPA_VERSION_STRING "14" #define SPA_VERSION_STRING "16"
/* /*
* Symbolic names for the changes that caused a SPA_VERSION switch. * Symbolic names for the changes that caused a SPA_VERSION switch.
@ -292,6 +323,8 @@ typedef enum zfs_cache_type {
#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12
#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13
#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14
#define SPA_VERSION_USERSPACE SPA_VERSION_15
#define SPA_VERSION_STMF_PROP SPA_VERSION_16
/* /*
* ZPL version - rev'd whenever an incompatible on-disk format change * ZPL version - rev'd whenever an incompatible on-disk format change
@ -299,19 +332,21 @@ typedef enum zfs_cache_type {
* also update the version_table[] and help message in zfs_prop.c. * also update the version_table[] and help message in zfs_prop.c.
* *
* When changing, be sure to teach GRUB how to read the new format! * When changing, be sure to teach GRUB how to read the new format!
* See usr/src/grub/grub-0.95/stage2/{zfs-include/,fsys_zfs*} * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*}
*/ */
#define ZPL_VERSION_1 1ULL #define ZPL_VERSION_1 1ULL
#define ZPL_VERSION_2 2ULL #define ZPL_VERSION_2 2ULL
#define ZPL_VERSION_3 3ULL #define ZPL_VERSION_3 3ULL
#define ZPL_VERSION ZPL_VERSION_3 #define ZPL_VERSION_4 4ULL
#define ZPL_VERSION_STRING "3" #define ZPL_VERSION ZPL_VERSION_4
#define ZPL_VERSION_STRING "4"
#define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_INITIAL ZPL_VERSION_1
#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
#define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_FUID ZPL_VERSION_3
#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
#define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
/* /*
* The following are configuration names used in the nvlist describing a pool's * The following are configuration names used in the nvlist describing a pool's
@ -361,6 +396,7 @@ typedef enum zfs_cache_type {
#define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_FAULTED "faulted"
#define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_DEGRADED "degraded"
#define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_REMOVED "removed"
#define ZPOOL_CONFIG_FRU "fru"
#define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_MIRROR "mirror"
@ -503,7 +539,7 @@ typedef struct vdev_stat {
/* /*
* And here are the things we need with /dev, etc. in front of them. * And here are the things we need with /dev, etc. in front of them.
*/ */
#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:" #define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:"
#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/" #define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/"
#define ZVOL_PROP_NAME "name" #define ZVOL_PROP_NAME "name"
@ -531,6 +567,7 @@ typedef enum zfs_ioc {
ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_ATTACH,
ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_DETACH,
ZFS_IOC_VDEV_SETPATH, ZFS_IOC_VDEV_SETPATH,
ZFS_IOC_VDEV_SETFRU,
ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_STATS,
ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_OBJSET_ZPLPROPS,
ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_DATASET_LIST_NEXT,
@ -560,7 +597,11 @@ typedef enum zfs_ioc {
ZFS_IOC_GET_FSACL, ZFS_IOC_GET_FSACL,
ZFS_IOC_ISCSI_PERM_CHECK, ZFS_IOC_ISCSI_PERM_CHECK,
ZFS_IOC_SHARE, ZFS_IOC_SHARE,
ZFS_IOC_INHERIT_PROP ZFS_IOC_INHERIT_PROP,
ZFS_IOC_SMB_ACL,
ZFS_IOC_USERSPACE_ONE,
ZFS_IOC_USERSPACE_MANY,
ZFS_IOC_USERSPACE_UPGRADE
} zfs_ioc_t; } zfs_ioc_t;
/* /*
@ -602,6 +643,7 @@ typedef enum {
#define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_CHECKREMOVE 0x1
#define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_UNSPARE 0x2
#define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_FORCEFAULT 0x4
#define ZFS_ONLINE_EXPAND 0x8
#define ZFS_OFFLINE_TEMPORARY 0x1 #define ZFS_OFFLINE_TEMPORARY 0x1
/* /*

View File

@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _ZFS_DELEG_H #ifndef _ZFS_DELEG_H
#define _ZFS_DELEG_H #define _ZFS_DELEG_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#ifdef __cplusplus #ifdef __cplusplus
@ -59,6 +57,10 @@ typedef enum {
ZFS_DELEG_NOTE_USERPROP, ZFS_DELEG_NOTE_USERPROP,
ZFS_DELEG_NOTE_MOUNT, ZFS_DELEG_NOTE_MOUNT,
ZFS_DELEG_NOTE_SHARE, ZFS_DELEG_NOTE_SHARE,
ZFS_DELEG_NOTE_USERQUOTA,
ZFS_DELEG_NOTE_GROUPQUOTA,
ZFS_DELEG_NOTE_USERUSED,
ZFS_DELEG_NOTE_GROUPUSED,
ZFS_DELEG_NOTE_NONE ZFS_DELEG_NOTE_NONE
} zfs_deleg_note_t; } zfs_deleg_note_t;

View File

@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _ZFS_NAMECHECK_H #ifndef _ZFS_NAMECHECK_H
#define _ZFS_NAMECHECK_H #define _ZFS_NAMECHECK_H
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -50,7 +48,6 @@ typedef enum {
int pool_namecheck(const char *, namecheck_err_t *, char *); int pool_namecheck(const char *, namecheck_err_t *, char *);
int dataset_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *);
int mountpoint_namecheck(const char *, namecheck_err_t *); int mountpoint_namecheck(const char *, namecheck_err_t *);
int dataset_name_hidden(const char *);
int snapshot_namecheck(const char *, namecheck_err_t *, char *); int snapshot_namecheck(const char *, namecheck_err_t *, char *);
int permset_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *);

View File

@ -19,13 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#if defined(_KERNEL) #if defined(_KERNEL)
#include <sys/systm.h> #include <sys/systm.h>
#include <sys/sunddi.h> #include <sys/sunddi.h>
@ -66,6 +63,10 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
{ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
{ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE },
{ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
{ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
{ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
{ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
{ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
{NULL, ZFS_DELEG_NOTE_NONE } {NULL, ZFS_DELEG_NOTE_NONE }
}; };

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* Common name validation routines for ZFS. These routines are shared by the * Common name validation routines for ZFS. These routines are shared by the
* userland code as well as the ioctl() layer to ensure that we don't * userland code as well as the ioctl() layer to ensure that we don't
@ -345,19 +343,3 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
return (0); return (0);
} }
/*
* Check if the dataset name is private for internal usage.
* '$' is reserved for internal dataset names. e.g. "$MOS"
*
* Return 1 if the given name is used internally.
* Return 0 if it is not.
*/
int
dataset_name_hidden(const char *name)
{
if (strchr(name, '$') != NULL)
return (1);
return (0);
}

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -43,6 +43,14 @@
static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
/* Note this is indexed by zfs_userquota_prop_t; keep the order the same */
const char *zfs_userquota_prop_prefixes[] = {
"userused@",
"userquota@",
"groupused@",
"groupquota@"
};
zprop_desc_t * zprop_desc_t *
zfs_prop_get_table(void) zfs_prop_get_table(void)
{ {
@ -133,6 +141,7 @@ zfs_prop_init(void)
{ "1", 1 }, { "1", 1 },
{ "2", 2 }, { "2", 2 },
{ "3", 3 }, { "3", 3 },
{ "4", 4 },
{ "current", ZPL_VERSION }, { "current", ZPL_VERSION },
{ NULL } { NULL }
}; };
@ -218,7 +227,7 @@ zfs_prop_init(void)
/* default index properties */ /* default index properties */
register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"1 | 2 | 3 | current", "VERSION", version_table); "1 | 2 | 3 | 4 | current", "VERSION", version_table);
register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
"CANMOUNT", canmount_table); "CANMOUNT", canmount_table);
@ -305,8 +314,13 @@ zfs_prop_init(void)
PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
"STMF_SBD_LU");
register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
ZFS_TYPE_DATASET, "GUID"); ZFS_TYPE_DATASET, "GUID");
register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL);
/* oddball properties */ /* oddball properties */
register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
@ -330,7 +344,6 @@ zfs_name_to_prop(const char *propname)
return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
} }
/* /*
* For user property names, we allow all lowercase alphanumeric characters, plus * For user property names, we allow all lowercase alphanumeric characters, plus
* a few useful punctuation characters. * a few useful punctuation characters.
@ -367,6 +380,26 @@ zfs_prop_user(const char *name)
return (B_TRUE); return (B_TRUE);
} }
/*
* Returns true if this is a valid userspace-type property (one with a '@').
* Note that after the @, any character is valid (e.g., another @, for SID
* user@domain).
*/
boolean_t
zfs_prop_userquota(const char *name)
{
zfs_userquota_prop_t prop;
for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) {
if (strncmp(name, zfs_userquota_prop_prefixes[prop],
strlen(zfs_userquota_prop_prefixes[prop])) == 0) {
return (B_TRUE);
}
}
return (B_FALSE);
}
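/*
 * Illustration of the prefix match (the property names are invented;
 * compile against the zfs headers and common code):
 */
#include <assert.h>
#include <sys/fs/zfs.h>

int
main(void)
{
	assert(zfs_prop_userquota("userquota@alice"));
	assert(zfs_prop_userquota("groupused@staff"));
	assert(zfs_prop_userquota("userused@joe@mydomain.com"));	/* SID user@domain */
	assert(!zfs_prop_userquota("compression"));
	return (0);
}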
/* /*
* Tables of index types, plus functions to convert between the user view * Tables of index types, plus functions to convert between the user view
* (strings) and internal representation (uint64_t). * (strings) and internal representation (uint64_t).

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -96,6 +96,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT,
ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
/* default index properties */ /* default index properties */
register_index(ZPOOL_PROP_FAILUREMODE, "failmode", register_index(ZPOOL_PROP_FAILUREMODE, "failmode",

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* Common routines used by zfs and zpool property management. * Common routines used by zfs and zpool property management.
*/ */
@ -205,9 +203,6 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
#ifndef _KERNEL #ifndef _KERNEL
const char *colname = prop_entry->pd_colname; const char *colname = prop_entry->pd_colname;
int c; int c;
if (colname == NULL)
return (B_FALSE);
#endif #endif
if (len == strlen(propname) && if (len == strlen(propname) &&
@ -215,7 +210,7 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
return (B_TRUE); return (B_TRUE);
#ifndef _KERNEL #ifndef _KERNEL
if (len != strlen(colname)) if (colname == NULL || len != strlen(colname))
return (B_FALSE); return (B_FALSE);
for (c = 0; c < len; c++) for (c = 0; c < len; c++)

View File

@ -124,6 +124,7 @@
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/refcount.h> #include <sys/refcount.h>
#include <sys/vdev.h> #include <sys/vdev.h>
#include <sys/vdev_impl.h>
#ifdef _KERNEL #ifdef _KERNEL
#include <sys/vmsystm.h> #include <sys/vmsystm.h>
#include <vm/anon.h> #include <vm/anon.h>
@ -397,6 +398,7 @@ static arc_state_t *arc_l2c_only;
static int arc_no_grow; /* Don't try to grow cache size */ static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve; static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
static uint64_t arc_meta_used; static uint64_t arc_meta_used;
static uint64_t arc_meta_limit; static uint64_t arc_meta_limit;
static uint64_t arc_meta_max = 0; static uint64_t arc_meta_max = 0;
@ -610,7 +612,7 @@ typedef struct l2arc_write_callback {
struct l2arc_buf_hdr { struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */ /* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */ l2arc_dev_t *b_dev; /* L2ARC device */
daddr_t b_daddr; /* disk address, offset byte */ uint64_t b_daddr; /* disk address, offset byte */
}; };
typedef struct l2arc_data_free { typedef struct l2arc_data_free {
@ -1203,6 +1205,41 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
return (buf); return (buf);
} }
static char *arc_onloan_tag = "onloan";
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
* buffers must be returned to the arc before they can be used by the DMU or
* freed.
*/
arc_buf_t *
arc_loan_buf(spa_t *spa, int size)
{
arc_buf_t *buf;
buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
atomic_add_64(&arc_loaned_bytes, size);
return (buf);
}
/*
* Return a loaned arc buffer to the arc.
*/
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(hdr->b_state == arc_anon);
ASSERT(buf->b_data != NULL);
VERIFY(refcount_remove(&hdr->b_refcnt, arc_onloan_tag) == 0);
VERIFY(refcount_add(&hdr->b_refcnt, tag) == 1);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
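/*
 * Sketch of the intended loan/return discipline (the helper name and
 * arguments are placeholders). While on loan, the buffer is excluded
 * from the dirty-data accounting in arc_tempreserve_space().
 */
static arc_buf_t *
fill_loaned_buf(spa_t *spa, int size, void *owner_tag)
{
	arc_buf_t *buf = arc_loan_buf(spa, size);

	/* ... copy in-flight data into buf->b_data ... */
	arc_return_buf(buf, owner_tag);	/* ownership moves to owner_tag */
	return (buf);
}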
static arc_buf_t * static arc_buf_t *
arc_buf_clone(arc_buf_t *from) arc_buf_clone(arc_buf_t *from)
{ {
@ -2504,7 +2541,6 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
uint32_t *arc_flags, const zbookmark_t *zb) uint32_t *arc_flags, const zbookmark_t *zb)
{ {
int err; int err;
arc_buf_hdr_t *hdr = pbuf->b_hdr;
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@ -2512,9 +2548,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
err = arc_read_nolock(pio, spa, bp, done, private, priority, err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb); zio_flags, arc_flags, zb);
ASSERT3P(hdr, ==, pbuf->b_hdr);
rw_exit(&pbuf->b_lock); rw_exit(&pbuf->b_lock);
return (err); return (err);
} }
@ -2604,7 +2639,7 @@ top:
uint64_t size = BP_GET_LSIZE(bp); uint64_t size = BP_GET_LSIZE(bp);
arc_callback_t *acb; arc_callback_t *acb;
vdev_t *vd = NULL; vdev_t *vd = NULL;
daddr_t addr; uint64_t addr;
boolean_t devw = B_FALSE; boolean_t devw = B_FALSE;
if (hdr == NULL) { if (hdr == NULL) {
@ -2923,6 +2958,7 @@ arc_release(arc_buf_t *buf, void *tag)
kmutex_t *hash_lock; kmutex_t *hash_lock;
l2arc_buf_hdr_t *l2hdr; l2arc_buf_hdr_t *l2hdr;
uint64_t buf_size; uint64_t buf_size;
boolean_t released = B_FALSE;
rw_enter(&buf->b_lock, RW_WRITER); rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr; hdr = buf->b_hdr;
@ -2938,12 +2974,12 @@ arc_release(arc_buf_t *buf, void *tag)
ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_efunc == NULL);
arc_buf_thaw(buf); arc_buf_thaw(buf);
rw_exit(&buf->b_lock); rw_exit(&buf->b_lock);
return; released = B_TRUE;
} else {
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
} }
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
l2hdr = hdr->b_l2hdr; l2hdr = hdr->b_l2hdr;
if (l2hdr) { if (l2hdr) {
mutex_enter(&l2arc_buflist_mtx); mutex_enter(&l2arc_buflist_mtx);
@ -2951,6 +2987,9 @@ arc_release(arc_buf_t *buf, void *tag)
buf_size = hdr->b_size; buf_size = hdr->b_size;
} }
if (released)
goto out;
/* /*
* Do we have more than one buf? * Do we have more than one buf?
*/ */
@ -3018,6 +3057,7 @@ arc_release(arc_buf_t *buf, void *tag)
buf->b_efunc = NULL; buf->b_efunc = NULL;
buf->b_private = NULL; buf->b_private = NULL;
out:
if (l2hdr) { if (l2hdr) {
list_remove(l2hdr->b_dev->l2ad_buflist, hdr); list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
@ -3311,10 +3351,9 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
} }
static int static int
arc_memory_throttle(uint64_t reserve, uint64_t txg) arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
{ {
#ifdef _KERNEL #ifdef _KERNEL
uint64_t inflight_data = arc_anon->arcs_size;
uint64_t available_memory = ptob(freemem); uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0; static uint64_t page_load = 0;
static uint64_t last_txg = 0; static uint64_t last_txg = 0;
@ -3376,6 +3415,7 @@ int
arc_tempreserve_space(uint64_t reserve, uint64_t txg) arc_tempreserve_space(uint64_t reserve, uint64_t txg)
{ {
int error; int error;
uint64_t anon_size;
#ifdef ZFS_DEBUG #ifdef ZFS_DEBUG
/* /*
@ -3391,12 +3431,19 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
if (reserve > arc_c) if (reserve > arc_c)
return (ENOMEM); return (ENOMEM);
/*
* Don't count loaned bufs as in flight dirty data to prevent long
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
/* /*
* Writes will, almost always, require additional memory allocations * Writes will, almost always, require additional memory allocations
* in order to compress/encrypt/etc the data. We therefore need to * in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this. * make sure that there is sufficient available memory for this.
*/ */
if (error = arc_memory_throttle(reserve, txg)) if (error = arc_memory_throttle(reserve, anon_size, txg))
return (error); return (error);
/* /*
@ -3406,8 +3453,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* Note: if two requests come in concurrently, we might let them * Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal. * both succeed, when one of them should fail. Not a huge deal.
*/ */
if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
arc_anon->arcs_size > arc_c / 4) { if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
anon_size > arc_c / 4) {
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
arc_tempreserve>>10, arc_tempreserve>>10,
@ -3592,6 +3640,8 @@ arc_fini(void)
mutex_destroy(&zfs_write_limit_lock); mutex_destroy(&zfs_write_limit_lock);
buf_fini(); buf_fini();
ASSERT(arc_loaned_bytes == 0);
} }
/* /*
@ -4486,7 +4536,7 @@ l2arc_vdev_present(vdev_t *vd)
* validated the vdev and opened it. * validated the vdev and opened it.
*/ */
void void
l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{ {
l2arc_dev_t *adddev; l2arc_dev_t *adddev;
@ -4500,8 +4550,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
adddev->l2ad_vdev = vd; adddev->l2ad_vdev = vd;
adddev->l2ad_write = l2arc_write_max; adddev->l2ad_write = l2arc_write_max;
adddev->l2ad_boost = l2arc_write_boost; adddev->l2ad_boost = l2arc_write_boost;
adddev->l2ad_start = start; adddev->l2ad_start = VDEV_LABEL_START_SIZE;
adddev->l2ad_end = end; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_hand = adddev->l2ad_start;
adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE; adddev->l2ad_first = B_TRUE;

View File

@ -329,7 +329,7 @@ dbuf_verify(dmu_buf_impl_t *db)
if (db->db_parent == dn->dn_dbuf) { if (db->db_parent == dn->dn_dbuf) {
/* db is pointed to by the dnode */ /* db is pointed to by the dnode */
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
if (db->db.db_object == DMU_META_DNODE_OBJECT) if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
ASSERT(db->db_parent == NULL); ASSERT(db->db_parent == NULL);
else else
ASSERT(db->db_parent != NULL); ASSERT(db->db_parent != NULL);
@ -465,15 +465,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
ASSERT(db->db_buf == NULL); ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) { if (db->db_blkid == DB_BONUS_BLKID) {
int bonuslen = dn->dn_bonuslen; int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
ASSERT3U(bonuslen, <=, db->db.db_size); ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
if (bonuslen < DN_MAX_BONUSLEN) if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN); bzero(db->db.db_data, DN_MAX_BONUSLEN);
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, if (bonuslen)
bonuslen); bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
dbuf_update_data(db); dbuf_update_data(db);
db->db_state = DB_CACHED; db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
@ -908,15 +908,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* Shouldn't dirty a regular buffer in syncing context. Private * Shouldn't dirty a regular buffer in syncing context. Private
* objects may be dirtied in syncing context, but only if they * objects may be dirtied in syncing context, but only if they
* were already pre-dirtied in open context. * were already pre-dirtied in open context.
* XXX We may want to prohibit dirtying in syncing context even
* if they did pre-dirty.
*/ */
ASSERT(!dmu_tx_is_syncing(tx) || ASSERT(!dmu_tx_is_syncing(tx) ||
BP_IS_HOLE(dn->dn_objset->os_rootbp) || BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
dn->dn_object == DMU_META_DNODE_OBJECT || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
dn->dn_objset->os_dsl_dataset == NULL || dn->dn_objset->os_dsl_dataset == NULL);
dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
/* /*
* We make this assert for private objects as well, but after we * We make this assert for private objects as well, but after we
* check if we're already dirty. They are allowed to re-dirty * check if we're already dirty. They are allowed to re-dirty
@ -975,7 +971,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/* /*
* Only valid if not already dirty. * Only valid if not already dirty.
*/ */
ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == ASSERT(dn->dn_object == 0 ||
dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT3U(dn->dn_nlevels, >, db->db_level);
@ -987,15 +984,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/* /*
* We should only be dirtying in syncing context if it's the * We should only be dirtying in syncing context if it's the
* mos, a spa os, or we're initializing the os. However, we are * mos or we're initializing the os or it's a special object.
* allowed to dirty in syncing context provided we already * However, we are allowed to dirty in syncing context provided
* dirtied it in open context. Hence we must make this * we already dirtied it in open context. Hence we must make
* assertion only if we're not already dirty. * this assertion only if we're not already dirty.
*/ */
ASSERT(!dmu_tx_is_syncing(tx) || ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
!dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
!BP_IS_HOLE(os->os_rootbp));
ASSERT(db->db.db_size != 0); ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@ -1311,6 +1306,68 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
} }
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
*/
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
ASSERT(!refcount_is_zero(&db->db_holds));
ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(db->db_level == 0);
ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
ASSERT(buf != NULL);
ASSERT(arc_buf_size(buf) == db->db.db_size);
ASSERT(tx->tx_txg != 0);
arc_return_buf(buf, db);
ASSERT(arc_released(buf));
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
if (db->db_state == DB_CACHED &&
refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
VERIFY(arc_buf_remove_ref(buf, db) == 1);
return;
}
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = db->db_last_dirty;
ASSERT(db->db_buf != NULL);
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
ASSERT(dr->dt.dl.dr_data == db->db_buf);
if (!arc_released(db->db_buf)) {
ASSERT(dr->dt.dl.dr_override_state ==
DR_OVERRIDDEN);
arc_release(db->db_buf, db);
}
dr->dt.dl.dr_data = buf;
VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
arc_release(db->db_buf, db);
VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
}
db->db_buf = NULL;
}
ASSERT(db->db_buf == NULL);
dbuf_set_data(db, buf);
db->db_state = DB_FILL;
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
dbuf_fill_done(db, tx);
}
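/*
 * Sketch of the zero-copy write path this enables (the function name
 * is a placeholder; the buffer would be filled before entering the
 * transaction). Note that dbuf_assign_arcbuf() calls arc_return_buf()
 * itself, so the caller must not return the loan separately.
 */
static void
zero_copy_write(spa_t *spa, dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	arc_buf_t *buf = arc_loan_buf(spa, (int)db->db.db_size);

	/* ... fill buf->b_data with the payload ... */
	dbuf_assign_arcbuf(db, buf, tx);
}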
/* /*
* "Clear" the contents of this dbuf. This will mark the dbuf * "Clear" the contents of this dbuf. This will mark the dbuf
* EVICTING and clear *most* of its references. Unfortunately, * EVICTING and clear *most* of its references. Unfortunately,
@ -1855,6 +1912,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user_ptr); return (db->db_user_ptr);
} }
boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
boolean_t res = B_FALSE;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
if (db->db_blkptr)
res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
db->db_blkptr->blk_birth);
return (res);
}
static void static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{ {
@ -1943,7 +2013,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode; dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset; objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg; uint64_t txg = tx->tx_txg;
int blksz;
ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dmu_tx_is_syncing(tx));
@ -2049,32 +2118,25 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
return; return;
} }
if (db->db_state != DB_NOFILL) { if (db->db_state != DB_NOFILL &&
blksz = arc_buf_size(*datap); dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
if (dn->dn_object != DMU_META_DNODE_OBJECT) { *datap == db->db_buf) {
/* /*
* If this buffer is currently "in use" (i.e., there * If this buffer is currently "in use" (i.e., there
* are active holds and db_data still references it), * are active holds and db_data still references it),
* then make a copy before we start the write so that * then make a copy before we start the write so that
* any modifications from the open txg will not leak * any modifications from the open txg will not leak
* into this write. * into this write.
* *
* NOTE: this copy does not need to be made for * NOTE: this copy does not need to be made for
* objects only modified in the syncing context (e.g. * objects only modified in the syncing context (e.g.
* DNONE_DNODE blocks). * DNONE_DNODE blocks).
*/ */
if (refcount_count(&db->db_holds) > 1 && int blksz = arc_buf_size(*datap);
*datap == db->db_buf) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
arc_buf_contents_t type = *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
DBUF_GET_BUFC_TYPE(db); bcopy(db->db.db_data, (*datap)->b_data, blksz);
*datap =
arc_buf_alloc(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
}
ASSERT(*datap != NULL);
} }
db->db_data_pending = dr; db->db_data_pending = dr;


@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -85,6 +85,8 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
     { byteswap_uint64_array, TRUE, "FUID table size" },
     { zap_byteswap, TRUE, "DSL dataset next clones"},
     { zap_byteswap, TRUE, "scrub work queue" },
+    { zap_byteswap, TRUE, "ZFS user/group used" },
+    { zap_byteswap, TRUE, "ZFS user/group quota" },
 };

 int
@@ -180,22 +182,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
  * whose dnodes are in the same block.
  */
 static int
-dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
     dsl_pool_t *dp = NULL;
     dmu_buf_t **dbp;
     uint64_t blkid, nblks, i;
-    uint32_t flags;
+    uint32_t dbuf_flags;
     int err;
     zio_t *zio;
     hrtime_t start;

     ASSERT(length <= DMU_MAX_ACCESS);

-    flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
-    if (length > zfetch_array_rd_sz)
-        flags |= DB_RF_NOPREFETCH;
+    dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+    if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
+        dbuf_flags |= DB_RF_NOPREFETCH;

     rw_enter(&dn->dn_struct_rwlock, RW_READER);
     if (dn->dn_datablkshift) {
@@ -233,7 +235,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
         /* initiate async i/o */
         if (read) {
             rw_exit(&dn->dn_struct_rwlock);
-            (void) dbuf_read(db, zio, flags);
+            (void) dbuf_read(db, zio, dbuf_flags);
             rw_enter(&dn->dn_struct_rwlock, RW_READER);
         }
         dbp[i] = &db->db;
@@ -285,7 +287,7 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
         return (err);

     err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-        numbufsp, dbpp);
+        numbufsp, dbpp, DMU_READ_PREFETCH);

     dnode_rele(dn, FTAG);
@@ -300,7 +302,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     int err;

     err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-        numbufsp, dbpp);
+        numbufsp, dbpp, DMU_READ_PREFETCH);

     return (err);
 }
@@ -442,7 +444,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     object_size = align == 1 ? dn->dn_datablksz :
         (dn->dn_maxblkid + 1) << dn->dn_datablkshift;

-    if (trunc || (end = offset + length) > object_size)
+    end = offset + length;
+    if (trunc || end > object_size)
         end = object_size;
     if (end <= offset)
         return (0);
@@ -450,6 +453,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     while (length) {
         start = end;
+        /* assert(offset <= start) */
         err = get_next_chunk(dn, &start, offset);
         if (err)
             return (err);
@@ -540,7 +544,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
-    void *buf)
+    void *buf, uint32_t flags)
 {
     dnode_t *dn;
     dmu_buf_t **dbp;
@@ -570,7 +574,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
          * to be reading in parallel.
          */
         err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
-            TRUE, FTAG, &numbufs, &dbp);
+            TRUE, FTAG, &numbufs, &dbp, flags);
         if (err)
             break;
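
With the extra argument, callers of dmu_read() choose prefetch behavior explicitly instead of relying only on the old length-based heuristic. A minimal sketch of the two call styles this enables (the surrounding variables are placeholders, not taken from this diff):

    /* Bulk sequential read: let the DMU prefetch ahead. */
    err = dmu_read(os, object, offset, size, buf, DMU_READ_PREFETCH);

    /* Isolated random read: suppress prefetch to avoid wasted I/O. */
    err = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH);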
@@ -810,6 +814,58 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 }
 #endif

+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+    dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+
+    return (arc_loan_buf(dn->dn_objset->os_spa, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+    arc_return_buf(buf, FTAG);
+    VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
+}
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+    dmu_tx_t *tx)
+{
+    dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+    dmu_buf_impl_t *db;
+    uint32_t blksz = (uint32_t)arc_buf_size(buf);
+    uint64_t blkid;
+
+    rw_enter(&dn->dn_struct_rwlock, RW_READER);
+    blkid = dbuf_whichblock(dn, offset);
+    VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
+    rw_exit(&dn->dn_struct_rwlock);
+
+    if (offset == db->db.db_offset && blksz == db->db.db_size) {
+        dbuf_assign_arcbuf(db, buf, tx);
+        dbuf_rele(db, FTAG);
+    } else {
+        dbuf_rele(db, FTAG);
+        ASSERT(dn->dn_objset->os.os == dn->dn_objset);
+        dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz,
+            buf->b_data, tx);
+        dmu_return_arcbuf(buf);
+    }
+}
+
 typedef struct {
     dbuf_dirty_record_t *dr;
     dmu_sync_cb_t *done;
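
Taken together, dmu_request_arcbuf(), dmu_assign_arcbuf(), and dmu_return_arcbuf() give callers a zero-copy write path: borrow an anonymous ARC buffer, fill it, and hand it over at write time. A minimal usage sketch, assuming bonus is a held dmu_buf_t for the target object, off is block-aligned, blksz matches the object's block size, and src/tx are the caller's data and assigned transaction (all placeholder names, not from this diff):

    arc_buf_t *abuf = dmu_request_arcbuf(bonus, blksz);

    bcopy(src, abuf->b_data, blksz);            /* fill the loaned buffer */
    dmu_assign_arcbuf(bonus, off, abuf, tx);    /* consumes abuf */

Note that dmu_assign_arcbuf() consumes the buffer on both paths: it either wires it directly into the dbuf or falls back to dmu_write() and returns the loan itself, so the caller never frees abuf after this call.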


@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
@@ -108,22 +106,56 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
 int
 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
-    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    int blocksize, dmu_object_type_t bonustype, int bonuslen)
 {
     dnode_t *dn;
+    dmu_tx_t *tx;
+    int nblkptr;
     int err;

-    if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+    if (object == DMU_META_DNODE_OBJECT)
         return (EBADF);

     err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
         FTAG, &dn);
     if (err)
         return (err);
+
+    if (dn->dn_type == ot && dn->dn_datablksz == blocksize &&
+        dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) {
+        /* nothing is changing, this is a noop */
+        dnode_rele(dn, FTAG);
+        return (0);
+    }
+
+    nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+
+    /*
+     * If we are losing blkptrs or changing the block size this must
+     * be a new file instance. We must clear out the previous file
+     * contents before we can change this type of metadata in the dnode.
+     */
+    if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) {
+        err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+        if (err)
+            goto out;
+    }
+
+    tx = dmu_tx_create(os);
+    dmu_tx_hold_bonus(tx, object);
+    err = dmu_tx_assign(tx, TXG_WAIT);
+    if (err) {
+        dmu_tx_abort(tx);
+        goto out;
+    }
+
     dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+
+    dmu_tx_commit(tx);
+out:
     dnode_rele(dn, FTAG);
-    return (0);
+    return (err);
 }

 int
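
Because dmu_object_reclaim() now creates, assigns, and commits its own transaction (and clears old file contents itself when the block size or blkptr count must change), callers simply drop the dmu_tx_t argument; restore_object() later in this commit is updated to match. A sketch of the new calling convention, with illustrative type and size values only:

    /* Reuse an allocated object in place; no transaction is passed in. */
    err = dmu_object_reclaim(os, object, DMU_OT_PLAIN_FILE_CONTENTS,
        4096, DMU_OT_NONE, 0);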


@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -164,10 +164,15 @@ dmu_objset_byteswap(void *buf, size_t size)
 {
     objset_phys_t *osp = buf;

-    ASSERT(size == sizeof (objset_phys_t));
+    ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
     dnode_byteswap(&osp->os_meta_dnode);
     byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
     osp->os_type = BSWAP_64(osp->os_type);
+    osp->os_flags = BSWAP_64(osp->os_flags);
+    if (size == sizeof (objset_phys_t)) {
+        dnode_byteswap(&osp->os_userused_dnode);
+        dnode_byteswap(&osp->os_groupused_dnode);
+    }
 }

 int
@@ -210,12 +215,30 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
             err = EIO;
             return (err);
         }
+
+        /* Increase the blocksize if we are permitted. */
+        if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
+            arc_buf_size(osi->os_phys_buf) < sizeof (objset_phys_t)) {
+            arc_buf_t *buf = arc_buf_alloc(spa,
+                sizeof (objset_phys_t), &osi->os_phys_buf,
+                ARC_BUFC_METADATA);
+            bzero(buf->b_data, sizeof (objset_phys_t));
+            bcopy(osi->os_phys_buf->b_data, buf->b_data,
+                arc_buf_size(osi->os_phys_buf));
+            (void) arc_buf_remove_ref(osi->os_phys_buf,
+                &osi->os_phys_buf);
+            osi->os_phys_buf = buf;
+        }
+
         osi->os_phys = osi->os_phys_buf->b_data;
+        osi->os_flags = osi->os_phys->os_flags;
     } else {
-        osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+        int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
+            sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
+        osi->os_phys_buf = arc_buf_alloc(spa, size,
             &osi->os_phys_buf, ARC_BUFC_METADATA);
         osi->os_phys = osi->os_phys_buf->b_data;
-        bzero(osi->os_phys, sizeof (objset_phys_t));
+        bzero(osi->os_phys, size);
     }

     /*
@@ -276,6 +299,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     osi->os_meta_dnode = dnode_special_open(osi,
         &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+    if (arc_buf_size(osi->os_phys_buf) >= sizeof (objset_phys_t)) {
+        osi->os_userused_dnode = dnode_special_open(osi,
+            &osi->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
+        osi->os_groupused_dnode = dnode_special_open(osi,
+            &osi->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
+    }

     /*
      * We should be the only thread trying to do this because we
@@ -456,13 +485,15 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
     os.os = osi;
     (void) dmu_objset_evict_dbufs(&os);

-    ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
-    ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
-    ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
-
     dnode_special_close(osi->os_meta_dnode);
+    if (osi->os_userused_dnode) {
+        dnode_special_close(osi->os_userused_dnode);
+        dnode_special_close(osi->os_groupused_dnode);
+    }
     zil_free(osi->os_zil);

+    ASSERT3P(list_head(&osi->os_dnodes), ==, NULL);
+
     VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
     mutex_destroy(&osi->os_lock);
     mutex_destroy(&osi->os_obj_lock);
@@ -520,6 +551,10 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     ASSERT(type != DMU_OST_ANY);
     ASSERT(type < DMU_OST_NUMTYPES);
     osi->os_phys->os_type = type;
+    if (dmu_objset_userused_enabled(osi)) {
+        osi->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+        osi->os_flags = osi->os_phys->os_flags;
+    }

     dsl_dataset_dirty(ds, tx);
@@ -704,13 +739,33 @@ struct snaparg {
     char *snapname;
     char failed[MAXPATHLEN];
     boolean_t checkperms;
-    list_t objsets;
+    nvlist_t *props;
 };

-struct osnode {
-    list_node_t node;
-    objset_t *os;
-};
+static int
+snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+    objset_t *os = arg1;
+    struct snaparg *sn = arg2;
+
+    /* The props have already been checked by zfs_check_userprops(). */
+
+    return (dsl_dataset_snapshot_check(os->os->os_dsl_dataset,
+        sn->snapname, tx));
+}
+
+static void
+snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+    objset_t *os = arg1;
+    dsl_dataset_t *ds = os->os->os_dsl_dataset;
+    struct snaparg *sn = arg2;
+
+    dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+
+    if (sn->props)
+        dsl_props_set_sync(ds->ds_prev, sn->props, cr, tx);
+}

 static int
 dmu_objset_snapshot_one(char *name, void *arg)
@@ -747,13 +802,8 @@ dmu_objset_snapshot_one(char *name, void *arg)
      */
     err = zil_suspend(dmu_objset_zil(os));
     if (err == 0) {
-        struct osnode *osn;
-        dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check,
-            dsl_dataset_snapshot_sync, os->os->os_dsl_dataset,
-            sn->snapname, 3);
-        osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP);
-        osn->os = os;
-        list_insert_tail(&sn->objsets, osn);
+        dsl_sync_task_create(sn->dstg, snapshot_check,
+            snapshot_sync, os, sn, 3);
     } else {
         dmu_objset_close(os);
     }
@@ -762,11 +812,11 @@ dmu_objset_snapshot_one(char *name, void *arg)
 }

 int
-dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
+dmu_objset_snapshot(char *fsname, char *snapname,
+    nvlist_t *props, boolean_t recursive)
 {
     dsl_sync_task_t *dst;
-    struct osnode *osn;
-    struct snaparg sn = { 0 };
+    struct snaparg sn;
     spa_t *spa;
     int err;
@@ -778,8 +828,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
     sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
     sn.snapname = snapname;
-    list_create(&sn.objsets, sizeof (struct osnode),
-        offsetof(struct osnode, node));
+    sn.props = props;

     if (recursive) {
         sn.checkperms = B_TRUE;
@@ -790,27 +839,19 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive)
         err = dmu_objset_snapshot_one(fsname, &sn);
     }

-    if (err)
-        goto out;
-
-    err = dsl_sync_task_group_wait(sn.dstg);
+    if (err == 0)
+        err = dsl_sync_task_group_wait(sn.dstg);

     for (dst = list_head(&sn.dstg->dstg_tasks); dst;
         dst = list_next(&sn.dstg->dstg_tasks, dst)) {
-        dsl_dataset_t *ds = dst->dst_arg1;
+        objset_t *os = dst->dst_arg1;
+        dsl_dataset_t *ds = os->os->os_dsl_dataset;
         if (dst->dst_err)
             dsl_dataset_name(ds, sn.failed);
+        zil_resume(dmu_objset_zil(os));
+        dmu_objset_close(os);
     }

-out:
-    while (osn = list_head(&sn.objsets)) {
-        list_remove(&sn.objsets, osn);
-        zil_resume(dmu_objset_zil(osn->os));
-        dmu_objset_close(osn->os);
-        kmem_free(osn, sizeof (struct osnode));
-    }
-    list_destroy(&sn.objsets);
-
     if (err)
         (void) strcpy(fsname, sn.failed);

     dsl_sync_task_group_destroy(sn.dstg);
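
dmu_objset_snapshot() now threads an optional property nvlist into the sync task, where snapshot_sync() applies it via dsl_props_set_sync() on the just-created snapshot. A plausible caller sketch (the property name and error handling are illustrative, not from this diff):

    nvlist_t *props;

    VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_string(props, "com.example:note", "nightly") == 0);
    err = dmu_objset_snapshot(fsname, snapname, props, B_FALSE);
    nvlist_free(props);

Passing NULL for props preserves the old behavior, since snapshot_sync() only calls dsl_props_set_sync() when sn->props is set.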
@@ -819,7 +860,7 @@ out:
 }

 static void
-dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
+dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 {
     dnode_t *dn;
@@ -827,14 +868,20 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
         ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
         ASSERT(dn->dn_dbuf->db_data_pending);
         /*
-         * Initialize dn_zio outside dnode_sync()
-         * to accomodate meta-dnode
+         * Initialize dn_zio outside dnode_sync() because the
+         * meta-dnode needs to set it ouside dnode_sync().
          */
         dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
         ASSERT(dn->dn_zio);

         ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
         list_remove(list, dn);
+
+        if (newlist) {
+            (void) dnode_add_ref(dn, newlist);
+            list_insert_tail(newlist, dn);
+        }
+
         dnode_sync(dn, tx);
     }
 }
@@ -853,9 +900,12 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg)
     ASSERT(BP_GET_LEVEL(bp) == 0);

     /*
-     * Update rootbp fill count.
+     * Update rootbp fill count: it should be the number of objects
+     * allocated in the object set (not counting the "special"
+     * objects that are stored in the objset_phys_t -- the meta
+     * dnode and user/group accounting objects).
      */
-    bp->blk_fill = 1;    /* count the meta-dnode */
+    bp->blk_fill = 0;
     for (int i = 0; i < dnp->dn_nblkptr; i++)
         bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
@@ -878,6 +928,7 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
     writeprops_t wp = { 0 };
     zio_t *zio;
     list_t *list;
+    list_t *newlist = NULL;
     dbuf_dirty_record_t *dr;

     dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
@@ -915,20 +966,41 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
     }

     arc_release(os->os_phys_buf, &os->os_phys_buf);
+
     zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
         tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
         ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

     /*
-     * Sync meta-dnode - the parent IO for the sync is the root block
+     * Sync special dnodes - the parent IO for the sync is the root block
      */
     os->os_meta_dnode->dn_zio = zio;
     dnode_sync(os->os_meta_dnode, tx);

+    os->os_phys->os_flags = os->os_flags;
+
+    if (os->os_userused_dnode &&
+        os->os_userused_dnode->dn_type != DMU_OT_NONE) {
+        os->os_userused_dnode->dn_zio = zio;
+        dnode_sync(os->os_userused_dnode, tx);
+        os->os_groupused_dnode->dn_zio = zio;
+        dnode_sync(os->os_groupused_dnode, tx);
+    }
+
     txgoff = tx->tx_txg & TXG_MASK;

-    dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
-    dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
+    if (dmu_objset_userused_enabled(os)) {
+        newlist = &os->os_synced_dnodes;
+        /*
+         * We must create the list here because it uses the
+         * dn_dirty_link[] of this txg.
+         */
+        list_create(newlist, sizeof (dnode_t),
+            offsetof(dnode_t, dn_dirty_link[txgoff]));
+    }
+
+    dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
+    dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

     list = &os->os_meta_dnode->dn_dirty_records[txgoff];
     while (dr = list_head(list)) {
@@ -945,6 +1017,145 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
     zio_nowait(zio);
 }

+static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
+
+void
+dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
+{
+    used_cbs[ost] = cb;
+}
+
+boolean_t
+dmu_objset_userused_enabled(objset_impl_t *os)
+{
+    return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
+        used_cbs[os->os_phys->os_type] &&
+        os->os_userused_dnode);
+}
+
+void
+dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx)
+{
+    dnode_t *dn;
+    list_t *list = &os->os_synced_dnodes;
+    static const char zerobuf[DN_MAX_BONUSLEN] = {0};
+
+    ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
+
+    while (dn = list_head(list)) {
+        dmu_object_type_t bonustype;
+
+        ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
+        ASSERT(dn->dn_oldphys);
+        ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
+            dn->dn_phys->dn_flags &
+            DNODE_FLAG_USERUSED_ACCOUNTED);
+
+        /* Allocate the user/groupused objects if necessary. */
+        if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
+            VERIFY(0 == zap_create_claim(&os->os,
+                DMU_USERUSED_OBJECT,
+                DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+            VERIFY(0 == zap_create_claim(&os->os,
+                DMU_GROUPUSED_OBJECT,
+                DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+        }
+
+        /*
+         * If the object was not previously
+         * accounted, pretend that it was free.
+         */
+        if (!(dn->dn_oldphys->dn_flags &
+            DNODE_FLAG_USERUSED_ACCOUNTED)) {
+            bzero(dn->dn_oldphys, sizeof (dnode_phys_t));
+        }
+
+        /*
+         * If the object was freed, use the previous bonustype.
+         */
+        bonustype = dn->dn_phys->dn_bonustype ?
+            dn->dn_phys->dn_bonustype : dn->dn_oldphys->dn_bonustype;
+        ASSERT(dn->dn_phys->dn_type != 0 ||
+            (bcmp(DN_BONUS(dn->dn_phys), zerobuf,
+            DN_MAX_BONUSLEN) == 0 &&
+            DN_USED_BYTES(dn->dn_phys) == 0));
+        ASSERT(dn->dn_oldphys->dn_type != 0 ||
+            (bcmp(DN_BONUS(dn->dn_oldphys), zerobuf,
+            DN_MAX_BONUSLEN) == 0 &&
+            DN_USED_BYTES(dn->dn_oldphys) == 0));
+        used_cbs[os->os_phys->os_type](&os->os, bonustype,
+            DN_BONUS(dn->dn_oldphys), DN_BONUS(dn->dn_phys),
+            DN_USED_BYTES(dn->dn_oldphys),
+            DN_USED_BYTES(dn->dn_phys), tx);
+
+        /*
+         * The mutex is needed here for interlock with dnode_allocate.
+         */
+        mutex_enter(&dn->dn_mtx);
+        zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
+        dn->dn_oldphys = NULL;
+        mutex_exit(&dn->dn_mtx);
+
+        list_remove(list, dn);
+        dnode_rele(dn, list);
+    }
+}
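
The registry is keyed by objset type, and the call site above pins down the callback signature: objset, bonus type, old and new bonus buffers, old and new bytes used, and the syncing tx. A hedged sketch of a registration (the callback name and body are hypothetical; the expected real registrant is the ZPL's space-delta callback for DMU_OST_ZFS):

    static void
    my_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
        void *oldbonus, void *newbonus,
        uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
    {
        /* Charge newused - oldused to the owning user and group. */
    }

    dmu_objset_register_type(DMU_OST_ZFS, my_space_delta_cb);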
+boolean_t
+dmu_objset_userspace_present(objset_t *os)
+{
+    return (os->os->os_phys->os_flags &
+        OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+}
+
+int
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+    uint64_t obj;
+    int err = 0;
+
+    if (dmu_objset_userspace_present(os))
+        return (0);
+    if (!dmu_objset_userused_enabled(os->os))
+        return (ENOTSUP);
+    if (dmu_objset_is_snapshot(os))
+        return (EINVAL);
+
+    /*
+     * We simply need to mark every object dirty, so that it will be
+     * synced out and now accounted. If this is called
+     * concurrently, or if we already did some work before crashing,
+     * that's fine, since we track each object's accounted state
+     * independently.
+     */
+
+    for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+        dmu_tx_t *tx = dmu_tx_create(os);
+        dmu_buf_t *db;
+        int objerr;
+
+        if (issig(JUSTLOOKING) && issig(FORREAL))
+            return (EINTR);
+
+        objerr = dmu_bonus_hold(os, obj, FTAG, &db);
+        if (objerr)
+            continue;
+        dmu_tx_hold_bonus(tx, obj);
+        objerr = dmu_tx_assign(tx, TXG_WAIT);
+        if (objerr) {
+            dmu_tx_abort(tx);
+            continue;
+        }
+        dmu_buf_will_dirty(db, tx);
+        dmu_buf_rele(db, FTAG);
+        dmu_tx_commit(tx);
+    }
+
+    os->os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+    txg_wait_synced(dmu_objset_pool(os), 0);
+    return (0);
+}
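
Query paths are expected to gate on the completion flag and, where appropriate, kick off the upgrade. A plausible guard (placeholder control flow, not from this diff):

    if (!dmu_objset_userspace_present(os)) {
        error = dmu_objset_userspace_upgrade(os);
        if (error != 0)
            return (error);    /* e.g. ENOTSUP on an older pool */
    }

The upgrade loop is restartable by design: each object's accounted state is tracked independently via DNODE_FLAG_USERUSED_ACCOUNTED, so a crash or a concurrent caller costs only repeated dirtying, never correctness.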
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
@@ -978,6 +1189,8 @@ dmu_objset_stats(objset_t *os, nvlist_t *nv)
     dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
         os->os->os_phys->os_type);
+    dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
+        dmu_objset_userspace_present(os));
 }

 int


@@ -161,7 +161,9 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
     if (issig(JUSTLOOKING) && issig(FORREAL))
         return (EINTR);

-    if (bp == NULL && zb->zb_object == 0) {
+    if (zb->zb_object != 0 && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
+        return (0);
+    } else if (bp == NULL && zb->zb_object == 0) {
         uint64_t span = BP_SPAN(dnp, zb->zb_level);
         uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
         err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
@@ -775,11 +777,6 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
     dmu_tx_t *tx;
     void *data = NULL;

-    err = dmu_object_info(os, drro->drr_object, NULL);
-    if (err != 0 && err != ENOENT)
-        return (EINVAL);
-
     if (drro->drr_type == DMU_OT_NONE ||
         drro->drr_type >= DMU_OT_NUMTYPES ||
         drro->drr_bonustype >= DMU_OT_NUMTYPES ||
@@ -792,18 +789,21 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
         return (EINVAL);
     }

+    err = dmu_object_info(os, drro->drr_object, NULL);
+    if (err != 0 && err != ENOENT)
+        return (EINVAL);
+
     if (drro->drr_bonuslen) {
         data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
         if (ra->err)
             return (ra->err);
     }

-    tx = dmu_tx_create(os);
-
     if (err == ENOENT) {
         /* currently free, want to be allocated */
+        tx = dmu_tx_create(os);
         dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+        dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
         err = dmu_tx_assign(tx, TXG_WAIT);
         if (err) {
             dmu_tx_abort(tx);
@@ -812,28 +812,22 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
         err = dmu_object_claim(os, drro->drr_object,
             drro->drr_type, drro->drr_blksz,
             drro->drr_bonustype, drro->drr_bonuslen, tx);
+        dmu_tx_commit(tx);
     } else {
         /* currently allocated, want to be allocated */
-        dmu_tx_hold_bonus(tx, drro->drr_object);
-        /*
-         * We may change blocksize and delete old content,
-         * so need to hold_write and hold_free.
-         */
-        dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
-        dmu_tx_hold_free(tx, drro->drr_object, 0, DMU_OBJECT_END);
-        err = dmu_tx_assign(tx, TXG_WAIT);
-        if (err) {
-            dmu_tx_abort(tx);
-            return (err);
-        }
-
         err = dmu_object_reclaim(os, drro->drr_object,
             drro->drr_type, drro->drr_blksz,
-            drro->drr_bonustype, drro->drr_bonuslen, tx);
+            drro->drr_bonustype, drro->drr_bonuslen);
     }
-    if (err) {
-        dmu_tx_commit(tx);
+    if (err)
         return (EINVAL);
-    }
+
+    tx = dmu_tx_create(os);
+    dmu_tx_hold_bonus(tx, drro->drr_object);
+    err = dmu_tx_assign(tx, TXG_WAIT);
+    if (err) {
+        dmu_tx_abort(tx);
+        return (err);
+    }

     dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);


@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -64,6 +64,9 @@ struct traverse_data {
     void *td_arg;
 };

+static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+    arc_buf_t *buf, uint64_t objset, uint64_t object);
+
 /* ARGSUSED */
 static void
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
@@ -189,7 +192,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
         }
     } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
         uint32_t flags = ARC_WAIT;
-        int i, j;
+        int i;
         int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

         err = arc_read(NULL, td->td_spa, bp, pbuf,
@@ -201,20 +204,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
         /* recursively visitbp() blocks below this */
         dnp = buf->b_data;
         for (i = 0; i < epb && err == 0; i++, dnp++) {
-            for (j = 0; j < dnp->dn_nblkptr; j++) {
-                SET_BOOKMARK(&czb, zb->zb_objset,
-                    zb->zb_blkid * epb + i,
-                    dnp->dn_nlevels - 1, j);
-                err = traverse_visitbp(td, dnp, buf,
-                    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
-                if (err)
-                    break;
-            }
+            err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+                zb->zb_blkid * epb + i);
+            if (err)
+                break;
         }
     } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
         uint32_t flags = ARC_WAIT;
         objset_phys_t *osp;
-        int j;
+        dnode_phys_t *dnp;

         err = arc_read_nolock(NULL, td->td_spa, bp,
             arc_getbuf_func, &buf,
@@ -223,20 +221,19 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
             return (err);

         osp = buf->b_data;
-        /*
-         * traverse_zil is just here for zdb's leak checking.
-         * For other consumers, there will be no ZIL blocks.
-         */
         traverse_zil(td, &osp->os_zil_header);

-        for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
-            SET_BOOKMARK(&czb, zb->zb_objset, 0,
-                osp->os_meta_dnode.dn_nlevels - 1, j);
-            err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
-                (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
-                &czb);
-            if (err)
-                break;
+        dnp = &osp->os_meta_dnode;
+        err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+        if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+            dnp = &osp->os_userused_dnode;
+            err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+                DMU_USERUSED_OBJECT);
+        }
+        if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+            dnp = &osp->os_groupused_dnode;
+            err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+                DMU_GROUPUSED_OBJECT);
         }
     }
@@ -249,6 +246,23 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
     return (err);
 }

+static int
+traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
+    arc_buf_t *buf, uint64_t objset, uint64_t object)
+{
+    int j, err = 0;
+    zbookmark_t czb;
+
+    for (j = 0; j < dnp->dn_nblkptr; j++) {
+        SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+        err = traverse_visitbp(td, dnp, buf,
+            (blkptr_t *)&dnp->dn_blkptr[j], &czb);
+        if (err)
+            break;
+    }
+
+    return (err);
+}
+
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,


@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -162,6 +162,41 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
     return (err);
 }

+static void
+dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
+    boolean_t freeable, dmu_buf_impl_t **history)
+{
+    int i = db->db_level + 1;
+    dnode_t *dn = db->db_dnode;
+
+    if (i >= dn->dn_nlevels)
+        return;
+
+    db = db->db_parent;
+    if (db == NULL) {
+        uint64_t lvls = dn->dn_nlevels - i;
+
+        txh->txh_space_towrite += lvls << dn->dn_indblkshift;
+        return;
+    }
+
+    if (db != history[i]) {
+        dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+        uint64_t space = 1ULL << dn->dn_indblkshift;
+
+        freeable = (db->db_blkptr && (freeable ||
+            dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
+        if (freeable)
+            txh->txh_space_tooverwrite += space;
+        else
+            txh->txh_space_towrite += space;
+        if (db->db_blkptr)
+            txh->txh_space_tounref += space;
+
+        history[i] = db;
+
+        dmu_tx_count_indirects(txh, db, freeable, history);
+    }
+}
+
 /* ARGSUSED */
 static void
 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
@@ -179,18 +214,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
     min_ibs = DN_MIN_INDBLKSHIFT;
     max_ibs = DN_MAX_INDBLKSHIFT;

-    /*
-     * For i/o error checking, read the first and last level-0
-     * blocks (if they are not aligned), and all the level-1 blocks.
-     */
     if (dn) {
+        dmu_buf_impl_t *last[DN_MAX_LEVELS];
+        int nlvls = dn->dn_nlevels;
+        int delta;
+
+        /*
+         * For i/o error checking, read the first and last level-0
+         * blocks (if they are not aligned), and all the level-1 blocks.
+         */
         if (dn->dn_maxblkid == 0) {
-            if ((off > 0 || len < dn->dn_datablksz) &&
-                off < dn->dn_datablksz) {
+            delta = dn->dn_datablksz;
+            start = (off < dn->dn_datablksz) ? 0 : 1;
+            end = (off+len <= dn->dn_datablksz) ? 0 : 1;
+            if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
                 if (err)
                     goto out;
+                delta -= off;
             }
         } else {
             zio_t *zio = zio_root(dn->dn_objset->os_spa,
@@ -215,10 +256,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
             }

             /* level-1 blocks */
-            if (dn->dn_nlevels > 1) {
-                start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-                end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-                for (i = start+1; i < end; i++) {
+            if (nlvls > 1) {
+                int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+                for (i = (start>>shft)+1; i < end>>shft; i++) {
                     err = dmu_tx_check_ioerr(zio, dn, 1, i);
                     if (err)
                         goto out;
@@ -228,20 +268,70 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
             err = zio_wait(zio);
             if (err)
                 goto out;
+            delta = P2NPHASE(off, dn->dn_datablksz);
         }
-    }

-    /*
-     * If there's more than one block, the blocksize can't change,
-     * so we can make a more precise estimate.  Alternatively,
-     * if the dnode's ibs is larger than max_ibs, always use that.
-     * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
-     * the code will still work correctly on existing pools.
-     */
-    if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) {
-        min_ibs = max_ibs = dn->dn_indblkshift;
-        if (dn->dn_datablkshift != 0)
-            min_bs = max_bs = dn->dn_datablkshift;
+        if (dn->dn_maxblkid > 0) {
+            /*
+             * The blocksize can't change,
+             * so we can make a more precise estimate.
+             */
+            ASSERT(dn->dn_datablkshift != 0);
+            min_bs = max_bs = dn->dn_datablkshift;
+            min_ibs = max_ibs = dn->dn_indblkshift;
+        } else if (dn->dn_indblkshift > max_ibs) {
+            /*
+             * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
+             * the code will still work correctly on older pools.
+             */
+            min_ibs = max_ibs = dn->dn_indblkshift;
+        }
+
+        /*
+         * If this write is not off the end of the file
+         * we need to account for overwrites/unref.
+         */
+        if (start <= dn->dn_maxblkid)
+            bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+
+        while (start <= dn->dn_maxblkid) {
+            spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
+            dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+            dmu_buf_impl_t *db;
+
+            rw_enter(&dn->dn_struct_rwlock, RW_READER);
+            db = dbuf_hold_level(dn, 0, start, FTAG);
+            rw_exit(&dn->dn_struct_rwlock);
+
+            if (db->db_blkptr && dsl_dataset_block_freeable(ds,
+                db->db_blkptr->blk_birth)) {
+                dprintf_bp(db->db_blkptr, "can free old%s", "");
+                txh->txh_space_tooverwrite += dn->dn_datablksz;
+                txh->txh_space_tounref += dn->dn_datablksz;
+                dmu_tx_count_indirects(txh, db, TRUE, last);
+            } else {
+                txh->txh_space_towrite += dn->dn_datablksz;
+                if (db->db_blkptr)
+                    txh->txh_space_tounref +=
+                        bp_get_dasize(spa, db->db_blkptr);
+                dmu_tx_count_indirects(txh, db, FALSE, last);
+            }
+            dbuf_rele(db, FTAG);
+
+            if (++start > end) {
+                /*
+                 * Account for new indirects appearing
+                 * before this IO gets assigned into a txg.
+                 */
+                bits = 64 - min_bs;
+                epbs = min_ibs - SPA_BLKPTRSHIFT;
+                for (bits -= epbs * (nlvls - 1);
+                    bits >= 0; bits -= epbs)
+                    txh->txh_fudge += 1ULL << max_ibs;
+                goto out;
+            }
+            off += delta;
+            if (len >= delta)
+                len -= delta;
+            delta = dn->dn_datablksz;
+        }
     }

     /*
@@ -264,20 +354,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
     for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
         start >>= epbs;
         end >>= epbs;
-        /*
-         * If we increase the number of levels of indirection,
-         * we'll need new blkid=0 indirect blocks.  If start == 0,
-         * we're already accounting for that blocks; and if end == 0,
-         * we can't increase the number of levels beyond that.
-         */
-        if (start != 0 && end != 0)
-            txh->txh_space_towrite += 1ULL << max_ibs;
+        ASSERT3U(end, >=, start);
         txh->txh_space_towrite += (end - start + 1) << max_ibs;
+        if (start != 0) {
+            /*
+             * We also need a new blkid=0 indirect block
+             * to reference any existing file data.
+             */
+            txh->txh_space_towrite += 1ULL << max_ibs;
+        }
     }

-    ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS);
-
 out:
+    if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+        2 * DMU_MAX_ACCESS)
+        err = EFBIG;
+
     if (err)
         txh->txh_tx->tx_err = err;
 }
@@ -294,6 +386,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh)
         dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
         dn->dn_dbuf->db_blkptr->blk_birth)) {
         txh->txh_space_tooverwrite += space;
+        txh->txh_space_tounref += space;
     } else {
         txh->txh_space_towrite += space;
         if (dn && dn->dn_dbuf->db_blkptr)
@@ -537,7 +630,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 }

 void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 {
     dmu_tx_hold_t *txh;
     dnode_t *dn;
@@ -586,9 +679,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
             txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
         } else {
             txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+            txh->txh_space_tounref +=
+                BP_GET_ASIZE(dn->dn_phys->dn_blkptr);
         }
-        if (dn->dn_phys->dn_blkptr[0].blk_birth)
-            txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
         return;
     }
@@ -605,12 +698,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
         }
     }

-    /*
-     * 3 blocks overwritten: target leaf, ptrtbl block, header block
-     * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
-     */
-    dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz,
-        (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
+    err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+        &txh->txh_space_towrite, &txh->txh_space_tooverwrite,
+        txh->txh_dnode->dn_datablkshift);

     /*
      * If the modified blocks are scattered to the four winds,
@@ -618,7 +708,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
      */
     epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
     for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
-        txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+        if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
+            txh->txh_space_towrite += 3 << dn->dn_indblkshift;
+        else
+            txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 }

 void


@@ -156,7 +156,7 @@ dnode_verify(dnode_t *dn)
     }
     if (dn->dn_phys->dn_type != DMU_OT_NONE)
         ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
-    ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
+    ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
     if (dn->dn_dbuf != NULL) {
         ASSERT3P(dn->dn_phys, ==,
             (dnode_phys_t *)dn->dn_dbuf->db.db_data +
@@ -321,6 +321,7 @@ dnode_destroy(dnode_t *dn)
     }
     ASSERT(NULL == list_head(&dn->dn_dbufs));
 #endif
+    ASSERT(dn->dn_oldphys == NULL);

     mutex_enter(&os->os_lock);
     list_remove(&os->os_dnodes, dn);
@@ -417,8 +418,7 @@ void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
-    int i, nblkptr;
-    dmu_buf_impl_t *db = NULL;
+    int nblkptr;

     ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
     ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
@@ -430,42 +430,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
     ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);

-    for (i = 0; i < TXG_SIZE; i++)
-        ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-
     /* clean up any unreferenced dbufs */
     dnode_evict_dbufs(dn);
-    ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
-    /*
-     * XXX I should really have a generation number to tell if we
-     * need to do this...
-     */
-    if (blocksize != dn->dn_datablksz ||
-        dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) {
-        /* free all old data */
-        dnode_free_range(dn, 0, -1ULL, tx);
-    }
-
-    nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
-
-    /* change blocksize */
     rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-    if (blocksize != dn->dn_datablksz &&
-        (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
-        list_head(&dn->dn_dbufs) != NULL)) {
-        db = dbuf_hold(dn, 0, FTAG);
-        dbuf_new_size(db, blocksize, tx);
-    }
-    dnode_setdblksz(dn, blocksize);
     dnode_setdirty(dn, tx);
-    dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
-    dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+    if (dn->dn_datablksz != blocksize) {
+        /* change blocksize */
+        ASSERT(dn->dn_maxblkid == 0 &&
+            (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+            dnode_block_freed(dn, 0)));
+        dnode_setdblksz(dn, blocksize);
+        dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+    }
+    if (dn->dn_bonuslen != bonuslen)
+        dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+    nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
     if (dn->dn_nblkptr != nblkptr)
         dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
     rw_exit(&dn->dn_struct_rwlock);
-    if (db)
-        dbuf_rele(db, FTAG);

     /* change type */
     dn->dn_type = ot;
@@ -569,6 +552,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
      */
     ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);

+    if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
+        dn = (object == DMU_USERUSED_OBJECT) ?
+            os->os_userused_dnode : os->os_groupused_dnode;
+        if (dn == NULL)
+            return (ENOENT);
+        type = dn->dn_type;
+        if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+            return (ENOENT);
+        if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+            return (EEXIST);
+        DNODE_VERIFY(dn);
+        (void) refcount_add(&dn->dn_holds, tag);
+        *dnp = dn;
+        return (0);
+    }
+
     if (object == 0 || object >= DN_MAX_OBJECT)
         return (EINVAL);
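
Because DMU_USERUSED_OBJECT and DMU_GROUPUSED_OBJECT now resolve through dnode_hold_impl(), the ordinary ZAP accessors work on the accounting objects once they exist. A hedged sketch of a per-user lookup (the hex-formatted rid key is an assumption for illustration, not established by this diff):

    uint64_t used;
    char buf[32];

    (void) snprintf(buf, sizeof (buf), "%llx", (u_longlong_t)rid);
    err = zap_lookup(os, DMU_USERUSED_OBJECT, buf, 8, 1, &used);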
@@ -627,7 +626,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
     type = dn->dn_type;
     if (dn->dn_free_txg ||
         ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
-        ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
+        ((flag & DNODE_MUST_BE_FREE) &&
+        (type != DMU_OT_NONE || dn->dn_oldphys))) {
         mutex_exit(&dn->dn_mtx);
         dbuf_rele(db, FTAG);
         return (type == DMU_OT_NONE ? ENOENT : EEXIST);
@@ -692,8 +692,10 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
     objset_impl_t *os = dn->dn_objset;
     uint64_t txg = tx->tx_txg;

-    if (dn->dn_object == DMU_META_DNODE_OBJECT)
+    if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+        dsl_dataset_dirty(os->os_dsl_dataset, tx);
         return;
+    }

     DNODE_VERIFY(dn);
@@ -1189,11 +1191,6 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
     if (dn->dn_free_txg)
         return (TRUE);

-    /*
-     * If dn_datablkshift is not set, then there's only a single
-     * block, in which case there will never be a free range so it
-     * won't matter.
-     */
     range_tofind.fr_blkid = blkid;
     mutex_enter(&dn->dn_mtx);
     for (i = 0; i < TXG_SIZE; i++) {
@@ -1278,7 +1275,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
     dprintf("probing object %llu offset %llx level %d of %u\n",
         dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);

-    hole = flags & DNODE_FIND_HOLE;
+    hole = ((flags & DNODE_FIND_HOLE) != 0);
     inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
     ASSERT(txg == 0 || !hole);
@@ -1325,16 +1322,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
         for (i = (*offset >> span) & (blkfill - 1);
             i >= 0 && i < blkfill; i += inc) {
-            boolean_t newcontents = B_TRUE;
-            if (txg) {
-                int j;
-                newcontents = B_FALSE;
-                for (j = 0; j < dnp[i].dn_nblkptr; j++) {
-                    if (dnp[i].dn_blkptr[j].blk_birth > txg)
-                        newcontents = B_TRUE;
-                }
-            }
-            if (!dnp[i].dn_type == hole && newcontents)
+            if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
                 break;
             *offset += (1ULL << span) * inc;
         }


@@ -504,9 +504,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)

 /*
  * Write out the dnode's dirty buffers.
- *
- * NOTE: The dnode is kept in memory by being dirty.  Once the
- * dirty bit is cleared, it may be evicted.  Beware of this!
  */
 void
 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
@@ -515,20 +512,33 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
     dnode_phys_t *dnp = dn->dn_phys;
     int txgoff = tx->tx_txg & TXG_MASK;
     list_t *list = &dn->dn_dirty_records[txgoff];
+    static const dnode_phys_t zerodn = { 0 };

     ASSERT(dmu_tx_is_syncing(tx));
     ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+    ASSERT(dnp->dn_type != DMU_OT_NONE ||
+        bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
     DNODE_VERIFY(dn);

     ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));

+    if (dmu_objset_userused_enabled(dn->dn_objset) &&
+        !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+        ASSERT(dn->dn_oldphys == NULL);
+        dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
+        *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+        dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+    } else {
+        /* Once we account for it, we should always account for it. */
+        ASSERT(!(dn->dn_phys->dn_flags &
+            DNODE_FLAG_USERUSED_ACCOUNTED));
+    }
+
     mutex_enter(&dn->dn_mtx);
     if (dn->dn_allocated_txg == tx->tx_txg) {
         /* The dnode is newly allocated or reallocated */
         if (dnp->dn_type == DMU_OT_NONE) {
             /* this is a first alloc, not a realloc */
-            /* XXX shouldn't the phys already be zeroed? */
-            bzero(dnp, DNODE_CORE_SIZE);
             dnp->dn_nlevels = 1;
             dnp->dn_nblkptr = dn->dn_nblkptr;
         }
@@ -626,7 +636,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)

     dbuf_sync_list(list, tx);

-    if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+    if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
         ASSERT3P(list_head(list), ==, NULL);
         dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
     }


@@ -229,7 +229,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
     return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
 }

-int
+boolean_t
 dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
 {
     return (blk_birth > dsl_dataset_prev_snap_txg(ds));
@@ -548,6 +548,7 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner,
         return (err);
     if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) {
         dsl_dataset_rele(*dsp, owner);
+        *dsp = NULL;
         return (EBUSY);
     }
     return (0);
@@ -974,6 +975,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag)
         (void) dmu_free_object(os, obj);
     }

+    /*
+     * We need to sync out all in-flight IO before we try to evict
+     * (the dataset evict func is trying to clear the cached entries
+     * for this dataset in the ARC).
+     */
+    txg_wait_synced(dd->dd_pool, 0);
+
+    /*
+     * If we managed to free all the objects in open
+     * context, the user space accounting should be zero.
+     */
+    if (ds->ds_phys->ds_bp.blk_fill == 0 &&
+        dmu_objset_userused_enabled(os->os)) {
+        uint64_t count;
+
+        ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
+            count == 0);
+        ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
+            count == 0);
+    }
+
     dmu_objset_close(os);
     if (err != ESRCH)
         goto out;
@@ -1058,7 +1080,6 @@ dsl_dataset_get_user_ptr(dsl_dataset_t *ds)
     return (ds->ds_user_ptr);
 }

-
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
@@ -1164,8 +1185,18 @@ kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
     if (bp == NULL)
         return (0);

-    ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
-    (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+    if ((zb->zb_level == -1ULL && zb->zb_blkid != 0) ||
+        (zb->zb_object != 0 && dnp == NULL)) {
+        /*
+         * It's a block in the intent log.  It has no
+         * accounting, so just free it.
+         */
+        VERIFY3U(0, ==, dsl_free(ka->zio, ka->tx->tx_pool,
+            ka->tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT));
+    } else {
+        ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+        (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
+    }

     return (0);
 }
@@ -1209,13 +1240,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)

     dmu_buf_will_dirty(ds->ds_dbuf, tx);

-    /*
-     * Before the roll back destroy the zil.
-     */
     if (ds->ds_user_ptr != NULL) {
-        zil_rollback_destroy(
-            ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx);
-
         /*
          * We need to make sure that the objset_impl_t is reopened after
          * we do the rollback, otherwise it will have the wrong
@@ -1248,7 +1273,10 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
         ds->ds_phys->ds_deadlist_obj));

     {
-        /* Free blkptrs that we gave birth to */
+        /*
+         * Free blkptrs that we gave birth to - this covers
+         * claimed but not played log blocks too.
+         */
         zio_t *zio;
         struct killarg ka;
@@ -1262,8 +1290,7 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
         (void) zio_wait(zio);
     }

-    ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) ||
-        ds->ds_phys->ds_unique_bytes == 0);
+    ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);

     if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
         /* Change our contents to that of the prev snapshot */
@@ -1481,7 +1508,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
         dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
         if (after_branch_point &&
             ds_prev->ds_phys->ds_next_clones_obj != 0) {
-            VERIFY(0 == zap_remove_int(mos,
+            VERIFY3U(0, ==, zap_remove_int(mos,
                 ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
             if (ds->ds_phys->ds_next_snap_obj != 0) {
                 VERIFY(0 == zap_add_int(mos,
@@ -1654,7 +1681,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
         err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
             TRAVERSE_POST, kill_blkptr, &ka);
         ASSERT3U(err, ==, 0);
-        ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE ||
+        ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
             ds->ds_phys->ds_unique_bytes == 0);
     }
@@ -2583,7 +2610,7 @@ snaplist_destroy(list_t *l, boolean_t own)
 {
     struct promotenode *snap;

-    if (!list_link_active(&l->list_head))
+    if (!l || !list_link_active(&l->list_head))
         return;

     while ((snap = list_tail(l)) != NULL) {


@ -227,24 +227,11 @@ dsl_dir_namelen(dsl_dir_t *dd)
return (result); return (result);
} }
int
dsl_dir_is_private(dsl_dir_t *dd)
{
int rv = FALSE;
if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent))
rv = TRUE;
if (dataset_name_hidden(dd->dd_myname))
rv = TRUE;
return (rv);
}
static int static int
getcomponent(const char *path, char *component, const char **nextp) getcomponent(const char *path, char *component, const char **nextp)
{ {
char *p; char *p;
if (path == NULL) if ((path == NULL) || (path[0] == '\0'))
return (ENOENT); return (ENOENT);
/* This would be a good place to reserve some namespace... */ /* This would be a good place to reserve some namespace... */
p = strpbrk(path, "/@"); p = strpbrk(path, "/@");


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -90,6 +90,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
1, 4, 0);
return (dp); return (dp);
} }
@ -129,14 +132,15 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
goto out; goto out;
err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
FTAG, &ds); FTAG, &ds);
if (err) if (err == 0) {
goto out; err = dsl_dataset_hold_obj(dp,
err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, ds->ds_phys->ds_prev_snap_obj, dp,
dp, &dp->dp_origin_snap); &dp->dp_origin_snap);
if (err) dsl_dataset_rele(ds, FTAG);
goto out; }
dsl_dataset_rele(ds, FTAG);
dsl_dir_close(dd, dp); dsl_dir_close(dd, dp);
if (err)
goto out;
} }
/* get scrub status */ /* get scrub status */
@ -226,6 +230,7 @@ dsl_pool_close(dsl_pool_t *dp)
rw_destroy(&dp->dp_config_rwlock); rw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock); mutex_destroy(&dp->dp_lock);
mutex_destroy(&dp->dp_scrub_cancel_lock); mutex_destroy(&dp->dp_scrub_cancel_lock);
taskq_destroy(dp->dp_vnrele_taskq);
if (dp->dp_blkstats) if (dp->dp_blkstats)
kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
kmem_free(dp, sizeof (dsl_pool_t)); kmem_free(dp, sizeof (dsl_pool_t));
@ -296,24 +301,52 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
tx = dmu_tx_create_assigned(dp, txg); tx = dmu_tx_create_assigned(dp, txg);
dp->dp_read_overhead = 0; dp->dp_read_overhead = 0;
start = gethrtime();
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
if (!list_link_active(&ds->ds_synced_link)) /*
list_insert_tail(&dp->dp_synced_datasets, ds); * We must not sync any non-MOS datasets twice, because
else * we may have taken a snapshot of them. However, we
dmu_buf_rele(ds->ds_dbuf, ds); * may sync newly-created datasets on pass 2.
*/
ASSERT(!list_link_active(&ds->ds_synced_link));
list_insert_tail(&dp->dp_synced_datasets, ds);
dsl_dataset_sync(ds, zio, tx); dsl_dataset_sync(ds, zio, tx);
} }
DTRACE_PROBE(pool_sync__1setup); DTRACE_PROBE(pool_sync__1setup);
start = gethrtime();
err = zio_wait(zio); err = zio_wait(zio);
write_time = gethrtime() - start; write_time = gethrtime() - start;
ASSERT(err == 0); ASSERT(err == 0);
DTRACE_PROBE(pool_sync__2rootzio); DTRACE_PROBE(pool_sync__2rootzio);
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) for (ds = list_head(&dp->dp_synced_datasets); ds;
ds = list_next(&dp->dp_synced_datasets, ds))
dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
/*
* Sync the datasets again to push out the changes due to
* userquota updates. This must be done before we process the
* sync tasks, because that could cause a snapshot of a dataset
* whose ds_bp will be rewritten when we do this 2nd sync.
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
ASSERT(list_link_active(&ds->ds_synced_link));
dmu_buf_rele(ds->ds_dbuf, ds);
dsl_dataset_sync(ds, zio, tx);
}
err = zio_wait(zio);
while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
/*
* No more sync tasks should have been added while we
* were syncing.
*/
ASSERT(spa_sync_pass(dp->dp_spa) == 1);
dsl_sync_task_group_sync(dstg, tx); dsl_sync_task_group_sync(dstg, tx);
}
DTRACE_PROBE(pool_sync__3task); DTRACE_PROBE(pool_sync__3task);
start = gethrtime(); start = gethrtime();
@ -611,3 +644,9 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
rw_exit(&dp->dp_config_rwlock); rw_exit(&dp->dp_config_rwlock);
} }
taskq_t *
dsl_pool_vnrele_taskq(dsl_pool_t *dp)
{
return (dp->dp_vnrele_taskq);
}
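A sketch of the intended consumer (the wrapper below is hypothetical; the ZPL's asynchronous vnode release follows this shape): the final vn_rele() is handed to the pool's taskq so a thread holding ZFS locks never blocks in VOP_INACTIVE directly.

static void
vn_rele_deferred(dsl_pool_t *dp, vnode_t *vp)
{
        VERIFY(taskq_dispatch(dsl_pool_vnrele_taskq(dp),
            (task_func_t *)vn_rele, vp, TQ_SLEEP) != 0);
}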


@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/dmu_objset.h> #include <sys/dmu_objset.h>
#include <sys/dmu_tx.h> #include <sys/dmu_tx.h>
@ -415,6 +413,34 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
"%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object);
} }
void
dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
dsl_dataset_t *ds = arg1;
nvlist_t *nvl = arg2;
nvpair_t *elem = NULL;
while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
struct prop_set_arg psa;
psa.name = nvpair_name(elem);
if (nvpair_type(elem) == DATA_TYPE_STRING) {
VERIFY(nvpair_value_string(elem,
(char **)&psa.buf) == 0);
psa.intsz = 1;
psa.numints = strlen(psa.buf) + 1;
} else {
uint64_t intval;
VERIFY(nvpair_value_uint64(elem, &intval) == 0);
psa.intsz = sizeof (intval);
psa.numints = 1;
psa.buf = &intval;
}
dsl_prop_set_sync(ds, &psa, cr, tx);
}
}
void void
dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
cred_t *cr, dmu_tx_t *tx) cred_t *cr, dmu_tx_t *tx)
@ -438,6 +464,7 @@ dsl_prop_set(const char *dsname, const char *propname,
int intsz, int numints, const void *buf) int intsz, int numints, const void *buf)
{ {
dsl_dataset_t *ds; dsl_dataset_t *ds;
uint64_t version;
int err; int err;
struct prop_set_arg psa; struct prop_set_arg psa;
@ -447,15 +474,19 @@ dsl_prop_set(const char *dsname, const char *propname,
*/ */
if (strlen(propname) >= ZAP_MAXNAMELEN) if (strlen(propname) >= ZAP_MAXNAMELEN)
return (ENAMETOOLONG); return (ENAMETOOLONG);
if (intsz * numints >= ZAP_MAXVALUELEN)
return (E2BIG);
err = dsl_dataset_hold(dsname, FTAG, &ds); err = dsl_dataset_hold(dsname, FTAG, &ds);
if (err) if (err)
return (err); return (err);
version = spa_version(ds->ds_dir->dd_pool->dp_spa);
if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ?
ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
dsl_dataset_rele(ds, FTAG);
return (E2BIG);
}
if (dsl_dataset_is_snapshot(ds) && if (dsl_dataset_is_snapshot(ds) &&
spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
return (ENOTSUP); return (ENOTSUP);
} }
@ -471,6 +502,50 @@ dsl_prop_set(const char *dsname, const char *propname,
return (err); return (err);
} }
int
dsl_props_set(const char *dsname, nvlist_t *nvl)
{
dsl_dataset_t *ds;
uint64_t version;
nvpair_t *elem = NULL;
int err;
if (err = dsl_dataset_hold(dsname, FTAG, &ds))
return (err);
/*
* Do these checks before the syncfunc, since the syncfunc cannot fail.
*/
version = spa_version(ds->ds_dir->dd_pool->dp_spa);
while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
dsl_dataset_rele(ds, FTAG);
return (ENAMETOOLONG);
}
if (nvpair_type(elem) == DATA_TYPE_STRING) {
char *valstr;
VERIFY(nvpair_value_string(elem, &valstr) == 0);
if (strlen(valstr) >= (version <
SPA_VERSION_STMF_PROP ?
ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
dsl_dataset_rele(ds, FTAG);
return (E2BIG);
}
}
}
if (dsl_dataset_is_snapshot(ds) &&
version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (ENOTSUP);
}
err = dsl_sync_task_do(ds->ds_dir->dd_pool,
NULL, dsl_props_set_sync, ds, nvl, 2);
dsl_dataset_rele(ds, FTAG);
return (err);
}
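A sketch of a caller (the function and property choices are hypothetical): batching several properties into one nvlist costs a single sync task, where repeated dsl_prop_set() calls would each wait out a txg.

static int
set_props_batched(const char *dsname)
{
        nvlist_t *nvl;
        int err;

        VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(nvl, "atime", 0) == 0);
        VERIFY(nvlist_add_string(nvl, "com.example:owner", "backup") == 0);
        err = dsl_props_set(dsname, nvl);
        nvlist_free(nvl);
        return (err);
}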
/* /*
* Iterate over all properties for this dataset and return them in an nvlist. * Iterate over all properties for this dataset and return them in an nvlist.
*/ */


@ -45,6 +45,8 @@ typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
static scrub_cb_t dsl_pool_scrub_clean_cb; static scrub_cb_t dsl_pool_scrub_clean_cb;
static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
uint64_t objset, uint64_t object);
int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
@ -348,6 +350,12 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (bp->blk_birth <= dp->dp_scrub_min_txg) if (bp->blk_birth <= dp->dp_scrub_min_txg)
return; return;
/*
* One block ("stubby") may have been allocated a long time ago; we
* want to visit it because it has been allocated (on-disk) even if
* it hasn't been claimed (even though for a plain scrub there's
* nothing to do to it).
*/
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
return; return;
@ -373,6 +381,11 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
if (bp->blk_birth <= dp->dp_scrub_min_txg) if (bp->blk_birth <= dp->dp_scrub_min_txg)
return; return;
/*
* birth can be < claim_txg if this record's txg has already
* been synced (but this log block contains other records
* that have not been synced yet)
*/
if (claim_txg == 0 || bp->blk_birth < claim_txg) if (claim_txg == 0 || bp->blk_birth < claim_txg)
return; return;
@ -472,7 +485,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT; uint32_t flags = ARC_WAIT;
dnode_phys_t *child_dnp; dnode_phys_t *child_dnp;
int i, j; int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
err = arc_read(NULL, dp->dp_spa, bp, pbuf, err = arc_read(NULL, dp->dp_spa, bp, pbuf,
@ -487,20 +500,12 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
child_dnp = buf->b_data; child_dnp = buf->b_data;
for (i = 0; i < epb; i++, child_dnp++) { for (i = 0; i < epb; i++, child_dnp++) {
for (j = 0; j < child_dnp->dn_nblkptr; j++) { scrub_visitdnode(dp, child_dnp, buf, zb->zb_objset,
zbookmark_t czb; zb->zb_blkid * epb + i);
SET_BOOKMARK(&czb, zb->zb_objset,
zb->zb_blkid * epb + i,
child_dnp->dn_nlevels - 1, j);
scrub_visitbp(dp, child_dnp, buf,
&child_dnp->dn_blkptr[j], &czb);
}
} }
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
uint32_t flags = ARC_WAIT; uint32_t flags = ARC_WAIT;
objset_phys_t *osp; objset_phys_t *osp;
int j;
err = arc_read_nolock(NULL, dp->dp_spa, bp, err = arc_read_nolock(NULL, dp->dp_spa, bp,
arc_getbuf_func, &buf, arc_getbuf_func, &buf,
@ -516,13 +521,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
traverse_zil(dp, &osp->os_zil_header); traverse_zil(dp, &osp->os_zil_header);
for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { scrub_visitdnode(dp, &osp->os_meta_dnode,
zbookmark_t czb; buf, zb->zb_objset, 0);
if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
SET_BOOKMARK(&czb, zb->zb_objset, 0, scrub_visitdnode(dp, &osp->os_userused_dnode,
osp->os_meta_dnode.dn_nlevels - 1, j); buf, zb->zb_objset, 0);
scrub_visitbp(dp, &osp->os_meta_dnode, buf, scrub_visitdnode(dp, &osp->os_groupused_dnode,
&osp->os_meta_dnode.dn_blkptr[j], &czb); buf, zb->zb_objset, 0);
} }
} }
@ -531,6 +536,21 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
(void) arc_buf_remove_ref(buf, &buf); (void) arc_buf_remove_ref(buf, &buf);
} }
static void
scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
uint64_t objset, uint64_t object)
{
int j;
for (j = 0; j < dnp->dn_nblkptr; j++) {
zbookmark_t czb;
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
}
}
static void static void
scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
{ {


@ -19,11 +19,111 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI" /*
* Fletcher Checksums
* ------------------
*
* ZFS's 2nd and 4th order Fletcher checksums are defined by the following
* recurrence relations:
*
* a = a + f
* i i-1 i-1
*
* b = b + a
* i i-1 i
*
* c = c + b (fletcher-4 only)
* i i-1 i
*
* d = d + c (fletcher-4 only)
* i i-1 i
*
* Where
* a_0 = b_0 = c_0 = d_0 = 0
* and
* f_0 .. f_(n-1) are the input data.
*
* Using standard techniques, these translate into the following series:
*
* __n_ __n_
* \ | \ |
* a = > f b = > i * f
* n /___| n - i n /___| n - i
* i = 1 i = 1
*
*
* __n_ __n_
* \ | i*(i+1) \ | i*(i+1)*(i+2)
* c = > ------- f d = > ------------- f
* n /___| 2 n - i n /___| 6 n - i
* i = 1 i = 1
*
* For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
* Since the additions are done mod (2^64), errors in the high bits may not
* be noticed. For this reason, fletcher-2 is deprecated.
*
* For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
* A conservative bound on how big the buffer can get before we
* overflow can be computed using f_i = 0xffffffff for all i:
*
* % bc
* f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
* 2264
* quit
* %
*
* So blocks of up to 2k will not overflow. Our largest block size is
* 128k, which has 32k 4-byte words, so we can compute the largest possible
* accumulators, then divide by 2^64 to figure the max amount of overflow:
*
* % bc
* a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
* a/2^64;b/2^64;c/2^64;d/2^64
* 0
* 0
* 1365
* 11186858
* quit
* %
*
* So a and b cannot overflow. To make sure each bit of input has some
* effect on the contents of c and d, we can look at what the factors of
* the coefficients in the equations for c_n and d_n are. The number of 2s
* in the factors determines the lowest set bit in the multiplier. Running
* through the cases for n*(n+1)/2 reveals that the highest power of 2 is
* 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
* the 64-bit accumulators, every bit of every f_i affects every accumulator,
* even for 128k blocks.
*
* If we wanted to make a stronger version of fletcher4 (fletcher4c?),
* we could do our calculations mod (2^32 - 1) by adding in the carries
* periodically, and store the number of carries in the top 32-bits.
*
* --------------------
* Checksum Performance
* --------------------
*
* There are two interesting components to checksum performance: cached and
* uncached performance. With cached data, fletcher-2 is about four times
* faster than fletcher-4. With uncached data, the performance difference is
* negligible, since the cost of a cache fill dominates the processing time.
* Even though fletcher-4 is slower than fletcher-2, it is still a pretty
* efficient pass over the data.
*
* In normal operation, the data which is being checksummed is in a buffer
* which has been filled either by:
*
* 1. a compression step, which will be mostly cached, or
* 2. a bcopy() or copyin(), which will be uncached (because the
* copy is cache-bypassing).
*
* For both cached and uncached data, both fletcher checksums are much faster
* than sha-256, and slower than 'off', which doesn't touch the data at all.
*/
#include <sys/types.h> #include <sys/types.h>
#include <sys/sysmacros.h> #include <sys/sysmacros.h>
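A user-space sketch of the recurrences and of the first bc computation above. This is not the in-tree implementation; it assumes a buffer of whole native-endian 32-bit words and the gcc/clang __int128 extension.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static void
fletcher_4(const void *buf, size_t size, uint64_t cksum[4])
{
        const uint32_t *ip = buf;
        const uint32_t *ipend = ip + (size / sizeof (uint32_t));
        uint64_t a = 0, b = 0, c = 0, d = 0;

        for (; ip < ipend; ip++) {
                a += *ip;       /* a_i = a_(i-1) + f_(i-1) */
                b += a;         /* b_i = b_(i-1) + a_i */
                c += b;         /* c_i = c_(i-1) + b_i */
                d += c;         /* d_i = d_(i-1) + c_i */
        }
        cksum[0] = a; cksum[1] = b; cksum[2] = c; cksum[3] = d;
}

int
main(void)
{
        uint32_t data[4] = { 1, 2, 3, 4 };
        uint64_t ck[4];
        unsigned __int128 d = 0, lim = (unsigned __int128)1 << 64;
        uint64_t f = 0xffffffffULL, i;

        fletcher_4(data, sizeof (data), ck);
        (void) printf("%llu %llu %llu %llu\n",  /* prints: 10 20 35 56 */
            (unsigned long long)ck[0], (unsigned long long)ck[1],
            (unsigned long long)ck[2], (unsigned long long)ck[3]);

        /*
         * Reproduce the first bc run above: the smallest all-ones
         * buffer that overflows the d accumulator.  Prints 2264.
         */
        for (i = 1; d < lim; i++)
                d += (unsigned __int128)f * i * (i + 1) * (i + 2) / 6;
        (void) printf("%llu\n", (unsigned long long)((i - 1) * 4));
        return (0);
}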


@ -85,6 +85,8 @@ void *arc_data_buf_alloc(uint64_t space);
void arc_data_buf_free(void *buf, uint64_t space); void arc_data_buf_free(void *buf, uint64_t space);
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
arc_buf_contents_t type); arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_buf_add_ref(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag);
int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf); int arc_buf_size(arc_buf_t *buf);
@ -134,7 +136,7 @@ void arc_fini(void);
* Level 2 ARC * Level 2 ARC
*/ */
void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd);
void l2arc_init(void); void l2arc_init(void);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -264,6 +264,7 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_clear(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db);

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -60,6 +60,7 @@ struct zbookmark;
struct spa; struct spa;
struct nvlist; struct nvlist;
struct objset_impl; struct objset_impl;
struct arc_buf;
typedef struct objset objset_t; typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t; typedef struct dmu_tx dmu_tx_t;
@ -114,6 +115,8 @@ typedef enum dmu_object_type {
DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */
DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_NEXT_CLONES, /* ZAP */
DMU_OT_SCRUB_QUEUE, /* ZAP */ DMU_OT_SCRUB_QUEUE, /* ZAP */
DMU_OT_USERGROUP_USED, /* ZAP */
DMU_OT_USERGROUP_QUOTA, /* ZAP */
DMU_OT_NUMTYPES DMU_OT_NUMTYPES
} dmu_object_type_t; } dmu_object_type_t;
@ -156,6 +159,9 @@ void zfs_znode_byteswap(void *buf, size_t size);
#define DMU_MAX_ACCESS (10<<20) /* 10MB */ #define DMU_MAX_ACCESS (10<<20) /* 10MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
/* /*
* Public routines to create, destroy, open, and close objsets. * Public routines to create, destroy, open, and close objsets.
*/ */
@ -171,7 +177,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type,
int dmu_objset_destroy(const char *name); int dmu_objset_destroy(const char *name);
int dmu_snapshots_destroy(char *fsname, char *snapname); int dmu_snapshots_destroy(char *fsname, char *snapname);
int dmu_objset_rollback(objset_t *os); int dmu_objset_rollback(objset_t *os);
int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
boolean_t recursive);
int dmu_objset_rename(const char *name, const char *newname, int dmu_objset_rename(const char *name, const char *newname,
boolean_t recursive); boolean_t recursive);
int dmu_objset_find(char *name, int func(char *, void *), void *arg, int dmu_objset_find(char *name, int func(char *, void *), void *arg,
@ -235,7 +242,7 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int blocksize, dmu_object_type_t bonustype, int bonuslen);
/* /*
* Free an object from this objset. * Free an object from this objset.
@ -397,6 +404,11 @@ void *dmu_buf_get_user(dmu_buf_t *db);
*/ */
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
/*
* Returns whether the given dbuf is freeable.
*/
boolean_t dmu_buf_freeable(dmu_buf_t *);
/* /*
* You must create a transaction, then hold the objects which you will * You must create a transaction, then hold the objects which you will
* (or might) modify as part of this transaction. Then you must assign * (or might) modify as part of this transaction. Then you must assign
@ -422,7 +434,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len); uint64_t len);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
void dmu_tx_abort(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
@ -465,8 +477,10 @@ int dmu_free_object(objset_t *os, uint64_t object);
* Canfail routines will return 0 on success, or an errno if there is a * Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error. * nonrecoverable I/O error.
*/ */
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf); void *buf, uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx); const void *buf, dmu_tx_t *tx);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
@ -476,6 +490,10 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx); dmu_tx_t *tx);
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx); uint64_t size, struct page *pp, dmu_tx_t *tx);
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
extern int zfs_prefetch_disable; extern int zfs_prefetch_disable;
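A sketch of the loaned-buffer write path these three routines enable (the wrapper is hypothetical and assumes <sys/arc.h> is included for the arc_buf definition; the ZPL write path uses this shape so data lands in the ARC buffer once instead of being copied again inside dmu_write()):

static void
dmu_write_loaned(dmu_buf_t *db, uint64_t off, int blksz,
    const void *src, dmu_tx_t *tx)
{
        arc_buf_t *abuf = dmu_request_arcbuf(db, blksz);

        /* Fill the loaned buffer directly (zfs_write uiomoves here). */
        bcopy(src, abuf->b_data, blksz);
        dmu_assign_arcbuf(db, off, abuf, tx);   /* consumes abuf */
        /* Had the tx failed, we would dmu_return_arcbuf(abuf) instead. */
}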
@ -582,6 +600,12 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
int maxlen, boolean_t *conflict); int maxlen, boolean_t *conflict);
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
uint64_t *idp, uint64_t *offp); uint64_t *idp, uint64_t *offp);
typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
dmu_tx_t *tx);
extern void dmu_objset_register_type(dmu_objset_type_t ost,
objset_used_cb_t *cb);
extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
extern void *dmu_objset_get_user(objset_t *os); extern void *dmu_objset_get_user(objset_t *os);
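A sketch of how an objset consumer registers its accounting hook (the callback body is hypothetical; the ZPL registers one of this shape for DMU_OST_ZFS at module init):

static void
my_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
    void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
    dmu_tx_t *tx)
{
        /* charge newused - oldused to the owning user and group here */
}

void
my_fs_module_init(void)
{
        dmu_objset_register_type(DMU_OST_ZFS, my_space_delta_cb);
}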


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -42,12 +42,20 @@ struct dsl_dataset;
struct dmu_tx; struct dmu_tx;
struct objset_impl; struct objset_impl;
#define OBJSET_PHYS_SIZE 2048
#define OBJSET_OLD_PHYS_SIZE 1024
#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0)
typedef struct objset_phys { typedef struct objset_phys {
dnode_phys_t os_meta_dnode; dnode_phys_t os_meta_dnode;
zil_header_t os_zil_header; zil_header_t os_zil_header;
uint64_t os_type; uint64_t os_type;
char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - uint64_t os_flags;
sizeof (uint64_t)]; char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
sizeof (zil_header_t) - sizeof (uint64_t)*2];
dnode_phys_t os_userused_dnode;
dnode_phys_t os_groupused_dnode;
} objset_phys_t; } objset_phys_t;
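A stand-alone arithmetic check of the new layout. The on-disk sizes sizeof (dnode_phys_t) == 512 and sizeof (zil_header_t) == 192 are assumptions restated from the on-disk format, not taken from this diff; given them, the pad shrinks so the two trailing dnodes fit and the structure grows from 1024 to exactly 2048 bytes.

#include <stdio.h>
#include <assert.h>

int
main(void)
{
        const int dnp = 512, zil = 192, u64 = 8;        /* assumed sizes */
        const int new_size = 2048, old_size = 1024;
        int old_pad = old_size - dnp - zil - u64;
        int new_pad = new_size - dnp * 3 - zil - u64 * 2;

        /* meta dnode + zil header + type (+ flags + pad + 2 dnodes) */
        assert(dnp + zil + u64 + old_pad == old_size);
        assert(dnp + zil + u64 * 2 + new_pad + dnp * 2 == new_size);
        (void) printf("old pad %d, new pad %d\n", old_pad, new_pad);
        return (0);     /* prints: old pad 312, new pad 304 */
}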
struct objset { struct objset {
@ -62,6 +70,8 @@ typedef struct objset_impl {
arc_buf_t *os_phys_buf; arc_buf_t *os_phys_buf;
objset_phys_t *os_phys; objset_phys_t *os_phys;
dnode_t *os_meta_dnode; dnode_t *os_meta_dnode;
dnode_t *os_userused_dnode;
dnode_t *os_groupused_dnode;
zilog_t *os_zil; zilog_t *os_zil;
objset_t os; objset_t os;
uint8_t os_checksum; /* can change, under dsl_dir's locks */ uint8_t os_checksum; /* can change, under dsl_dir's locks */
@ -74,6 +84,8 @@ typedef struct objset_impl {
struct dmu_tx *os_synctx; /* XXX sketchy */ struct dmu_tx *os_synctx; /* XXX sketchy */
blkptr_t *os_rootbp; blkptr_t *os_rootbp;
zil_header_t os_zil_header; zil_header_t os_zil_header;
list_t os_synced_dnodes;
uint64_t os_flags;
/* Protected by os_obj_lock */ /* Protected by os_obj_lock */
kmutex_t os_obj_lock; kmutex_t os_obj_lock;
@ -92,6 +104,7 @@ typedef struct objset_impl {
} objset_impl_t; } objset_impl_t;
#define DMU_META_DNODE_OBJECT 0 #define DMU_META_DNODE_OBJECT 0
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
#define DMU_OS_IS_L2CACHEABLE(os) \ #define DMU_OS_IS_L2CACHEABLE(os) \
((os)->os_secondary_cache == ZFS_CACHE_ALL || \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \
@ -106,7 +119,8 @@ int dmu_objset_create(const char *name, dmu_objset_type_t type,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
int dmu_objset_destroy(const char *name); int dmu_objset_destroy(const char *name);
int dmu_objset_rollback(objset_t *os); int dmu_objset_rollback(objset_t *os);
int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
boolean_t recursive);
void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_stats(objset_t *os, nvlist_t *nv);
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
@ -127,6 +141,10 @@ objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
objset_impl_t **osip); objset_impl_t **osip);
void dmu_objset_evict(struct dsl_dataset *ds, void *arg); void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
int dmu_objset_userspace_upgrade(objset_t *os);
boolean_t dmu_objset_userspace_present(objset_t *os);
#ifdef __cplusplus #ifdef __cplusplus
} }
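The DMU_OBJECT_IS_SPECIAL() test above works because the two reserved object numbers from dmu.h are huge unsigned values that turn negative when viewed as int64_t, while the meta-dnode object is 0. A stand-alone check of the arithmetic, with the macros copied from this diff:

#include <stdio.h>
#include <stdint.h>

#define DMU_USERUSED_OBJECT     (-1ULL)
#define DMU_GROUPUSED_OBJECT    (-2ULL)
#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)

int
main(void)
{
        (void) printf("%d %d %d %d\n",
            DMU_OBJECT_IS_SPECIAL(DMU_USERUSED_OBJECT),         /* 1 */
            DMU_OBJECT_IS_SPECIAL(DMU_GROUPUSED_OBJECT),        /* 1 */
            DMU_OBJECT_IS_SPECIAL(0ULL),        /* 1: meta-dnode object */
            DMU_OBJECT_IS_SPECIAL(4ULL));       /* 0: ordinary object */
        return (0);
}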


@ -98,7 +98,8 @@ enum dnode_dirtycontext {
}; };
/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */
#define DNODE_FLAG_USED_BYTES (1<<0) #define DNODE_FLAG_USED_BYTES (1<<0)
#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1)
typedef struct dnode_phys { typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_type; /* dmu_object_type_t */
@ -131,10 +132,7 @@ typedef struct dnode {
*/ */
krwlock_t dn_struct_rwlock; krwlock_t dn_struct_rwlock;
/* /* Our link on dn_objset->os_dnodes list; protected by os_lock. */
* Our link on dataset's dd_dnodes list.
* Protected by dd_accounting_mtx.
*/
list_node_t dn_link; list_node_t dn_link;
/* immutable: */ /* immutable: */
@ -191,6 +189,9 @@ typedef struct dnode {
/* parent IO for current sync write */ /* parent IO for current sync write */
zio_t *dn_zio; zio_t *dn_zio;
/* used in syncing context */
dnode_phys_t *dn_oldphys;
/* holds prefetch structure */ /* holds prefetch structure */
struct zfetch dn_zfetch; struct zfetch dn_zfetch;
} dnode_t; } dnode_t;


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -195,7 +195,7 @@ void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
dmu_tx_t *tx); dmu_tx_t *tx);
int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_DSL_DELEG_H #ifndef _SYS_DSL_DELEG_H
#define _SYS_DSL_DELEG_H #define _SYS_DSL_DELEG_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/dsl_pool.h> #include <sys/dsl_pool.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -51,6 +49,10 @@ extern "C" {
#define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_ALLOW "allow"
#define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_USERPROP "userprop"
#define ZFS_DELEG_PERM_VSCAN "vscan" #define ZFS_DELEG_PERM_VSCAN "vscan"
#define ZFS_DELEG_PERM_USERQUOTA "userquota"
#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
#define ZFS_DELEG_PERM_USERUSED "userused"
#define ZFS_DELEG_PERM_GROUPUSED "groupused"
/* /*
* Note: the names of properties that are marked delegatable are also * Note: the names of properties that are marked delegatable are also


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -107,7 +107,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
const char *tail, void *tag, dsl_dir_t **); const char *tail, void *tag, dsl_dir_t **);
void dsl_dir_name(dsl_dir_t *dd, char *buf); void dsl_dir_name(dsl_dir_t *dd, char *buf);
int dsl_dir_namelen(dsl_dir_t *dd); int dsl_dir_namelen(dsl_dir_t *dd);
int dsl_dir_is_private(dsl_dir_t *dd);
uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
const char *name, dmu_tx_t *tx); const char *name, dmu_tx_t *tx);
dsl_checkfunc_t dsl_dir_destroy_check; dsl_checkfunc_t dsl_dir_destroy_check;


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -77,6 +77,7 @@ typedef struct dsl_pool {
struct dsl_dir *dp_mos_dir; struct dsl_dir *dp_mos_dir;
struct dsl_dataset *dp_origin_snap; struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj; uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
/* No lock needed - sync context only */ /* No lock needed - sync context only */
blkptr_t dp_meta_rootbp; blkptr_t dp_meta_rootbp;
@ -143,6 +144,8 @@ int dsl_pool_scrub_clean(dsl_pool_t *dp);
void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
void dsl_pool_scrub_restart(dsl_pool_t *dp); void dsl_pool_scrub_restart(dsl_pool_t *dp);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif


@ -19,18 +19,17 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_DSL_PROP_H #ifndef _SYS_DSL_PROP_H
#define _SYS_DSL_PROP_H #define _SYS_DSL_PROP_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/dsl_pool.h> #include <sys/dsl_pool.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/dsl_synctask.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -66,8 +65,10 @@ int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
int intsz, int numints, void *buf, char *setpoint); int intsz, int numints, void *buf, char *setpoint);
dsl_syncfunc_t dsl_props_set_sync;
int dsl_prop_set(const char *ddname, const char *propname, int dsl_prop_set(const char *ddname, const char *propname,
int intsz, int numints, const void *buf); int intsz, int numints, const void *buf);
int dsl_props_set(const char *dsname, nvlist_t *nvl);
void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
cred_t *cr, dmu_tx_t *tx); cred_t *cr, dmu_tx_t *tx);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -39,6 +39,8 @@ extern "C" {
typedef struct metaslab_class metaslab_class_t; typedef struct metaslab_class metaslab_class_t;
typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_group metaslab_group_t;
extern space_map_ops_t *zfs_metaslab_ops;
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
uint64_t start, uint64_t size, uint64_t txg); uint64_t start, uint64_t size, uint64_t txg);
extern void metaslab_fini(metaslab_t *msp); extern void metaslab_fini(metaslab_t *msp);
@ -55,7 +57,7 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
boolean_t now); boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
extern metaslab_class_t *metaslab_class_create(void); extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc); extern void metaslab_class_destroy(metaslab_class_t *mc);
extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
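Since the class now owns the picker ops, pool setup selects the block allocator once for every metaslab group the class will contain. A sketch of the call site (a fragment, assuming spa_t fields named as in spa_impl.h):

        spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
        spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);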


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_METASLAB_IMPL_H #ifndef _SYS_METASLAB_IMPL_H
#define _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/metaslab.h> #include <sys/metaslab.h>
#include <sys/space_map.h> #include <sys/space_map.h>
#include <sys/vdev.h> #include <sys/vdev.h>
@ -41,6 +39,7 @@ extern "C" {
struct metaslab_class { struct metaslab_class {
metaslab_group_t *mc_rotor; metaslab_group_t *mc_rotor;
uint64_t mc_allocated; uint64_t mc_allocated;
space_map_ops_t *mc_ops;
}; };
struct metaslab_group { struct metaslab_group {


@ -324,12 +324,9 @@ extern int spa_get_stats(const char *pool, nvlist_t **config,
char *altroot, size_t buflen); char *altroot, size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
const char *history_str, nvlist_t *zplprops); const char *history_str, nvlist_t *zplprops);
extern int spa_check_rootconf(char *devpath, char *devid,
nvlist_t **bestconf, uint64_t *besttxg);
extern boolean_t spa_rootdev_validate(nvlist_t *nv);
extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import_rootpool(char *devpath, char *devid);
extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool); extern int spa_destroy(char *pool);
extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@ -347,6 +344,7 @@ extern void spa_inject_delref(spa_t *spa);
#define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
/* device manipulation */ /* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@ -356,6 +354,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done); int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
/* spare state (which is global across all pools) */ /* spare state (which is global across all pools) */
extern void spa_spare_add(vdev_t *vd); extern void spa_spare_add(vdev_t *vd);


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_SPA_BOOT_H #ifndef _SYS_SPA_BOOT_H
#define _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/nvpair.h> #include <sys/nvpair.h>
#ifdef __cplusplus #ifdef __cplusplus
@ -36,7 +34,6 @@ extern "C" {
extern char *spa_get_bootprop(char *prop); extern char *spa_get_bootprop(char *prop);
extern void spa_free_bootprop(char *prop); extern void spa_free_bootprop(char *prop);
extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p);
#ifdef __cplusplus #ifdef __cplusplus
} }


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -141,9 +141,6 @@ struct spa {
int spa_async_suspended; /* async tasks suspended */ int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */ kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */ uint16_t spa_async_tasks; /* async task mask */
kmutex_t spa_async_root_lock; /* protects async root count */
uint64_t spa_async_root_count; /* number of async root zios */
kcondvar_t spa_async_root_cv; /* notify when count == 0 */
char *spa_root; /* alternate root directory */ char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */ uint64_t spa_ena; /* spa-wide ereport ENA */
boolean_t spa_last_open_failed; /* true if last open failed */
@ -163,15 +160,16 @@ struct spa {
uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_delegation; /* delegation on/off */ uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */ list_t spa_config_list; /* previous cache file(s) */
zio_t *spa_async_zio_root; /* root of all async I/O */
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */ kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_suspended; /* pool is suspended */
boolean_t spa_import_faulted; /* allow faulted vdevs */
boolean_t spa_is_root; /* pool is root */ boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */ int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */ int spa_mode; /* FREAD | FWRITE */
spa_log_state_t spa_log_state; /* log state */ spa_log_state_t spa_log_state; /* log state */
uint64_t spa_autoexpand; /* lun expansion on/off */
/* /*
* spa_refcnt & spa_config_lock must be the last elements * spa_refcnt & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options. * because refcount_t changes size based on compilation options.


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -46,12 +46,14 @@ typedef struct space_map {
uint8_t sm_loading; /* map loading? */ uint8_t sm_loading; /* map loading? */
kcondvar_t sm_load_cv; /* map load completion */ kcondvar_t sm_load_cv; /* map load completion */
space_map_ops_t *sm_ops; /* space map block picker ops vector */ space_map_ops_t *sm_ops; /* space map block picker ops vector */
avl_tree_t *sm_pp_root; /* picker-private AVL tree */
void *sm_ppd; /* picker-private data */ void *sm_ppd; /* picker-private data */
kmutex_t *sm_lock; /* pointer to lock that protects map */ kmutex_t *sm_lock; /* pointer to lock that protects map */
} space_map_t; } space_map_t;
typedef struct space_seg { typedef struct space_seg {
avl_node_t ss_node; /* AVL node */ avl_node_t ss_node; /* AVL node */
avl_node_t ss_pp_node; /* AVL picker-private node */
uint64_t ss_start; /* starting offset of this segment */ uint64_t ss_start; /* starting offset of this segment */
uint64_t ss_end; /* ending offset (non-inclusive) */ uint64_t ss_end; /* ending offset (non-inclusive) */
} space_seg_t; } space_seg_t;
@ -74,6 +76,7 @@ struct space_map_ops {
uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
uint64_t (*smop_max)(space_map_t *sm);
}; };
/* /*
@ -152,6 +155,7 @@ extern void space_map_unload(space_map_t *sm);
extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
extern uint64_t space_map_maxsize(space_map_t *sm);
extern void space_map_sync(space_map_t *sm, uint8_t maptype, extern void space_map_sync(space_map_t *sm, uint8_t maptype,
space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
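A sketch of the intended use (the caller is hypothetical): with smop_max wired into the ops vector, an allocator can reject a request against a loaded map without walking its segments.

static uint64_t
sm_try_alloc(space_map_t *sm, uint64_t size)
{
        ASSERT(MUTEX_HELD(sm->sm_lock));
        if (space_map_maxsize(sm) < size)
                return (-1ULL);                 /* no segment can fit it */
        return (space_map_alloc(sm, size));     /* -1ULL on failure */
}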


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -50,7 +50,6 @@ extern int vdev_open(vdev_t *);
extern int vdev_validate(vdev_t *); extern int vdev_validate(vdev_t *);
extern void vdev_close(vdev_t *); extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_init(vdev_t *, uint64_t txg);
extern void vdev_reopen(vdev_t *); extern void vdev_reopen(vdev_t *);
extern int vdev_validate_aux(vdev_t *vd); extern int vdev_validate_aux(vdev_t *vd);
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
@ -71,6 +70,8 @@ extern boolean_t vdev_resilver_needed(vdev_t *vd,
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_clear_stats(vdev_t *vd); extern void vdev_clear_stats(vdev_t *vd);
@ -113,7 +114,8 @@ extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
boolean_t);
extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -113,6 +113,7 @@ struct vdev {
uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid; /* unique ID for this vdev */
uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_guid_sum; /* self guid + all child guids */
uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_asize; /* allocatable device capacity */
uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_ashift; /* block alignment shift */
uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */ uint64_t vdev_prevstate; /* used when reopening a vdev */
@ -125,6 +126,7 @@ struct vdev {
uint64_t vdev_children; /* number of children */ uint64_t vdev_children; /* number of children */
space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
vdev_stat_t vdev_stat; /* virtual device statistics */ vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
/* /*
* Top-level vdev state. * Top-level vdev state.
@ -159,6 +161,7 @@ struct vdev {
char *vdev_path; /* vdev path (if any) */ char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */ char *vdev_devid; /* vdev devid (if any) */
char *vdev_physpath; /* vdev device path (if any) */ char *vdev_physpath; /* vdev device path (if any) */
char *vdev_fru; /* physical FRU location */
uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_not_present; /* not present during import */
uint64_t vdev_unspare; /* unspare when resilvering done */ uint64_t vdev_unspare; /* unspare when resilvering done */
hrtime_t vdev_last_try; /* last reopen time */ hrtime_t vdev_last_try; /* last reopen time */
@ -188,8 +191,9 @@ struct vdev {
kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */
}; };
#define VDEV_SKIP_SIZE (8 << 10) #define VDEV_PAD_SIZE (8 << 10)
#define VDEV_BOOT_HEADER_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_pad2) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
#define VDEV_PHYS_SIZE (112 << 10) #define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10) #define VDEV_UBERBLOCK_RING (128 << 10)
@ -201,26 +205,14 @@ struct vdev {
offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
#define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
/* ZFS boot block */
#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL
#define VDEV_BOOT_VERSION 1 /* version number */
typedef struct vdev_boot_header {
uint64_t vb_magic; /* VDEV_BOOT_MAGIC */
uint64_t vb_version; /* VDEV_BOOT_VERSION */
uint64_t vb_offset; /* start offset (bytes) */
uint64_t vb_size; /* size (bytes) */
char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
} vdev_boot_header_t;
typedef struct vdev_phys { typedef struct vdev_phys {
char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
zio_block_tail_t vp_zbt; zio_block_tail_t vp_zbt;
} vdev_phys_t; } vdev_phys_t;
typedef struct vdev_label { typedef struct vdev_label {
char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
vdev_boot_header_t vl_boot_header; /* 8K */ char vl_pad2[VDEV_PAD_SIZE]; /* 8K */
vdev_phys_t vl_vdev_phys; /* 112K */ vdev_phys_t vl_vdev_phys; /* 112K */
char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
} vdev_label_t; /* 256K total */ } vdev_label_t; /* 256K total */
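A stand-alone check that the reshuffled label still adds up (sizes restated from the defines above): dropping vdev_boot_header_t in favor of a second 8K pad leaves every later offset, and the 256K total, unchanged.

#include <stdio.h>
#include <assert.h>

int
main(void)
{
        const int pad = 8 << 10, phys = 112 << 10, ring = 128 << 10;

        assert(pad * 2 + phys + ring == (256 << 10));
        (void) printf("pad1@0 pad2@%d phys@%d uberblocks@%d total %d\n",
            pad, pad * 2, pad * 2 + phys, pad * 2 + phys + ring);
        return (0);
}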
@ -249,6 +241,7 @@ typedef struct vdev_label {
#define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_ADD 1
#define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_SPARE 2
#define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_L2CACHE 3
#define VDEV_ALLOC_ROOTPOOL 4
/* /*
* Allocate or free a vdev * Allocate or free a vdev
@ -269,6 +262,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/* /*
* vdev sync load and sync * vdev sync load and sync
*/ */
extern void vdev_load_log_state(vdev_t *vd, nvlist_t *nv);
extern void vdev_load(vdev_t *vd); extern void vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
@ -290,7 +284,8 @@ extern vdev_ops_t vdev_spare_ops;
* Common size functions * Common size functions
*/ */
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_rsize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
/* /*
* zdb uses this tunable, so it must be declared here to make lint happy. * zdb uses this tunable, so it must be declared here to make lint happy.


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_ZAP_H #ifndef _SYS_ZAP_H
#define _SYS_ZAP_H #define _SYS_ZAP_H
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* ZAP - ZFS Attribute Processor * ZAP - ZFS Attribute Processor
* *
@ -87,9 +85,6 @@
extern "C" { extern "C" {
#endif #endif
#define ZAP_MAXNAMELEN 256
#define ZAP_MAXVALUELEN 1024
/* /*
* The matchtype specifies which entry will be accessed. * The matchtype specifies which entry will be accessed.
* MT_EXACT: only find an exact match (non-normalized) * MT_EXACT: only find an exact match (non-normalized)
@ -186,6 +181,10 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
matchtype_t mt, char *realname, int rn_len, matchtype_t mt, char *realname, int rn_len,
boolean_t *normalization_conflictp); boolean_t *normalization_conflictp);
int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
int add, uint64_t *towrite, uint64_t *tooverwrite,
uint64_t dn_datablkshift);
/* /*
* Create an attribute with the given name and value. * Create an attribute with the given name and value.
* *


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_ZAP_IMPL_H #ifndef _SYS_ZAP_IMPL_H
#define _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zap.h> #include <sys/zap.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/avl.h> #include <sys/avl.h>
@ -195,6 +193,8 @@ int fzap_count(zap_t *zap, uint64_t *count);
int fzap_lookup(zap_name_t *zn, int fzap_lookup(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers, void *buf, uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp); char *realname, int rn_len, boolean_t *normalization_conflictp);
int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
uint64_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx); const void *val, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn, int fzap_update(zap_name_t *zn,


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -114,8 +114,6 @@ typedef struct zfs_acl_phys {
uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
} zfs_acl_phys_t; } zfs_acl_phys_t;
typedef struct acl_ops { typedef struct acl_ops {
uint32_t (*ace_mask_get) (void *acep); /* get access mask */ uint32_t (*ace_mask_get) (void *acep); /* get access mask */
void (*ace_mask_set) (void *acep, void (*ace_mask_set) (void *acep,
@ -161,12 +159,21 @@ typedef struct zfs_acl {
zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
list_t z_acl; /* chunks of ACE data */ list_t z_acl; /* chunks of ACE data */
acl_ops_t z_ops; /* ACL operations */ acl_ops_t z_ops; /* ACL operations */
boolean_t z_has_fuids; /* FUIDs present in ACL? */
} zfs_acl_t; } zfs_acl_t;
#define ACL_DATA_ALLOCED 0x1 #define ACL_DATA_ALLOCED 0x1
#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
struct zfs_fuid_info;
typedef struct zfs_acl_ids {
uint64_t z_fuid; /* file owner fuid */
uint64_t z_fgid; /* file group owner fuid */
uint64_t z_mode; /* mode to set on create */
zfs_acl_t *z_aclp; /* ACL to create with file */
struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
} zfs_acl_ids_t;
/* /*
* Property values for acl_mode and acl_inherit. * Property values for acl_mode and acl_inherit.
* *
@ -183,16 +190,18 @@ typedef struct zfs_acl {
struct znode; struct znode;
struct zfsvfs; struct zfsvfs;
struct zfs_fuid_info;
#ifdef _KERNEL #ifdef _KERNEL
void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, int zfs_acl_ids_create(struct znode *, int, vattr_t *,
dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); cred_t *, vsecattr_t *, zfs_acl_ids_t *);
void zfs_acl_ids_free(zfs_acl_ids_t *);
boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *);
int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *);
void zfs_acl_rele(void *); void zfs_acl_rele(void *);
void zfs_oldace_byteswap(ace_t *, int); void zfs_oldace_byteswap(ace_t *, int);
void zfs_ace_byteswap(void *, size_t, boolean_t); void zfs_ace_byteswap(void *, size_t, boolean_t);
extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
@ -202,9 +211,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *);
int zfs_zaccess_rename(struct znode *, struct znode *, int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr); struct znode *, struct znode *, cred_t *cr);
void zfs_acl_free(zfs_acl_t *); void zfs_acl_free(zfs_acl_t *);
int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, struct zfs_fuid_info **, zfs_acl_t **);
struct zfs_fuid_info **, dmu_tx_t *); int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
#endif #endif


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_ZFS_CONTEXT_H #ifndef _SYS_ZFS_CONTEXT_H
#define _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -62,6 +60,7 @@ extern "C" {
#include <sys/zfs_debug.h> #include <sys/zfs_debug.h>
#include <sys/sysevent.h> #include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h> #include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <sys/fm/util.h> #include <sys/fm/util.h>
#define CPU_SEQID (CPU->cpu_seqid) #define CPU_SEQID (CPU->cpu_seqid)


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _ZFS_CTLDIR_H #ifndef _ZFS_CTLDIR_H
#define _ZFS_CTLDIR_H #define _ZFS_CTLDIR_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/pathname.h> #include <sys/pathname.h>
#include <sys/vnode.h> #include <sys/vnode.h>
#include <sys/zfs_vfsops.h> #include <sys/zfs_vfsops.h>
@ -66,6 +64,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
#define ZFSCTL_INO_ROOT 0x1 #define ZFSCTL_INO_ROOT 0x1
#define ZFSCTL_INO_SNAPDIR 0x2 #define ZFSCTL_INO_SNAPDIR 0x2
#define ZFSCTL_INO_SHARES 0x3
#ifdef __cplusplus #ifdef __cplusplus
} }


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_FS_ZFS_DIR_H #ifndef _SYS_FS_ZFS_DIR_H
#define _SYS_FS_ZFS_DIR_H #define _SYS_FS_ZFS_DIR_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/pathname.h> #include <sys/pathname.h>
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
pathname_t *); pathname_t *);
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); uint_t, znode_t **, int, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *); extern void zfs_rmnode(znode_t *);
extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *); extern boolean_t zfs_dirempty(znode_t *);


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_FS_ZFS_FUID_H #ifndef _SYS_FS_ZFS_FUID_H
#define _SYS_FS_ZFS_FUID_H #define _SYS_FS_ZFS_FUID_H
#pragma ident "%Z%%M% %I% %E% SMI"
#ifdef _KERNEL #ifdef _KERNEL
#include <sys/kidmap.h> #include <sys/kidmap.h>
#include <sys/sid.h> #include <sys/sid.h>
@ -51,11 +49,11 @@ typedef enum {
* Estimate space needed for one more fuid table entry. * Estimate space needed for one more fuid table entry.
* for now assume its current size + 1K * for now assume its current size + 1K
*/ */
#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) #define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1))
#define FUID_INDEX(x) (x >> 32) #define FUID_INDEX(x) ((x) >> 32)
#define FUID_RID(x) (x & 0xffffffff) #define FUID_RID(x) ((x) & 0xffffffff)
#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) #define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid))
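The added parentheses and the (uint64_t) cast are not cosmetic: with a plain
int index, (idx << 32) shifts a 32-bit value by its full width, which is
undefined behavior in C. A minimal standalone sketch (hypothetical values,
not part of this change) of the encode/decode round trip:

#include <stdint.h>
#include <assert.h>

#define FUID_INDEX(x)		((x) >> 32)
#define FUID_RID(x)		((x) & 0xffffffff)
#define FUID_ENCODE(idx, rid)	(((uint64_t)(idx) << 32) | (rid))

int
main(void)
{
	uint64_t fuid = FUID_ENCODE(5, 1001);	/* domain index 5, RID 1001 */

	assert(FUID_INDEX(fuid) == 5);		/* index in the upper 32 bits */
	assert(FUID_RID(fuid) == 1001);		/* RID in the lower 32 bits */
	return (0);
}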
/* /*
* FUIDs cause problems for the intent log * FUIDs cause problems for the intent log
* we need to replay the creation of the FUID, * we need to replay the creation of the FUID,
@ -104,17 +102,23 @@ struct znode;
extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
extern void zfs_fuid_destroy(zfsvfs_t *); extern void zfs_fuid_destroy(zfsvfs_t *);
extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t,
dmu_tx_t *, cred_t *, zfs_fuid_info_t **); cred_t *, zfs_fuid_info_t **);
extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t,
dmu_tx_t *, zfs_fuid_info_t **); zfs_fuid_info_t **);
extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr,
uid_t *gid); uid_t *uid, uid_t *gid);
extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); extern zfs_fuid_info_t *zfs_fuid_info_alloc(void);
extern void zfs_fuid_info_free(); extern void zfs_fuid_info_free(zfs_fuid_info_t *);
extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *);
void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *);
extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
char **retdomain, boolean_t addok);
extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
#endif #endif
char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *);
uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *);
void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *);


@ -19,15 +19,13 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#ifndef _SYS_ZFS_IOCTL_H #ifndef _SYS_ZFS_IOCTL_H
#define _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/cred.h> #include <sys/cred.h>
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/zio.h> #include <sys/zio.h>
@ -118,7 +116,7 @@ typedef struct zinject_record {
uint32_t zi_error; uint32_t zi_error;
uint64_t zi_type; uint64_t zi_type;
uint32_t zi_freq; uint32_t zi_freq;
uint32_t zi_pad; /* pad out to 64 bit alignment */ uint32_t zi_failfast;
} zinject_record_t; } zinject_record_t;
#define ZINJECT_NULL 0x1 #define ZINJECT_NULL 0x1
@ -162,12 +160,20 @@ typedef struct zfs_cmd {
uint64_t zc_history_len; uint64_t zc_history_len;
uint64_t zc_history_offset; uint64_t zc_history_offset;
uint64_t zc_obj; uint64_t zc_obj;
uint64_t zc_iflags; /* internal to zfs(7fs) */
zfs_share_t zc_share; zfs_share_t zc_share;
dmu_objset_stats_t zc_objset_stats; dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record; struct drr_begin zc_begin_record;
zinject_record_t zc_inject_record; zinject_record_t zc_inject_record;
} zfs_cmd_t; } zfs_cmd_t;
typedef struct zfs_useracct {
char zu_domain[256];
uid_t zu_rid;
uint32_t zu_pad;
uint64_t zu_space;
} zfs_useracct_t;
#define ZVOL_MAX_MINOR (1 << 16) #define ZVOL_MAX_MINOR (1 << 16)
#define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1)

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -53,6 +53,7 @@ struct zfsvfs {
avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */
krwlock_t z_fuid_lock; /* fuid lock */ krwlock_t z_fuid_lock; /* fuid lock */
boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_loaded; /* fuid tables are loaded */
boolean_t z_fuid_dirty; /* need to sync fuid table ? */
struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
zilog_t *z_log; /* intent log pointer */ zilog_t *z_log; /* intent log pointer */
uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_mode; /* acl chmod/mode behavior */
@ -72,8 +73,12 @@ struct zfsvfs {
boolean_t z_vscan; /* virus scan on/off */ boolean_t z_vscan; /* virus scan on/off */
boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */ boolean_t z_replay; /* set during ZIL replay */
kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ kmutex_t z_online_recv_lock; /* held while recv in progress */
uint64_t z_version; /* ZPL version */ uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
uint64_t z_userquota_obj;
uint64_t z_groupquota_obj;
#define ZFS_OBJ_MTX_SZ 64 #define ZFS_OBJ_MTX_SZ 64
kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */
}; };
@ -130,6 +135,17 @@ extern uint_t zfs_fsyncer_key;
extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode);
extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode);
extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t *valuep);
extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t quota);
extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
boolean_t isgroup, uint64_t fuid);
extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
extern int zfsvfs_create(const char *name, int mode, zfsvfs_t **zvp);
extern void zfsvfs_free(zfsvfs_t *zfsvfs);
#ifdef __cplusplus #ifdef __cplusplus
} }


@ -93,12 +93,15 @@ extern "C" {
/* /*
* Special attributes for master node. * Special attributes for master node.
* "userquota@" and "groupquota@" are also valid (from
* zfs_userquota_prop_prefixes[]).
*/ */
#define ZFS_FSID "FSID" #define ZFS_FSID "FSID"
#define ZFS_UNLINKED_SET "DELETE_QUEUE" #define ZFS_UNLINKED_SET "DELETE_QUEUE"
#define ZFS_ROOT_OBJ "ROOT" #define ZFS_ROOT_OBJ "ROOT"
#define ZPL_VERSION_STR "VERSION" #define ZPL_VERSION_STR "VERSION"
#define ZFS_FUID_TABLES "FUID" #define ZFS_FUID_TABLES "FUID"
#define ZFS_SHARES_DIR "SHARES"
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
@ -309,7 +312,6 @@ extern int zfs_create_op_tables();
extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
extern dev_t zfs_cmpldev(uint64_t); extern dev_t zfs_cmpldev(uint64_t);
extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
extern int zfs_set_version(const char *name, uint64_t newvers);
extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
extern void zfs_znode_dmu_fini(znode_t *); extern void zfs_znode_dmu_fini(znode_t *);
@ -336,6 +338,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern caddr_t zfs_map_page(page_t *, enum seg_rw);
extern void zfs_unmap_page(page_t *, caddr_t); extern void zfs_unmap_page(page_t *, caddr_t);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -56,9 +56,15 @@ typedef struct zil_header {
uint64_t zh_replay_seq; /* highest replayed sequence number */ uint64_t zh_replay_seq; /* highest replayed sequence number */
blkptr_t zh_log; /* log chain */ blkptr_t zh_log; /* log chain */
uint64_t zh_claim_seq; /* highest claimed sequence number */ uint64_t zh_claim_seq; /* highest claimed sequence number */
uint64_t zh_pad[5]; uint64_t zh_flags; /* header flags */
uint64_t zh_pad[4];
} zil_header_t; } zil_header_t;
/*
* zh_flags bit settings
*/
#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */
/* /*
* Log block trailer - structure at the end of the header and each log block * Log block trailer - structure at the end of the header and each log block
* *
@ -299,7 +305,27 @@ typedef struct {
*/ */
/* /*
* ZFS intent log transaction structure * Writes are handled in three different ways:
*
* WR_INDIRECT:
* In this mode, if we need to commit the write later, then the block
* is immediately written into the file system (using dmu_sync),
* and a pointer to the block is put into the log record.
* When the txg commits, the block is linked in.
* This avoids the additional copy of the data into the log record.
* There are a few requirements for this to occur:
* - write is greater than zfs/zvol_immediate_write_sz
* - not using slogs (as slogs are assumed to always be faster
* than writing into the main pool)
* - the write occupies only one block
* WR_COPIED:
* If we know we'll immediately be committing the
* transaction (FSYNC or FDSYNC), then we allocate a larger
* log record here for the data and copy the data in.
* WR_NEED_COPY:
* Otherwise we don't allocate a buffer, and *if* we need to
* flush the write later then a buffer is allocated and
* we retrieve the data using the dmu.
*/ */
typedef enum { typedef enum {
WR_INDIRECT, /* indirect - a large write (dmu_sync() data) */ WR_INDIRECT, /* indirect - a large write (dmu_sync() data) */
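The three modes above can be restated as a small decision function. This is
a hedged sketch only: the real selection happens in the ZPL/zvol logging
paths, and the parameter names here (have_slog, sync_commit, single_block)
are illustrative, not an actual ZFS interface:

#include <stdint.h>
#include <stdbool.h>

typedef enum { WR_INDIRECT, WR_COPIED, WR_NEED_COPY } wr_state_t;

static wr_state_t
pick_write_mode(uint64_t len, uint64_t immediate_write_sz,
    bool have_slog, bool sync_commit, bool single_block)
{
	if (len > immediate_write_sz && !have_slog && single_block)
		return (WR_INDIRECT);	/* dmu_sync() the block, log only a bp */
	if (sync_commit)
		return (WR_COPIED);	/* copy the data into the log record */
	return (WR_NEED_COPY);		/* fetch via the dmu only if needed */
}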
@ -359,9 +385,9 @@ extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
extern int zil_vdev_offline(char *osname, void *txarg);
extern int zil_claim(char *osname, void *txarg); extern int zil_claim(char *osname, void *txarg);
extern int zil_check_log_chain(char *osname, void *txarg); extern int zil_check_log_chain(char *osname, void *txarg);
extern int zil_clear_log_chain(char *osname, void *txarg);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog); extern void zil_clean(zilog_t *zilog);
extern int zil_is_committed(zilog_t *zilog); extern int zil_is_committed(zilog_t *zilog);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -101,6 +101,9 @@ typedef struct zil_dva_node {
avl_node_t zn_node; avl_node_t zn_node;
} zil_dva_node_t; } zil_dva_node_t;
#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
sizeof (lr_write_t))
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif


@ -76,7 +76,7 @@ enum zio_checksum {
ZIO_CHECKSUM_FUNCTIONS ZIO_CHECKSUM_FUNCTIONS
}; };
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
enum zio_compress { enum zio_compress {
@ -116,30 +116,33 @@ enum zio_compress {
#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) #define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
#define ZIO_PRIORITY_TABLE_SIZE 10 #define ZIO_PRIORITY_TABLE_SIZE 10
#define ZIO_FLAG_MUSTSUCCEED 0x00000 #define ZIO_FLAG_MUSTSUCCEED 0x000000
#define ZIO_FLAG_CANFAIL 0x00001 #define ZIO_FLAG_CANFAIL 0x000001
#define ZIO_FLAG_SPECULATIVE 0x00002 #define ZIO_FLAG_SPECULATIVE 0x000002
#define ZIO_FLAG_CONFIG_WRITER 0x00004 #define ZIO_FLAG_CONFIG_WRITER 0x000004
#define ZIO_FLAG_DONT_RETRY 0x00008 #define ZIO_FLAG_DONT_RETRY 0x000008
#define ZIO_FLAG_DONT_CACHE 0x00010 #define ZIO_FLAG_DONT_CACHE 0x000010
#define ZIO_FLAG_DONT_QUEUE 0x00020 #define ZIO_FLAG_DONT_QUEUE 0x000020
#define ZIO_FLAG_DONT_AGGREGATE 0x00040 #define ZIO_FLAG_DONT_AGGREGATE 0x000040
#define ZIO_FLAG_DONT_PROPAGATE 0x00080 #define ZIO_FLAG_DONT_PROPAGATE 0x000080
#define ZIO_FLAG_IO_BYPASS 0x00100 #define ZIO_FLAG_IO_BYPASS 0x000100
#define ZIO_FLAG_IO_REPAIR 0x00200 #define ZIO_FLAG_IO_REPAIR 0x000200
#define ZIO_FLAG_IO_RETRY 0x00400 #define ZIO_FLAG_IO_RETRY 0x000400
#define ZIO_FLAG_IO_REWRITE 0x00800 #define ZIO_FLAG_IO_REWRITE 0x000800
#define ZIO_FLAG_SELF_HEAL 0x01000 #define ZIO_FLAG_SELF_HEAL 0x001000
#define ZIO_FLAG_RESILVER 0x02000 #define ZIO_FLAG_RESILVER 0x002000
#define ZIO_FLAG_SCRUB 0x04000 #define ZIO_FLAG_SCRUB 0x004000
#define ZIO_FLAG_SCRUB_THREAD 0x08000 #define ZIO_FLAG_SCRUB_THREAD 0x008000
#define ZIO_FLAG_PROBE 0x10000 #define ZIO_FLAG_PROBE 0x010000
#define ZIO_FLAG_GANG_CHILD 0x20000 #define ZIO_FLAG_GANG_CHILD 0x020000
#define ZIO_FLAG_RAW 0x40000 #define ZIO_FLAG_RAW 0x040000
#define ZIO_FLAG_GODFATHER 0x080000
#define ZIO_FLAG_TRYHARD 0x100000
#define ZIO_FLAG_GANG_INHERIT \ #define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \ (ZIO_FLAG_CANFAIL | \
@ -157,7 +160,8 @@ enum zio_compress {
(ZIO_FLAG_GANG_INHERIT | \ (ZIO_FLAG_GANG_INHERIT | \
ZIO_FLAG_IO_REPAIR | \ ZIO_FLAG_IO_REPAIR | \
ZIO_FLAG_IO_RETRY | \ ZIO_FLAG_IO_RETRY | \
ZIO_FLAG_PROBE) ZIO_FLAG_PROBE | \
ZIO_FLAG_TRYHARD)
#define ZIO_FLAG_AGG_INHERIT \ #define ZIO_FLAG_AGG_INHERIT \
(ZIO_FLAG_DONT_AGGREGATE | \ (ZIO_FLAG_DONT_AGGREGATE | \
@ -281,7 +285,6 @@ struct zio {
int io_cmd; int io_cmd;
uint8_t io_priority; uint8_t io_priority;
uint8_t io_reexecute; uint8_t io_reexecute;
uint8_t io_async_root;
uint8_t io_state[ZIO_WAIT_TYPES]; uint8_t io_state[ZIO_WAIT_TYPES];
uint64_t io_txg; uint64_t io_txg;
spa_t *io_spa; spa_t *io_spa;
@ -324,6 +327,7 @@ struct zio {
int io_child_error[ZIO_CHILD_TYPES]; int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t *io_stall; uint64_t *io_stall;
zio_t *io_gang_leader;
zio_gang_node_t *io_gang_tree; zio_gang_node_t *io_gang_tree;
void *io_executor; void *io_executor;
void *io_waiter; void *io_waiter;
@ -415,7 +419,7 @@ extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
extern void zio_suspend(spa_t *spa, zio_t *zio); extern void zio_suspend(spa_t *spa, zio_t *zio);
extern void zio_resume(spa_t *spa); extern int zio_resume(spa_t *spa);
extern void zio_resume_wait(spa_t *spa); extern void zio_resume_wait(spa_t *spa);
/* /*
@ -435,7 +439,7 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen,
struct zinject_record *record); struct zinject_record *record);
extern int zio_clear_fault(int id); extern int zio_clear_fault(int id);
extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error);
#ifdef __cplusplus #ifdef __cplusplus


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -35,19 +35,36 @@
uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* Minimum size which forces the dynamic allocator to change
* its allocation strategy. Once the space map cannot satisfy
* an allocation of this size, it switches to using a more
* aggressive strategy (i.e., search by size rather than offset).
*/
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
/*
* The minimum free space, in percent, which must be available
* in a space map to continue allocations in a first-fit fashion.
* Once the space_map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
int metaslab_df_free_pct = 30;
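Taken together, the two tunables reduce the first-fit/best-fit switch to a
simple predicate; the authoritative check lives in metaslab_df_alloc() later
in this diff, and this standalone restatement is illustrative only:

#include <stdint.h>
#include <stdbool.h>

static bool
df_use_best_fit(uint64_t max_free_seg, uint64_t sm_space, uint64_t sm_size,
    uint64_t alloc_threshold, int free_pct_min)
{
	int free_pct = (int)(sm_space * 100 / sm_size);

	/* e.g. 24% free (below 30) or no segment >= threshold => best-fit */
	return (max_free_seg < alloc_threshold || free_pct < free_pct_min);
}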
/* /*
* ========================================================================== * ==========================================================================
* Metaslab classes * Metaslab classes
* ========================================================================== * ==========================================================================
*/ */
metaslab_class_t * metaslab_class_t *
metaslab_class_create(void) metaslab_class_create(space_map_ops_t *ops)
{ {
metaslab_class_t *mc; metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
mc->mc_rotor = NULL; mc->mc_rotor = NULL;
mc->mc_ops = ops;
return (mc); return (mc);
} }
@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
} }
/* /*
* ========================================================================== * This is a helper function that can be used by the allocator to find
* The first-fit block allocator * a suitable block to allocate. This will search the specified AVL
* ========================================================================== * tree looking for a block that matches the specified criteria.
*/ */
static void
metaslab_ff_load(space_map_t *sm)
{
ASSERT(sm->sm_ppd == NULL);
sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
}
static void
metaslab_ff_unload(space_map_t *sm)
{
kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
sm->sm_ppd = NULL;
}
static uint64_t static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size) metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
uint64_t align)
{ {
avl_tree_t *t = &sm->sm_root;
uint64_t align = size & -size;
uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
space_seg_t *ss, ssearch; space_seg_t *ss, ssearch;
avl_index_t where; avl_index_t where;
@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size)
return (-1ULL); return (-1ULL);
*cursor = 0; *cursor = 0;
return (metaslab_ff_alloc(sm, size)); return (metaslab_block_picker(t, cursor, size, align));
}
/*
* ==========================================================================
* The first-fit block allocator
* ==========================================================================
*/
static void
metaslab_ff_load(space_map_t *sm)
{
ASSERT(sm->sm_ppd == NULL);
sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
sm->sm_pp_root = NULL;
}
static void
metaslab_ff_unload(space_map_t *sm)
{
kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
sm->sm_ppd = NULL;
}
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
avl_tree_t *t = &sm->sm_root;
uint64_t align = size & -size;
uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
return (metaslab_block_picker(t, cursor, size, align));
} }
/* ARGSUSED */ /* ARGSUSED */
@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = {
metaslab_ff_unload, metaslab_ff_unload,
metaslab_ff_alloc, metaslab_ff_alloc,
metaslab_ff_claim, metaslab_ff_claim,
metaslab_ff_free metaslab_ff_free,
NULL /* maxsize */
}; };
/*
* Dynamic block allocator -
* Uses the first fit allocation scheme until space get low and then
* adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
* and metaslab_df_free_pct to determine when to switch the allocation scheme.
*/
uint64_t
metaslab_df_maxsize(space_map_t *sm)
{
avl_tree_t *t = sm->sm_pp_root;
space_seg_t *ss;
if (t == NULL || (ss = avl_last(t)) == NULL)
return (0ULL);
return (ss->ss_end - ss->ss_start);
}
static int
metaslab_df_seg_compare(const void *x1, const void *x2)
{
const space_seg_t *s1 = x1;
const space_seg_t *s2 = x2;
uint64_t ss_size1 = s1->ss_end - s1->ss_start;
uint64_t ss_size2 = s2->ss_end - s2->ss_start;
if (ss_size1 < ss_size2)
return (-1);
if (ss_size1 > ss_size2)
return (1);
if (s1->ss_start < s2->ss_start)
return (-1);
if (s1->ss_start > s2->ss_start)
return (1);
return (0);
}
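/*
 * The ss_start tiebreak above gives the size-sorted tree a total order:
 * two equally sized segments remain distinct nodes in sm_pp_root rather
 * than colliding on insert.
 */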
static void
metaslab_df_load(space_map_t *sm)
{
space_seg_t *ss;
ASSERT(sm->sm_ppd == NULL);
sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
avl_add(sm->sm_pp_root, ss);
}
static void
metaslab_df_unload(space_map_t *sm)
{
void *cookie = NULL;
kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
sm->sm_ppd = NULL;
while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
/* tear down the tree */
}
avl_destroy(sm->sm_pp_root);
kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
sm->sm_pp_root = NULL;
}
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
avl_tree_t *t = &sm->sm_root;
uint64_t align = size & -size;
uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
uint64_t max_size = metaslab_df_maxsize(sm);
int free_pct = sm->sm_space * 100 / sm->sm_size;
ASSERT(MUTEX_HELD(sm->sm_lock));
ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
if (max_size < size)
return (-1ULL);
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
t = sm->sm_pp_root;
*cursor = 0;
}
return (metaslab_block_picker(t, cursor, size, 1ULL));
}
/* ARGSUSED */
static void
metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
/* No need to update cursor */
}
/* ARGSUSED */
static void
metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
{
/* No need to update cursor */
}
static space_map_ops_t metaslab_df_ops = {
metaslab_df_load,
metaslab_df_unload,
metaslab_df_alloc,
metaslab_df_claim,
metaslab_df_free,
metaslab_df_maxsize
};
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/* /*
* ========================================================================== * ==========================================================================
* Metaslabs * Metaslabs
@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp)
} }
static int static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight) metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{ {
space_map_t *sm = &msp->ms_map; space_map_t *sm = &msp->ms_map;
space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
int error = space_map_load(sm, &metaslab_ff_ops, int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
SM_FREE, &msp->ms_smo,
msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
if (error) { if (error) {
metaslab_group_sort(msp->ms_group, msp, 0); metaslab_group_sort(msp->ms_group, msp, 0);
return (error); return (error);
} }
/*
* If we were able to load the map then make sure
* that this map is still able to satisfy our request.
*/
if (msp->ms_weight < size)
return (ENOSPC);
metaslab_group_sort(msp->ms_group, msp, metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight); msp->ms_weight | activation_weight);
} }
@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
int i; int i;
activation_weight = METASLAB_WEIGHT_PRIMARY; activation_weight = METASLAB_WEIGHT_PRIMARY;
for (i = 0; i < d; i++) for (i = 0; i < d; i++) {
if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY; activation_weight = METASLAB_WEIGHT_SECONDARY;
break;
}
}
for (;;) { for (;;) {
boolean_t was_active;
mutex_enter(&mg->mg_lock); mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
if (msp->ms_weight < size) { if (msp->ms_weight < size) {
@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
return (-1ULL); return (-1ULL);
} }
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY) if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break; break;
@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we * another thread may have changed the weight while we
* were blocked on the metaslab lock. * were blocked on the metaslab lock.
*/ */
if (msp->ms_weight < size) { if (msp->ms_weight < size || (was_active &&
!(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
continue; continue;
} }
@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue; continue;
} }
if (metaslab_activate(msp, activation_weight) != 0) { if (metaslab_activate(msp, activation_weight, size) != 0) {
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
continue; continue;
} }
@ -869,7 +1043,7 @@ next:
goto top; goto top;
} }
if (!zio_lock) { if (!allocatable && !zio_lock) {
dshift = 3; dshift = 3;
zio_lock = B_TRUE; zio_lock = B_TRUE;
goto top; goto top;
@ -955,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock); mutex_enter(&msp->ms_lock);
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
if (error || txg == 0) { /* txg == 0 indicates dry run */ if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_lock);
return (error); return (error);

File diff suppressed because it is too large


@ -432,10 +432,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot)
*/ */
for (c = 0; c < rvd->vdev_children; c++) { for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c]; vdev_t *tvd = rvd->vdev_child[c];
if (tvd->vdev_ms_array == 0) { if (tvd->vdev_ms_array == 0)
vdev_init(tvd, txg); vdev_metaslab_set_size(tvd);
vdev_config_dirty(tvd); vdev_expand(tvd, txg);
}
} }
} }
spa_config_exit(spa, SCL_ALL, FTAG); spa_config_exit(spa, SCL_ALL, FTAG);


@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* Routines to manage the on-disk persistent error log. * Routines to manage the on-disk persistent error log.
* *
@ -61,8 +59,8 @@
* lowercase hexadecimal numbers that don't overflow. * lowercase hexadecimal numbers that don't overflow.
*/ */
#ifdef _KERNEL #ifdef _KERNEL
static uint64_t uint64_t
strtonum(char *str, char **nptr) strtonum(const char *str, char **nptr)
{ {
uint64_t val = 0; uint64_t val = 0;
char c; char c;
@ -82,7 +80,8 @@ strtonum(char *str, char **nptr)
str++; str++;
} }
*nptr = str; if (nptr)
*nptr = (char *)str;
return (val); return (val);
} }
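A usage sketch for the now-exported strtonum() (hypothetical input; parsing
stops at the first character that is not a lowercase hex digit):

static void
strtonum_example(void)
{
	char *end;
	uint64_t val = strtonum("1a2b:4", &end);

	ASSERT(val == 0x1a2b);	/* hex prefix parsed */
	ASSERT(*end == ':');	/* end points at the first non-hex char */
}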


@ -20,12 +20,10 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/spa_impl.h> #include <sys/spa_impl.h>
#include <sys/zap.h> #include <sys/zap.h>
@ -127,12 +125,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
buf)) != 0) buf, DMU_READ_PREFETCH)) != 0)
return (err); return (err);
if (firstread != sizeof (reclen)) { if (firstread != sizeof (reclen)) {
if ((err = dmu_read(mos, spa->spa_history, if ((err = dmu_read(mos, spa->spa_history,
shpp->sh_pool_create_len, sizeof (reclen) - firstread, shpp->sh_pool_create_len, sizeof (reclen) - firstread,
buf + firstread)) != 0) buf + firstread, DMU_READ_PREFETCH)) != 0)
return (err); return (err);
} }
@ -380,10 +378,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
return (0); return (0);
} }
err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
DMU_READ_PREFETCH);
if (leftover && err == 0) { if (leftover && err == 0) {
err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
leftover, buf + read_len); leftover, buf + read_len, DMU_READ_PREFETCH);
} }
mutex_exit(&spa->spa_history_lock); mutex_exit(&spa->spa_history_lock);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -425,7 +425,6 @@ spa_add(const char *name, const char *altroot)
spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
@ -434,7 +433,6 @@ spa_add(const char *name, const char *altroot)
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
@ -508,12 +506,10 @@ spa_remove(spa_t *spa)
spa_config_lock_destroy(spa); spa_config_lock_destroy(spa);
cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_async_cv);
cv_destroy(&spa->spa_async_root_cv);
cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_suspend_cv);
mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_async_root_lock);
mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_errlog_lock);
mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlist_lock);


@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -116,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
if (merge_before && merge_after) { if (merge_before && merge_after) {
avl_remove(&sm->sm_root, ss_before); avl_remove(&sm->sm_root, ss_before);
if (sm->sm_pp_root) {
avl_remove(sm->sm_pp_root, ss_before);
avl_remove(sm->sm_pp_root, ss_after);
}
ss_after->ss_start = ss_before->ss_start; ss_after->ss_start = ss_before->ss_start;
kmem_free(ss_before, sizeof (*ss_before)); kmem_free(ss_before, sizeof (*ss_before));
ss = ss_after;
} else if (merge_before) { } else if (merge_before) {
ss_before->ss_end = end; ss_before->ss_end = end;
if (sm->sm_pp_root)
avl_remove(sm->sm_pp_root, ss_before);
ss = ss_before;
} else if (merge_after) { } else if (merge_after) {
ss_after->ss_start = start; ss_after->ss_start = start;
if (sm->sm_pp_root)
avl_remove(sm->sm_pp_root, ss_after);
ss = ss_after;
} else { } else {
ss = kmem_alloc(sizeof (*ss), KM_SLEEP); ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
ss->ss_start = start; ss->ss_start = start;
@ -129,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
avl_insert(&sm->sm_root, ss, where); avl_insert(&sm->sm_root, ss, where);
} }
if (sm->sm_pp_root)
avl_add(sm->sm_pp_root, ss);
sm->sm_space += size; sm->sm_space += size;
} }
@ -163,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
left_over = (ss->ss_start != start); left_over = (ss->ss_start != start);
right_over = (ss->ss_end != end); right_over = (ss->ss_end != end);
if (sm->sm_pp_root)
avl_remove(sm->sm_pp_root, ss);
if (left_over && right_over) { if (left_over && right_over) {
newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
newseg->ss_start = end; newseg->ss_start = end;
newseg->ss_end = ss->ss_end; newseg->ss_end = ss->ss_end;
ss->ss_end = start; ss->ss_end = start;
avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
if (sm->sm_pp_root)
avl_add(sm->sm_pp_root, newseg);
} else if (left_over) { } else if (left_over) {
ss->ss_end = start; ss->ss_end = start;
} else if (right_over) { } else if (right_over) {
@ -176,8 +195,12 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
} else { } else {
avl_remove(&sm->sm_root, ss); avl_remove(&sm->sm_root, ss);
kmem_free(ss, sizeof (*ss)); kmem_free(ss, sizeof (*ss));
ss = NULL;
} }
if (sm->sm_pp_root && ss != NULL)
avl_add(sm->sm_pp_root, ss);
sm->sm_space -= size; sm->sm_space -= size;
} }
@ -288,7 +311,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
smo->smo_object, offset, size); smo->smo_object, offset, size);
mutex_exit(sm->sm_lock); mutex_exit(sm->sm_lock);
error = dmu_read(os, smo->smo_object, offset, size, entry_map); error = dmu_read(os, smo->smo_object, offset, size, entry_map,
DMU_READ_PREFETCH);
mutex_enter(sm->sm_lock); mutex_enter(sm->sm_lock);
if (error != 0) if (error != 0)
break; break;
@ -341,6 +365,15 @@ space_map_unload(space_map_t *sm)
space_map_vacate(sm, NULL, NULL); space_map_vacate(sm, NULL, NULL);
} }
uint64_t
space_map_maxsize(space_map_t *sm)
{
if (sm->sm_loaded && sm->sm_ops != NULL)
return (sm->sm_ops->smop_max(sm));
else
return (-1ULL);
}
uint64_t uint64_t
space_map_alloc(space_map_t *sm, uint64_t size) space_map_alloc(space_map_t *sm, uint64_t size)
{ {


@ -39,6 +39,7 @@
#include <sys/zap.h> #include <sys/zap.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/zil.h>
/* /*
* Virtual device management. * Virtual device management.
@ -83,9 +84,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
{ {
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize; uint64_t csize;
uint64_t c;
for (c = 0; c < vd->vdev_children; c++) { for (int c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize); csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
asize = MAX(asize, csize); asize = MAX(asize, csize);
} }
@ -94,40 +94,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
} }
/* /*
* Get the replaceable or attachable device size. * Get the minimum allocatable size. We define the allocatable size as
* If the parent is a mirror or raidz, the replaceable size is the minimum * the vdev's asize rounded to the nearest metaslab. This allows us to
* psize of all its children. For the rest, just return our own psize. * replace or attach devices which don't have the same physical size but
* * can still satisfy the same number of allocations.
* e.g.
* psize rsize
* root - -
* mirror/raidz - -
* disk1 20g 20g
* disk2 40g 20g
* disk3 80g 80g
*/ */
uint64_t uint64_t
vdev_get_rsize(vdev_t *vd) vdev_get_min_asize(vdev_t *vd)
{ {
vdev_t *pvd, *cvd; vdev_t *pvd = vd->vdev_parent;
uint64_t c, rsize;
pvd = vd->vdev_parent;
/* /*
* If our parent is NULL or the root, just return our own psize. * If our parent is NULL (inactive spare or cache) or is the root,
* just return our own asize.
*/ */
if (pvd == NULL || pvd->vdev_parent == NULL) if (pvd == NULL)
return (vd->vdev_psize); return (vd->vdev_asize);
rsize = 0; /*
* The top-level vdev just returns the allocatable size rounded
* to the nearest metaslab.
*/
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
for (c = 0; c < pvd->vdev_children; c++) { /*
cvd = pvd->vdev_child[c]; * The allocatable space for a raidz vdev is N * sizeof(smallest child),
rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; * so each child must provide at least 1/Nth of its asize.
} */
if (pvd->vdev_ops == &vdev_raidz_ops)
return (pvd->vdev_min_asize / pvd->vdev_children);
return (rsize); return (pvd->vdev_min_asize);
}
void
vdev_set_min_asize(vdev_t *vd)
{
vd->vdev_min_asize = vdev_get_min_asize(vd);
for (int c = 0; c < vd->vdev_children; c++)
vdev_set_min_asize(vd->vdev_child[c]);
} }
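/*
 * Hypothetical worked example of the rules above: a raidz top-level vdev
 * with vdev_asize = 4T, 16G metaslabs and 4 children has
 * min_asize = P2ALIGN(4T, 16G) = 4T, so each child must keep providing
 * at least 4T / 4 = 1T of asize to remain attachable.
 */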
vdev_t * vdev_t *
@ -148,13 +155,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
vdev_t * vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{ {
int c;
vdev_t *mvd; vdev_t *mvd;
if (vd->vdev_guid == guid) if (vd->vdev_guid == guid)
return (vd); return (vd);
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
NULL) NULL)
return (mvd); return (mvd);
@ -250,17 +256,17 @@ vdev_compact_children(vdev_t *pvd)
{ {
vdev_t **newchild, *cvd; vdev_t **newchild, *cvd;
int oldc = pvd->vdev_children; int oldc = pvd->vdev_children;
int newc, c; int newc;
ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
for (c = newc = 0; c < oldc; c++) for (int c = newc = 0; c < oldc; c++)
if (pvd->vdev_child[c]) if (pvd->vdev_child[c])
newc++; newc++;
newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
for (c = newc = 0; c < oldc; c++) { for (int c = newc = 0; c < oldc; c++) {
if ((cvd = pvd->vdev_child[c]) != NULL) { if ((cvd = pvd->vdev_child[c]) != NULL) {
newchild[newc] = cvd; newchild[newc] = cvd;
cvd->vdev_id = newc++; cvd->vdev_id = newc++;
@ -372,6 +378,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
} else if (alloctype == VDEV_ALLOC_L2CACHE) { } else if (alloctype == VDEV_ALLOC_L2CACHE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL); return (EINVAL);
} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
} }
/* /*
@ -435,6 +444,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
&vd->vdev_physpath) == 0) &vd->vdev_physpath) == 0)
vd->vdev_physpath = spa_strdup(vd->vdev_physpath); vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
vd->vdev_fru = spa_strdup(vd->vdev_fru);
/* /*
* Set the whole_disk property. If it's not specified, leave the value * Set the whole_disk property. If it's not specified, leave the value
@ -448,9 +459,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
* Look for the 'not present' flag. This will only be set if the device * Look for the 'not present' flag. This will only be set if the device
* was not present at the time of import. * was not present at the time of import.
*/ */
if (!spa->spa_import_faulted) (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present);
&vd->vdev_not_present);
/* /*
* Get the alignment requirement. * Get the alignment requirement.
@ -473,13 +483,23 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
* If we're a leaf vdev, try to load the DTL object and other state. * If we're a leaf vdev, try to load the DTL object and other state.
*/ */
if (vd->vdev_ops->vdev_op_leaf && if (vd->vdev_ops->vdev_op_leaf &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) { if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
&vd->vdev_dtl_smo.smo_object); &vd->vdev_dtl_smo.smo_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare); &vd->vdev_unspare);
} }
if (alloctype == VDEV_ALLOC_ROOTPOOL) {
uint64_t spare = 0;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
&spare) == 0 && spare)
spa_spare_add(vd);
}
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline); &vd->vdev_offline);
@ -511,7 +531,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
void void
vdev_free(vdev_t *vd) vdev_free(vdev_t *vd)
{ {
int c;
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
/* /*
@ -525,7 +544,7 @@ vdev_free(vdev_t *vd)
/* /*
* Free all children. * Free all children.
*/ */
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
vdev_free(vd->vdev_child[c]); vdev_free(vd->vdev_child[c]);
ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_child == NULL);
@ -560,6 +579,8 @@ vdev_free(vdev_t *vd)
spa_strfree(vd->vdev_devid); spa_strfree(vd->vdev_devid);
if (vd->vdev_physpath) if (vd->vdev_physpath)
spa_strfree(vd->vdev_physpath); spa_strfree(vd->vdev_physpath);
if (vd->vdev_fru)
spa_strfree(vd->vdev_fru);
if (vd->vdev_isspare) if (vd->vdev_isspare)
spa_spare_remove(vd); spa_spare_remove(vd);
@ -653,14 +674,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
static void static void
vdev_top_update(vdev_t *tvd, vdev_t *vd) vdev_top_update(vdev_t *tvd, vdev_t *vd)
{ {
int c;
if (vd == NULL) if (vd == NULL)
return; return;
vd->vdev_top = tvd; vd->vdev_top = tvd;
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
vdev_top_update(tvd, vd->vdev_child[c]); vdev_top_update(tvd, vd->vdev_child[c]);
} }
@ -679,6 +698,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_asize = cvd->vdev_asize;
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_ashift = cvd->vdev_ashift;
mvd->vdev_state = cvd->vdev_state; mvd->vdev_state = cvd->vdev_state;
@ -751,6 +771,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ if (vd->vdev_ms_shift == 0) /* not being allocated from yet */
return (0); return (0);
/*
* Compute the raidz-deflation ratio. Note, we hard-code
* in 128k (1 << 17) because it is the current "typical" blocksize.
* Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
* or we will inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
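/*
 * Hypothetical worked example (not part of this change): on a 5-wide
 * raidz1 with 512-byte sectors, a 128K psize occupies 256 data + 64
 * parity sectors = 160K of asize, so vdev_deflate_ratio =
 * 131072 / (163840 >> 9) = 409, versus 512 for a plain top-level vdev.
 */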
ASSERT(oldc <= newc); ASSERT(oldc <= newc);
if (vd->vdev_islog) if (vd->vdev_islog)
@ -776,7 +805,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
if (txg == 0) { if (txg == 0) {
uint64_t object = 0; uint64_t object = 0;
error = dmu_read(mos, vd->vdev_ms_array, error = dmu_read(mos, vd->vdev_ms_array,
m * sizeof (uint64_t), sizeof (uint64_t), &object); m * sizeof (uint64_t), sizeof (uint64_t), &object,
DMU_READ_PREFETCH);
if (error) if (error)
return (error); return (error);
if (object != 0) { if (object != 0) {
@ -903,7 +933,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_DONT_RETRY; ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/* /*
@ -950,8 +980,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (int l = 1; l < VDEV_LABELS; l++) { for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd, zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l, vdev_label_offset(vd->vdev_psize, l,
offsetof(vdev_label_t, vl_pad)), offsetof(vdev_label_t, vl_pad2)),
VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
} }
@ -971,7 +1001,6 @@ vdev_open(vdev_t *vd)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
int error; int error;
int c;
uint64_t osize = 0; uint64_t osize = 0;
uint64_t asize, psize; uint64_t asize, psize;
uint64_t ashift = 0; uint64_t ashift = 0;
@ -983,6 +1012,9 @@ vdev_open(vdev_t *vd)
vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_state == VDEV_STATE_OFFLINE);
vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
vd->vdev_min_asize = vdev_get_min_asize(vd);
if (!vd->vdev_removed && vd->vdev_faulted) { if (!vd->vdev_removed && vd->vdev_faulted) {
ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_children == 0);
@ -998,7 +1030,7 @@ vdev_open(vdev_t *vd)
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
if (zio_injection_enabled && error == 0) if (zio_injection_enabled && error == 0)
error = zio_handle_device_injection(vd, ENXIO); error = zio_handle_device_injection(vd, NULL, ENXIO);
if (error) { if (error) {
if (vd->vdev_removed && if (vd->vdev_removed &&
@ -1020,12 +1052,13 @@ vdev_open(vdev_t *vd)
vd->vdev_state = VDEV_STATE_HEALTHY; vd->vdev_state = VDEV_STATE_HEALTHY;
} }
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
VDEV_AUX_NONE); VDEV_AUX_NONE);
break; break;
} }
}
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
@ -1050,6 +1083,15 @@ vdev_open(vdev_t *vd)
vd->vdev_psize = psize; vd->vdev_psize = psize;
/*
* Make sure the allocatable size hasn't shrunk.
*/
if (asize < vd->vdev_min_asize) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
if (vd->vdev_asize == 0) { if (vd->vdev_asize == 0) {
/* /*
* This is the first-ever open, so use the computed values. * This is the first-ever open, so use the computed values.
@ -1066,26 +1108,19 @@ vdev_open(vdev_t *vd)
VDEV_AUX_BAD_LABEL); VDEV_AUX_BAD_LABEL);
return (EINVAL); return (EINVAL);
} }
/*
* Make sure the device hasn't shrunk.
*/
if (asize < vd->vdev_asize) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
/*
* If all children are healthy and the asize has increased,
* then we've experienced dynamic LUN growth.
*/
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
asize > vd->vdev_asize) {
vd->vdev_asize = asize;
}
} }
/*
* If all children are healthy and the asize has increased,
* then we've experienced dynamic LUN growth. If automatic
* expansion is enabled then use the additional space.
*/
if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
(vd->vdev_expanding || spa->spa_autoexpand))
vd->vdev_asize = asize;
vdev_set_min_asize(vd);
/* /*
* Ensure we can issue some IO before declaring the * Ensure we can issue some IO before declaring the
* vdev open for business. * vdev open for business.
@ -1097,18 +1132,6 @@ vdev_open(vdev_t *vd)
return (error); return (error);
} }
/*
* If this is a top-level vdev, compute the raidz-deflation
* ratio. Note, we hard-code in 128k (1<<17) because it is the
* current "typical" blocksize. Even if SPA_MAXBLOCKSIZE
* changes, this algorithm must never change, or we will
* inconsistently account for existing bp's.
*/
if (vd->vdev_top == vd) {
vd->vdev_deflate_ratio = (1<<17) /
(vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
}
/* /*
* If a leaf vdev has a DTL, and seems healthy, then kick off a * If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a scrub, * resilver. But don't do this if we are doing a reopen for a scrub,
@ -1135,12 +1158,11 @@ int
vdev_validate(vdev_t *vd) vdev_validate(vdev_t *vd)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
int c;
nvlist_t *label; nvlist_t *label;
uint64_t guid, top_guid; uint64_t guid, top_guid;
uint64_t state; uint64_t state;
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
if (vdev_validate(vd->vdev_child[c]) != 0) if (vdev_validate(vd->vdev_child[c]) != 0)
return (EBADF); return (EBADF);
@ -1226,7 +1248,7 @@ vdev_close(vdev_t *vd)
vdev_cache_purge(vd); vdev_cache_purge(vd);
/* /*
* We record the previous state before we close it, so that if we are * We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that * doing a reopen(), we don't generate FMA ereports if we notice that
* it's still faulted. * it's still faulted.
*/ */
@ -1257,12 +1279,9 @@ vdev_reopen(vdev_t *vd)
if (vd->vdev_aux) { if (vd->vdev_aux) {
(void) vdev_validate_aux(vd); (void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) && if (vdev_readable(vd) && vdev_writeable(vd) &&
!l2arc_vdev_present(vd)) { vd->vdev_aux == &spa->spa_l2cache &&
uint64_t size = vdev_get_rsize(vd); !l2arc_vdev_present(vd))
l2arc_add_vdev(spa, vd, l2arc_add_vdev(spa, vd);
VDEV_LABEL_START_SIZE,
size - VDEV_LABEL_START_SIZE);
}
} else { } else {
(void) vdev_validate(vd); (void) vdev_validate(vd);
} }
@ -1302,26 +1321,14 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
return (0); return (0);
} }
/*
* This is the latter half of vdev_create(). It is distinct because it
* involves initiating transactions in order to do metaslab creation.
* For creation, we want to try to create all vdevs at once and then undo it
* if anything fails; this is much harder if we have pending transactions.
*/
void void
vdev_init(vdev_t *vd, uint64_t txg) vdev_metaslab_set_size(vdev_t *vd)
{ {
/* /*
* Aim for roughly 200 metaslabs per vdev. * Aim for roughly 200 metaslabs per vdev.
*/ */
vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
/*
* Initialize the vdev's metaslabs. This can't fail because
* there's nothing to read when creating all new metaslabs.
*/
VERIFY(vdev_metaslab_init(vd, txg) == 0);
} }
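Because highbit() rounds the ~200 target down to a power-of-two shift, the real metaslab count lands between 100 and 200. A hedged standalone sketch with a hypothetical 1 TiB vdev, with highbit() reimplemented here just for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define SPA_MAXBLOCKSHIFT 17

    /* 1-based index of the highest set bit, like the kernel's highbit() */
    static int
    highbit(uint64_t v)
    {
        int h = 0;

        while (v != 0) {
            h++;
            v >>= 1;
        }
        return (h);
    }

    int
    main(void)
    {
        uint64_t asize = 1ULL << 40;         /* hypothetical 1 TiB vdev */
        int ms_shift = highbit(asize / 200); /* aim for ~200 metaslabs */

        if (ms_shift < SPA_MAXBLOCKSHIFT)    /* the MAX() in the diff */
            ms_shift = SPA_MAXBLOCKSHIFT;
        printf("ms_shift %d -> %llu metaslabs of %llu MB\n", ms_shift,
            (unsigned long long)(asize >> ms_shift),
            (unsigned long long)((1ULL << ms_shift) >> 20));  /* 128 x 8192 MB */
        return (0);
    }

The SPA_MAXBLOCKSHIFT floor only bites on very small vdevs, where a metaslab must still hold at least one maximum-sized block.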
void void
@ -1879,7 +1886,7 @@ vdev_degrade(spa_t *spa, uint64_t guid)
int int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{ {
vdev_t *vd; vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
spa_vdev_state_enter(spa); spa_vdev_state_enter(spa);
@ -1889,13 +1896,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if (!vd->vdev_ops->vdev_op_leaf) if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE; vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE; vd->vdev_tmpoffline = B_FALSE;
vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
vdev_reopen(vd->vdev_top);
/* XXX - L2ARC 1.0 does not support expansion */
if (!vd->vdev_aux) {
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
}
vdev_reopen(tvd);
vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
if (!vd->vdev_aux) {
for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
pvd->vdev_expanding = B_FALSE;
}
if (newstate) if (newstate)
*newstate = vd->vdev_state; *newstate = vd->vdev_state;
if ((flags & ZFS_ONLINE_UNSPARE) && if ((flags & ZFS_ONLINE_UNSPARE) &&
@ -1904,13 +1924,21 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
vd->vdev_parent->vdev_child[0] == vd) vd->vdev_parent->vdev_child[0] == vd)
vd->vdev_unspare = B_TRUE; vd->vdev_unspare = B_TRUE;
if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
/* XXX - L2ARC 1.0 does not support expansion */
if (vd->vdev_aux)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
return (spa_vdev_state_exit(spa, vd, 0)); return (spa_vdev_state_exit(spa, vd, 0));
} }
int int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{ {
vdev_t *vd; vdev_t *vd, *tvd;
int error;
spa_vdev_state_enter(spa); spa_vdev_state_enter(spa);
@ -1920,34 +1948,58 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
if (!vd->vdev_ops->vdev_op_leaf) if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
/* /*
* If the device isn't already offline, try to offline it. * If the device isn't already offline, try to offline it.
*/ */
if (!vd->vdev_offline) { if (!vd->vdev_offline) {
/* /*
* If this device has the only valid copy of some data, * If this device has the only valid copy of some data,
* don't allow it to be offlined. * don't allow it to be offlined. Log devices are always
* expendable.
*/ */
if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, EBUSY)); return (spa_vdev_state_exit(spa, NULL, EBUSY));
/* /*
* Offline this device and reopen its top-level vdev. * Offline this device and reopen its top-level vdev.
* If this action results in the top-level vdev becoming * If the top-level vdev is a log device then just offline
* unusable, undo it and fail the request. * it. Otherwise, if this action results in the top-level
* vdev becoming unusable, undo it and fail the request.
*/ */
vd->vdev_offline = B_TRUE; vd->vdev_offline = B_TRUE;
vdev_reopen(vd->vdev_top); vdev_reopen(tvd);
if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) {
if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
vdev_is_dead(tvd)) {
vd->vdev_offline = B_FALSE; vd->vdev_offline = B_FALSE;
vdev_reopen(vd->vdev_top); vdev_reopen(tvd);
return (spa_vdev_state_exit(spa, NULL, EBUSY)); return (spa_vdev_state_exit(spa, NULL, EBUSY));
} }
} }
vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
return (spa_vdev_state_exit(spa, vd, 0)); if (!tvd->vdev_islog || !vdev_is_dead(tvd))
return (spa_vdev_state_exit(spa, vd, 0));
(void) spa_vdev_state_exit(spa, vd, 0);
error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
NULL, DS_FIND_CHILDREN);
if (error) {
(void) vdev_online(spa, guid, 0, NULL);
return (error);
}
/*
* If we successfully offlined the log device then we need to
* sync out the current txg so that the "stubby" block can be
* removed by zil_sync().
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
return (0);
} }
/* /*
@ -2062,7 +2114,9 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state; vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_rsize(vd); vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
mutex_exit(&vd->vdev_stat_lock); mutex_exit(&vd->vdev_stat_lock);
/* /*
@ -2155,14 +2209,24 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (flags & ZIO_FLAG_SPECULATIVE) if (flags & ZIO_FLAG_SPECULATIVE)
return; return;
/*
* If this is an I/O error that is going to be retried, then ignore the
* error. Otherwise, the user may interpret B_FAILFAST I/O errors as
* hard errors, when in reality they can happen for any number of
* innocuous reasons (bus resets, MPxIO link failure, etc).
*/
if (zio->io_error == EIO &&
!(zio->io_flags & ZIO_FLAG_IO_RETRY))
return;
mutex_enter(&vd->vdev_stat_lock); mutex_enter(&vd->vdev_stat_lock);
if (type == ZIO_TYPE_READ) { if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
if (zio->io_error == ECKSUM) if (zio->io_error == ECKSUM)
vs->vs_checksum_errors++; vs->vs_checksum_errors++;
else else
vs->vs_read_errors++; vs->vs_read_errors++;
} }
if (type == ZIO_TYPE_WRITE) if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
vs->vs_write_errors++; vs->vs_write_errors++;
mutex_exit(&vd->vdev_stat_lock); mutex_exit(&vd->vdev_stat_lock);
@ -2205,10 +2269,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
void void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{ {
int c;
vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_t *vs = &vd->vdev_stat;
for (c = 0; c < vd->vdev_children; c++) for (int c = 0; c < vd->vdev_children; c++)
vdev_scrub_stat_update(vd->vdev_child[c], type, complete); vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
mutex_enter(&vd->vdev_stat_lock); mutex_enter(&vd->vdev_stat_lock);
@ -2252,6 +2315,7 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
* children's, thus not accurate enough for us. * children's, thus not accurate enough for us.
*/ */
ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
vd->vdev_deflate_ratio; vd->vdev_deflate_ratio;
@ -2293,8 +2357,8 @@ vdev_config_dirty(vdev_t *vd)
int c; int c;
/* /*
* If this is an aux vdev (as with l2cache devices), then we update the * If this is an aux vdev (as with l2cache and spare devices), then we
* vdev config manually and set the sync flag. * update the vdev config manually and set the sync flag.
*/ */
if (vd->vdev_aux != NULL) { if (vd->vdev_aux != NULL) {
spa_aux_vdev_t *sav = vd->vdev_aux; spa_aux_vdev_t *sav = vd->vdev_aux;
@ -2316,8 +2380,11 @@ vdev_config_dirty(vdev_t *vd)
sav->sav_sync = B_TRUE; sav->sav_sync = B_TRUE;
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, if (nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
}
ASSERT(c < naux); ASSERT(c < naux);
@ -2415,11 +2482,10 @@ vdev_propagate_state(vdev_t *vd)
vdev_t *rvd = spa->spa_root_vdev; vdev_t *rvd = spa->spa_root_vdev;
int degraded = 0, faulted = 0; int degraded = 0, faulted = 0;
int corrupted = 0; int corrupted = 0;
int c;
vdev_t *child; vdev_t *child;
if (vd->vdev_children > 0) { if (vd->vdev_children > 0) {
for (c = 0; c < vd->vdev_children; c++) { for (int c = 0; c < vd->vdev_children; c++) {
child = vd->vdev_child[c]; child = vd->vdev_child[c];
if (!vdev_readable(child) || if (!vdev_readable(child) ||
@ -2523,7 +2589,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
* an error. * an error.
*/ */
if (spa->spa_load_state == SPA_LOAD_IMPORT && if (spa->spa_load_state == SPA_LOAD_IMPORT &&
!spa->spa_import_faulted &&
vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_leaf)
vd->vdev_not_present = 1; vd->vdev_not_present = 1;
@ -2582,8 +2647,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
vd->vdev_removed = B_FALSE; vd->vdev_removed = B_FALSE;
} }
if (!isopen) if (!isopen && vd->vdev_parent)
vdev_propagate_state(vd); vdev_propagate_state(vd->vdev_parent);
} }
/* /*
@ -2595,8 +2660,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
boolean_t boolean_t
vdev_is_bootable(vdev_t *vd) vdev_is_bootable(vdev_t *vd)
{ {
int c;
if (!vd->vdev_ops->vdev_op_leaf) { if (!vd->vdev_ops->vdev_op_leaf) {
char *vdev_type = vd->vdev_ops->vdev_op_type; char *vdev_type = vd->vdev_ops->vdev_op_type;
@ -2611,9 +2674,53 @@ vdev_is_bootable(vdev_t *vd)
return (B_FALSE); return (B_FALSE);
} }
for (c = 0; c < vd->vdev_children; c++) { for (int c = 0; c < vd->vdev_children; c++) {
if (!vdev_is_bootable(vd->vdev_child[c])) if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE); return (B_FALSE);
} }
return (B_TRUE); return (B_TRUE);
} }
void
vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
{
uint_t children;
nvlist_t **child;
uint64_t val;
spa_t *spa = vd->vdev_spa;
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (int c = 0; c < children; c++)
vdev_load_log_state(vd->vdev_child[c], child[c]);
}
if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
/*
* It would be nice to call vdev_offline()
* directly but the pool isn't fully loaded and
* the txg threads have not been started yet.
*/
spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
vd->vdev_offline = val;
vdev_reopen(vd->vdev_top);
spa_config_exit(spa, SCL_STATE_ALL, FTAG);
}
}
/*
* Expand a vdev if possible.
*/
void
vdev_expand(vdev_t *vd, uint64_t txg)
{
ASSERT(vd->vdev_top == vd);
ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
}
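Since ms_shift is fixed for the life of the vdev, expansion reduces to a count comparison. A standalone sketch with hypothetical sizes shows the trigger condition:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Hypothetical numbers, mirroring the check in vdev_expand() */
        int ms_shift = 33;            /* 8 GB metaslabs */
        uint64_t ms_count = 128;      /* metaslabs before the LUN grew */
        uint64_t asize = 2ULL << 40;  /* device expanded to 2 TB */

        if ((asize >> ms_shift) > ms_count)
            printf("expand: initialize metaslabs %llu..%llu\n",
                (unsigned long long)ms_count,
                (unsigned long long)((asize >> ms_shift) - 1));
        return (0);
    }

Only the newly exposed tail of the device gets fresh metaslabs; the existing ones are untouched, which is what makes online growth cheap.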

View File

@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
vd->vdev_physpath) == 0); vd->vdev_physpath) == 0);
if (vd->vdev_fru != NULL)
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
vd->vdev_fru) == 0);
if (vd->vdev_nparity != 0) { if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type, ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
VDEV_TYPE_RAIDZ) == 0); VDEV_TYPE_RAIDZ) == 0);
@ -335,8 +339,8 @@ vdev_label_read_config(vdev_t *vd)
nvlist_t *config = NULL; nvlist_t *config = NULL;
vdev_phys_t *vp; vdev_phys_t *vp;
zio_t *zio; zio_t *zio;
int flags = int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ZIO_FLAG_SPECULATIVE;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@ -345,6 +349,7 @@ vdev_label_read_config(vdev_t *vd)
vp = zio_buf_alloc(sizeof (vdev_phys_t)); vp = zio_buf_alloc(sizeof (vdev_phys_t));
retry:
for (int l = 0; l < VDEV_LABELS; l++) { for (int l = 0; l < VDEV_LABELS; l++) {
zio = zio_root(spa, NULL, NULL, flags); zio = zio_root(spa, NULL, NULL, flags);
@ -364,6 +369,11 @@ vdev_label_read_config(vdev_t *vd)
} }
} }
if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
zio_buf_free(vp, sizeof (vdev_phys_t)); zio_buf_free(vp, sizeof (vdev_phys_t));
return (config); return (config);
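The read path above, and the label write path later in this diff, both adopt a two-pass idiom: fail fast first, then retry with ZIO_FLAG_TRYHARD before giving up. A minimal standalone rendering of that idiom, with the zio machinery stubbed out and flag values invented purely for illustration:

    #include <stdio.h>

    #define ZIO_FLAG_CANFAIL 0x1   /* flag values illustrative only */
    #define ZIO_FLAG_TRYHARD 0x2

    /* Stand-in for the label I/O; pretend it succeeds only when trying hard. */
    static int
    do_label_io(int flags)
    {
        return ((flags & ZIO_FLAG_TRYHARD) ? 0 : -1);
    }

    int
    main(void)
    {
        int flags = ZIO_FLAG_CANFAIL;
        int error;
    retry:
        error = do_label_io(flags);  /* first pass fails fast */
        if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
            flags |= ZIO_FLAG_TRYHARD;  /* second pass retries hard */
            goto retry;
        }
        printf("error = %d\n", error);
        return (0);
    }

The fast first pass keeps a flaky disk from stalling the sync path; the hard second pass keeps one transient error from faulting the pool.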
@ -488,7 +498,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
nvlist_t *label; nvlist_t *label;
vdev_phys_t *vp; vdev_phys_t *vp;
vdev_boot_header_t *vb; char *pad2;
uberblock_t *ub; uberblock_t *ub;
zio_t *zio; zio_t *zio;
char *buf; char *buf;
@ -629,16 +639,6 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
return (error == EFAULT ? ENAMETOOLONG : EINVAL); return (error == EFAULT ? ENAMETOOLONG : EINVAL);
} }
/*
* Initialize boot block header.
*/
vb = zio_buf_alloc(sizeof (vdev_boot_header_t));
bzero(vb, sizeof (vdev_boot_header_t));
vb->vb_magic = VDEV_BOOT_MAGIC;
vb->vb_version = VDEV_BOOT_VERSION;
vb->vb_offset = VDEV_BOOT_OFFSET;
vb->vb_size = VDEV_BOOT_SIZE;
/* /*
* Initialize uberblock template. * Initialize uberblock template.
*/ */
@ -647,9 +647,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
*ub = spa->spa_uberblock; *ub = spa->spa_uberblock;
ub->ub_txg = 0; ub->ub_txg = 0;
/* Initialize the 2nd padding area. */
pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
bzero(pad2, VDEV_PAD_SIZE);
/* /*
* Write everything in parallel. * Write everything in parallel.
*/ */
retry:
zio = zio_root(spa, NULL, NULL, flags); zio = zio_root(spa, NULL, NULL, flags);
for (int l = 0; l < VDEV_LABELS; l++) { for (int l = 0; l < VDEV_LABELS; l++) {
@ -658,9 +663,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
offsetof(vdev_label_t, vl_vdev_phys), offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags); sizeof (vdev_phys_t), NULL, NULL, flags);
vdev_label_write(zio, vd, l, vb, /*
offsetof(vdev_label_t, vl_boot_header), * Skip the 1st padding area.
sizeof (vdev_boot_header_t), NULL, NULL, flags); * Zero out the 2nd padding area, which might contain
* leftover data from a previous filesystem format.
*/
vdev_label_write(zio, vd, l, pad2,
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_write(zio, vd, l, ub, vdev_label_write(zio, vd, l, ub,
@ -671,9 +681,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = zio_wait(zio); error = zio_wait(zio);
if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
flags |= ZIO_FLAG_TRYHARD;
goto retry;
}
nvlist_free(label); nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE);
zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd));
zio_buf_free(vb, sizeof (vdev_boot_header_t));
zio_buf_free(vp, sizeof (vdev_phys_t)); zio_buf_free(vp, sizeof (vdev_phys_t));
/* /*
@ -757,8 +772,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
{ {
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev; vdev_t *rvd = spa->spa_root_vdev;
int flags = int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
if (vd == rvd) { if (vd == rvd) {
ASSERT(zio == NULL); ASSERT(zio == NULL);
@ -996,7 +1011,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
* at any time, you can just call it again, and it will resume its work. * at any time, you can just call it again, and it will resume its work.
*/ */
int int
vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
{ {
spa_t *spa = svd[0]->vdev_spa; spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock; uberblock_t *ub = &spa->spa_uberblock;
@ -1005,6 +1020,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
int error; int error;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
/*
* Normally, we don't want to try too hard to write every label and
* uberblock. If there is a flaky disk, we don't want the rest of the
* sync process to block while we retry. But if we can't write a
* single label out, we should retry with ZIO_FLAG_TRYHARD before
* bailing out and declaring the pool faulted.
*/
if (tryhard)
flags |= ZIO_FLAG_TRYHARD;
ASSERT(ub->ub_txg <= txg); ASSERT(ub->ub_txg <= txg);
/* /*

View File

@ -48,10 +48,11 @@ int zfs_vdev_time_shift = 6;
int zfs_vdev_ramp_rate = 2; int zfs_vdev_ramp_rate = 2;
/* /*
* i/os will be aggregated into a single large i/o up to * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
* zfs_vdev_aggregation_limit bytes long. * For read i/os, we also aggregate across small adjacency gaps.
*/ */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
/* /*
* Virtual device vector for disk I/O scheduling. * Virtual device vector for disk I/O scheduling.
@ -159,16 +160,23 @@ vdev_queue_agg_io_done(zio_t *aio)
zio_buf_free(aio->io_data, aio->io_size); zio_buf_free(aio->io_data, aio->io_size);
} }
#define IS_ADJACENT(io, nio) \ /*
((io)->io_offset + (io)->io_size == (nio)->io_offset) * Compute the range spanned by two i/os, which is the endpoint of the last
* (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
* Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
* thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
*/
#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
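To see the macro algebra concretely, here is a standalone check with hypothetical offsets: two 4K reads at 0 and 8K span 12K with a 4K gap, well under the 32K read gap limit, so they remain aggregation candidates.

    #include <assert.h>
    #include <stdint.h>

    /* Only the two fields the macros touch; everything else is stubbed. */
    typedef struct { uint64_t io_offset; uint64_t io_size; } zio_t;

    #define IO_SPAN(fio, lio) \
        ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
    #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))

    int
    main(void)
    {
        zio_t fio = { 0, 4096 };
        zio_t lio = { 8192, 4096 };
        zio_t adj = { 4096, 4096 };

        assert(IO_SPAN(&fio, &lio) == 12288); /* end of lio - start of fio */
        assert(IO_GAP(&fio, &lio) == 4096);   /* 8192 - (0 + 4096) */
        assert(IO_GAP(&fio, &adj) == 0);      /* adjacent iff gap is zero */
        return (0);
    }

A nice side effect of the unsigned arithmetic: an overlapping pair yields a wrapped, enormous "gap", so it safely fails the <= maxgap test in the aggregation loop.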
static zio_t * static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{ {
zio_t *fio, *lio, *aio, *dio, *nio; zio_t *fio, *lio, *aio, *dio, *nio;
avl_tree_t *t; avl_tree_t *t;
uint64_t size;
int flags; int flags;
uint64_t maxspan = zfs_vdev_aggregation_limit;
uint64_t maxgap;
ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT(MUTEX_HELD(&vq->vq_lock));
@ -179,8 +187,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
fio = lio = avl_first(&vq->vq_deadline_tree); fio = lio = avl_first(&vq->vq_deadline_tree);
t = fio->io_vdev_tree; t = fio->io_vdev_tree;
size = fio->io_size;
flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
/* /*
@ -191,22 +199,18 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
* scrub/resilver, can be preserved in the aggregate. * scrub/resilver, can be preserved in the aggregate.
*/ */
while ((dio = AVL_PREV(t, fio)) != NULL && while ((dio = AVL_PREV(t, fio)) != NULL &&
IS_ADJACENT(dio, fio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
size + dio->io_size <= zfs_vdev_aggregation_limit) { IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
fio = dio; fio = dio;
size += dio->io_size;
}
while ((dio = AVL_NEXT(t, lio)) != NULL && while ((dio = AVL_NEXT(t, lio)) != NULL &&
IS_ADJACENT(lio, dio) &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
size + dio->io_size <= zfs_vdev_aggregation_limit) { IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
lio = dio; lio = dio;
size += dio->io_size;
}
} }
if (fio != lio) { if (fio != lio) {
uint64_t size = IO_SPAN(fio, lio);
ASSERT(size <= zfs_vdev_aggregation_limit); ASSERT(size <= zfs_vdev_aggregation_limit);
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
@ -214,9 +218,10 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL); vdev_queue_agg_io_done, NULL);
/* We want to process lio, then stop */ nio = fio;
lio = AVL_NEXT(t, lio); do {
for (dio = fio; dio != lio; dio = nio) { dio = nio;
nio = AVL_NEXT(t, dio);
ASSERT(dio->io_type == aio->io_type); ASSERT(dio->io_type == aio->io_type);
ASSERT(dio->io_vdev_tree == t); ASSERT(dio->io_vdev_tree == t);
@ -224,13 +229,12 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
bcopy(dio->io_data, (char *)aio->io_data + bcopy(dio->io_data, (char *)aio->io_data +
(dio->io_offset - aio->io_offset), (dio->io_offset - aio->io_offset),
dio->io_size); dio->io_size);
nio = AVL_NEXT(t, dio);
zio_add_child(dio, aio); zio_add_child(dio, aio);
vdev_queue_io_remove(vq, dio); vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio); zio_vdev_io_bypass(dio);
zio_execute(dio); zio_execute(dio);
} } while (dio != lio);
avl_add(&vq->vq_pending_tree, aio); avl_add(&vq->vq_pending_tree, aio);

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -697,7 +697,7 @@ vdev_raidz_io_start(zio_t *zio)
continue; continue;
} }
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
(zio->io_flags & ZIO_FLAG_SCRUB)) { (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size, rc->rc_offset, rc->rc_data, rc->rc_size,
zio->io_type, zio->io_priority, 0, zio->io_type, zio->io_priority, 0,

View File

@ -19,13 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* This file contains the top half of the zfs directory structure * This file contains the top half of the zfs directory structure
* implementation. The bottom half is in zap_leaf.c. * implementation. The bottom half is in zap_leaf.c.
@ -45,6 +42,7 @@
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h> #include <sys/zap.h>
#include <sys/refcount.h> #include <sys/refcount.h>
#include <sys/zap_impl.h> #include <sys/zap_impl.h>
@ -1157,3 +1155,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
} }
} }
} }
int
fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
uint64_t *tooverwrite)
{
zap_t *zap = zn->zn_zap;
zap_leaf_t *l;
int err;
/*
* Account for the header block of the fatzap.
*/
if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
*tooverwrite += zap->zap_dbuf->db_size;
} else {
*towrite += zap->zap_dbuf->db_size;
}
/*
* Account for the pointer table blocks.
* If we are adding, we need to account for the following cases:
* - If the pointer table is embedded, this operation could force an
* external pointer table.
* - If the ZAP already has an external pointer table, this operation
* could extend the table.
*/
if (add) {
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
*towrite += zap->zap_dbuf->db_size;
else
*towrite += (zap->zap_dbuf->db_size * 3);
}
/*
* Now check whether the block containing the leaf is freeable
* and account accordingly.
*/
err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l);
if (err != 0) {
return (err);
}
if (!add && dmu_buf_freeable(l->l_dbuf)) {
*tooverwrite += l->l_dbuf->db_size;
} else {
/*
* If this is an add operation, the leaf block could split.
* Hence, we need to account for an additional leaf block.
*/
*towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
}
zap_put_leaf(l);
return (0);
}
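Read as pure arithmetic, the function charges at most one header block, a grown pointer table, and a possibly split leaf. A hedged userland model of just that accounting, with hypothetical 16K block sizes and the embedded-versus-external pointer-table distinction folded away for brevity:

    #include <stdio.h>
    #include <stdint.h>

    /* Userland model of fzap_count_write()'s accounting only. */
    static void
    count_write(int add, int freeable, uint64_t hdr_size, uint64_t leaf_size,
        uint64_t *towrite, uint64_t *tooverwrite)
    {
        if (!add && freeable)
            *tooverwrite += hdr_size;  /* header rewritten in place */
        else
            *towrite += hdr_size;

        if (add)
            *towrite += hdr_size * 3;  /* pointer table may grow */

        if (!add && freeable)
            *tooverwrite += leaf_size;
        else
            *towrite += (add ? 2 : 1) * leaf_size; /* add may split leaf */
    }

    int
    main(void)
    {
        uint64_t towrite = 0, tooverwrite = 0;

        count_write(1, 0, 16384, 16384, &towrite, &tooverwrite);
        printf("add: towrite=%llu tooverwrite=%llu\n",
            (unsigned long long)towrite,
            (unsigned long long)tooverwrite);  /* 98304 and 0 */
        return (0);
    }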

View File

@ -19,24 +19,23 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
/* /*
* The 512-byte leaf is broken into 32 16-byte chunks. * The 512-byte leaf is broken into 32 16-byte chunks.
* chunk number n means l_chunk[n], even though the header precedes it. * chunk number n means l_chunk[n], even though the header precedes it.
* the names are stored null-terminated. * the names are stored null-terminated.
*/ */
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h> #include <sys/zap.h>
#include <sys/zap_impl.h> #include <sys/zap_impl.h>
#include <sys/zap_leaf.h> #include <sys/zap_leaf.h>
#include <sys/spa.h>
#include <sys/dmu.h>
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);

View File

@ -19,12 +19,10 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/dmu.h> #include <sys/dmu.h>
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -78,8 +76,8 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm)
err = 0; err = 0;
(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL |
&err); U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err);
return (err); return (err);
} }
@ -1106,3 +1104,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
zap_unlockdir(zap); zap_unlockdir(zap);
return (0); return (0);
} }
int
zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
uint64_t *towrite, uint64_t *tooverwrite, uint64_t dn_datablkshift)
{
zap_t *zap;
int err = 0;
/*
* Since we don't have a name, we cannot figure out which blocks will
* be affected by this operation, so account for the worst case:
* - 3 blocks overwritten: target leaf, ptrtbl block, header block
* - 4 new blocks written if adding:
* - 2 blocks for possibly split leaves,
* - 2 grown ptrtbl blocks
*
* This also accommodates the case where an add operation to a fairly
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
return (err);
}
/*
* We lock the zap with adding == FALSE because, if we passed the
* actual value of add, it could trigger a mzap_upgrade(). At present
* we are just evaluating the possibility of this operation and hence
* do not want to trigger an upgrade.
*/
err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
if (err)
return (err);
if (!zap->zap_ismicro) {
zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn) {
err = fzap_count_write(zn, add, towrite,
tooverwrite);
zap_name_free(zn);
} else {
/*
* We treat this case the same as (name == NULL).
*/
*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
}
} else {
if (!add) {
if (dmu_buf_freeable(zap->zap_dbuf))
*tooverwrite += SPA_MAXBLOCKSIZE;
else
*towrite += SPA_MAXBLOCKSIZE;
} else {
/*
* We are here if we are adding and (name != NULL).
* It is hard to know whether this add will promote the
* microzap to a fatzap, so we assume the worst case and
* account for the blocks as if it would be promoted.
*
* 1 block overwritten : header block
* 4 new blocks written : 2 new split leaf, 2 grown
* ptrtbl blocks
*/
if (dmu_buf_freeable(zap->zap_dbuf))
*tooverwrite += 1 << dn_datablkshift;
else
*towrite += 1 << dn_datablkshift;
*towrite += 4 << dn_datablkshift;
}
}
zap_unlockdir(zap);
return (err);
}
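For the name == NULL path the charge is a constant: with SPA_MAXBLOCKSIZE at 128K, an add books (3 + 4) * 128K = 896K against towrite. A one-line standalone check of that figure:

    #include <stdio.h>

    #define SPA_MAXBLOCKSIZE (1 << 17)  /* 128K, the current maximum */

    int
    main(void)
    {
        int add = 1;

        /* 3 overwritten blocks, plus 4 newly written ones when adding */
        printf("worst case: %d bytes\n",
            (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE);  /* 917504 = 896K */
        return (0);
    }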

View File

@ -19,7 +19,7 @@
* CDDL HEADER END * CDDL HEADER END
*/ */
/* /*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
@ -65,15 +65,16 @@
ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ #define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) #define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
ACE_DELETE|ACE_DELETE_CHILD)
#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
@ -538,8 +539,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
* ACE FUIDs will be created later. * ACE FUIDs will be created later.
*/ */
int int
zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
zfs_ace_t *z_acl, int aclcnt, size_t *size) void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
zfs_fuid_info_t **fuidp, cred_t *cr)
{ {
int i; int i;
uint16_t entry_type; uint16_t entry_type;
@ -555,9 +557,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap,
entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
entry_type != ACE_EVERYONE) { entry_type != ACE_EVERYONE) {
if (!aclp->z_has_fuids) aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); cr, (entry_type == 0) ?
aceptr->z_fuid = (uint64_t)acep->a_who; ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
} }
/* /*
@ -682,7 +684,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
* convert old ACL format to new * convert old ACL format to new
*/ */
void void
zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
{ {
zfs_oldace_t *oldaclp; zfs_oldace_t *oldaclp;
int i; int i;
@ -714,9 +716,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp)
newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
sizeof (zfs_object_ace_t)); sizeof (zfs_object_ace_t));
aclp->z_ops = zfs_acl_fuid_ops; aclp->z_ops = zfs_acl_fuid_ops;
VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
newaclnode->z_acldata, aclp->z_acl_count, oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
&newaclnode->z_size) == 0); &newaclnode->z_size, NULL, cr) == 0);
newaclnode->z_ace_count = aclp->z_acl_count; newaclnode->z_ace_count = aclp->z_acl_count;
aclp->z_version = ZFS_ACL_VERSION; aclp->z_version = ZFS_ACL_VERSION;
kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
@ -770,8 +772,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
* Also, create FUIDs for any User/Group ACEs * Also, create FUIDs for any User/Group ACEs
*/ */
static uint64_t static uint64_t
zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
{ {
int entry_type; int entry_type;
mode_t mode; mode_t mode;
@ -905,15 +906,6 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
} }
} }
} }
/*
* Now handle FUID create for user/group ACEs
*/
if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) {
aclp->z_ops.ace_who_set(acep,
zfs_fuid_create(zp->z_zfsvfs, who, cr,
(entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP,
tx, fuidp));
}
} }
return (mode); return (mode);
} }
@ -989,7 +981,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
aclnode = zfs_acl_node_alloc(aclsize); aclnode = zfs_acl_node_alloc(aclsize);
list_insert_head(&aclp->z_acl, aclnode); list_insert_head(&aclp->z_acl, aclnode);
error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
aclsize, aclnode->z_acldata); aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
aclnode->z_ace_count = acl_count; aclnode->z_ace_count = acl_count;
aclp->z_acl_count = acl_count; aclp->z_acl_count = acl_count;
aclp->z_acl_bytes = aclsize; aclp->z_acl_bytes = aclsize;
@ -1014,8 +1006,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
* already checked the acl and knows whether to inherit. * already checked the acl and knows whether to inherit.
*/ */
int int
zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
{ {
int error; int error;
znode_phys_t *zphys = zp->z_phys; znode_phys_t *zphys = zp->z_phys;
@ -1026,12 +1017,9 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
dmu_object_type_t otype; dmu_object_type_t otype;
zfs_acl_node_t *aclnode; zfs_acl_node_t *aclnode;
ASSERT(MUTEX_HELD(&zp->z_lock));
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
dmu_buf_will_dirty(zp->z_dbuf, tx); dmu_buf_will_dirty(zp->z_dbuf, tx);
zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); zphys->zp_mode = zfs_mode_compute(zp, aclp);
/* /*
* Decide which object type to use. If we are forced to * Decide which object type to use. If we are forced to
@ -1043,7 +1031,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
} else { } else {
if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
(zfsvfs->z_version >= ZPL_VERSION_FUID)) (zfsvfs->z_version >= ZPL_VERSION_FUID))
zfs_acl_xform(zp, aclp); zfs_acl_xform(zp, aclp, cr);
ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
otype = DMU_OT_ACL; otype = DMU_OT_ACL;
} }
@ -1125,7 +1113,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
return (0); return (0);
} }
@ -1336,7 +1323,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep)
* Prepend deny ACE * Prepend deny ACE
*/ */
static void * static void *
zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
mode_t mode) mode_t mode)
{ {
zfs_acl_node_t *aclnode; zfs_acl_node_t *aclnode;
@ -1349,7 +1336,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep,
fuid = aclp->z_ops.ace_who_get(acep); fuid = aclp->z_ops.ace_who_get(acep);
flags = aclp->z_ops.ace_flags_get(acep); flags = aclp->z_ops.ace_flags_get(acep);
zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
return (newacep); return (newacep);
} }
@ -1473,9 +1460,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
* in PSARC/2002/240 * in PSARC/2002/240
*/ */
static void static void
zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
uint64_t mode, zfs_acl_t *aclp)
{ {
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
void *acep = NULL, *prevacep = NULL; void *acep = NULL, *prevacep = NULL;
uint64_t who; uint64_t who;
int i; int i;
@ -1485,11 +1472,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
uint16_t iflags, type; uint16_t iflags, type;
uint32_t access_mask; uint32_t access_mask;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
ASSERT(MUTEX_HELD(&zp->z_lock));
aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
/* /*
* If discard then just discard all ACL nodes which * If discard then just discard all ACL nodes which
* represent the ACEs. * represent the ACEs.
@ -1554,17 +1536,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp)
if (!reuse_deny) { if (!reuse_deny) {
prevacep = prevacep =
zfs_acl_prepend_deny(zp, zfs_acl_prepend_deny(uid,
aclp, acep, mode); aclp, acep, mode);
} else { } else {
zfs_acl_prepend_fixup( zfs_acl_prepend_fixup(
aclp, prevacep, aclp, prevacep,
acep, mode, acep, mode, uid);
zp->z_phys->zp_uid);
} }
zfs_fixup_group_entries(aclp, acep, zfs_fixup_group_entries(aclp, acep,
prevacep, mode); prevacep, mode);
} }
} }
} }
@ -1623,8 +1603,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_acl_lock);
*aclp = NULL; *aclp = NULL;
error = zfs_acl_node_read(zp, aclp, B_TRUE); error = zfs_acl_node_read(zp, aclp, B_TRUE);
if (error == 0) if (error == 0) {
zfs_acl_chmod(zp, mode, *aclp); (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
}
mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_acl_lock);
mutex_exit(&zp->z_lock); mutex_exit(&zp->z_lock);
return (error); return (error);
@ -1649,9 +1631,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
* Should ACE be inherited? * Should ACE be inherited?
*/ */
static int static int
zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
{ {
int vtype = ZTOV(zp)->v_type;
int iflags = (acep_flags & 0xf); int iflags = (acep_flags & 0xf);
if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
@ -1666,10 +1647,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags)
* inherit inheritable ACEs from parent * inherit inheritable ACEs from parent
*/ */
static zfs_acl_t * static zfs_acl_t *
zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
boolean_t *need_chmod) uint64_t mode, boolean_t *need_chmod)
{ {
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
void *pacep; void *pacep;
void *acep, *acep2; void *acep, *acep2;
zfs_acl_node_t *aclnode, *aclnode2; zfs_acl_node_t *aclnode, *aclnode2;
@ -1680,8 +1660,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
size_t ace_size; size_t ace_size;
void *data1, *data2; void *data1, *data2;
size_t data1sz, data2sz; size_t data1sz, data2sz;
boolean_t vdir = ZTOV(zp)->v_type == VDIR; boolean_t vdir = vtype == VDIR;
boolean_t vreg = ZTOV(zp)->v_type == VREG; boolean_t vreg = vtype == VREG;
boolean_t passthrough, passthrough_x, noallow; boolean_t passthrough, passthrough_x, noallow;
passthrough_x = passthrough_x =
@ -1710,7 +1690,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
ace_size = aclp->z_ops.ace_size(pacep); ace_size = aclp->z_ops.ace_size(pacep);
if (!zfs_ace_can_use(zp, iflags)) if (!zfs_ace_can_use(vtype, iflags))
continue; continue;
/* /*
@ -1806,55 +1786,58 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode,
* Create file system object initial permissions * Create file system object initial permissions
* including inheritable ACEs. * including inheritable ACEs.
*/ */
void int
zfs_perm_init(znode_t *zp, znode_t *parent, int flag, zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
vattr_t *vap, dmu_tx_t *tx, cred_t *cr, vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp)
{ {
uint64_t mode, fuid, fgid;
int error; int error;
zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zfs_acl_t *aclp = NULL;
zfs_acl_t *paclp; zfs_acl_t *paclp;
xvattr_t *xvap = (xvattr_t *)vap;
gid_t gid; gid_t gid;
boolean_t need_chmod = B_TRUE; boolean_t need_chmod = B_TRUE;
if (setaclp) bzero(acl_ids, sizeof (zfs_acl_ids_t));
aclp = setaclp; acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
mode = MAKEIMODE(vap->va_type, vap->va_mode); if (vsecp)
if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
&acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
return (error);
/* /*
* Determine uid and gid. * Determine uid and gid.
*/ */
if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || if ((flag & (IS_ROOT_NODE | IS_REPLAY)) ||
((flag & IS_XATTR) && (vap->va_type == VDIR))) { ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
ZFS_OWNER, tx, fuidp); (uint64_t)vap->va_uid, cr,
fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, ZFS_OWNER, &acl_ids->z_fuidp);
ZFS_GROUP, tx, fuidp); acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
(uint64_t)vap->va_gid, cr,
ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid; gid = vap->va_gid;
} else { } else {
fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
fgid = 0; cr, &acl_ids->z_fuidp);
acl_ids->z_fgid = 0;
if (vap->va_mask & AT_GID) { if (vap->va_mask & AT_GID) {
fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
ZFS_GROUP, tx, fuidp); (uint64_t)vap->va_gid,
cr, ZFS_GROUP, &acl_ids->z_fuidp);
gid = vap->va_gid; gid = vap->va_gid;
if (fgid != parent->z_phys->zp_gid && if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
!groupmember(vap->va_gid, cr) && !groupmember(vap->va_gid, cr) &&
secpolicy_vnode_create_gid(cr) != 0) secpolicy_vnode_create_gid(cr) != 0)
fgid = 0; acl_ids->z_fgid = 0;
} }
if (fgid == 0) { if (acl_ids->z_fgid == 0) {
if (parent->z_phys->zp_mode & S_ISGID) { if (dzp->z_phys->zp_mode & S_ISGID) {
fgid = parent->z_phys->zp_gid; acl_ids->z_fgid = dzp->z_phys->zp_gid;
gid = zfs_fuid_map_id(zfsvfs, fgid, gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
cr, ZFS_GROUP); cr, ZFS_GROUP);
} else { } else {
fgid = zfs_fuid_create_cred(zfsvfs, acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
ZFS_GROUP, tx, cr, fuidp); ZFS_GROUP, cr, &acl_ids->z_fuidp);
gid = crgetgid(cr); gid = crgetgid(cr);
} }
} }
@ -1867,57 +1850,61 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
* file's new group, clear the file's set-GID bit. * file's new group, clear the file's set-GID bit.
*/ */
if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
mode |= S_ISGID; (vap->va_type == VDIR)) {
acl_ids->z_mode |= S_ISGID;
} else { } else {
if ((mode & S_ISGID) && if ((acl_ids->z_mode & S_ISGID) &&
secpolicy_vnode_setids_setgids(cr, gid) != 0) secpolicy_vnode_setids_setgids(cr, gid) != 0)
mode &= ~S_ISGID; acl_ids->z_mode &= ~S_ISGID;
} }
zp->z_phys->zp_uid = fuid; if (acl_ids->z_aclp == NULL) {
zp->z_phys->zp_gid = fgid; mutex_enter(&dzp->z_lock);
zp->z_phys->zp_mode = mode; if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
(dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
if (aclp == NULL) { !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
mutex_enter(&parent->z_lock); mutex_enter(&dzp->z_acl_lock);
if ((ZTOV(parent)->v_type == VDIR && VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
(parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && mutex_exit(&dzp->z_acl_lock);
!(zp->z_phys->zp_flags & ZFS_XATTR)) { acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
mutex_enter(&parent->z_acl_lock); vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE));
mutex_exit(&parent->z_acl_lock);
aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod);
zfs_acl_free(paclp); zfs_acl_free(paclp);
} else { } else {
aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); acl_ids->z_aclp =
zfs_acl_alloc(zfs_acl_version_zp(dzp));
}
mutex_exit(&dzp->z_lock);
if (need_chmod) {
acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
ZFS_ACL_AUTO_INHERIT : 0;
zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
acl_ids->z_mode, acl_ids->z_aclp);
} }
mutex_exit(&parent->z_lock);
mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
if (need_chmod)
zfs_acl_chmod(zp, mode, aclp);
} else {
mutex_enter(&zp->z_lock);
mutex_enter(&zp->z_acl_lock);
} }
/* Force auto_inherit on all new directory objects */ return (0);
if (vap->va_type == VDIR) }
aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); /*
* Free ACL and fuid_infop, but not the acl_ids structure
*/
void
zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
{
if (acl_ids->z_aclp)
zfs_acl_free(acl_ids->z_aclp);
if (acl_ids->z_fuidp)
zfs_fuid_info_free(acl_ids->z_fuidp);
acl_ids->z_aclp = NULL;
acl_ids->z_fuidp = NULL;
}
/* Set optional attributes if any */ boolean_t
if (vap->va_mask & AT_XVATTR) zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
zfs_xvattr_set(zp, xvap); {
return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
mutex_exit(&zp->z_lock); zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
mutex_exit(&zp->z_acl_lock);
ASSERT3U(error, ==, 0);
if (aclp != setaclp)
zfs_acl_free(aclp);
} }
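The new zfs_acl_ids_t routines imply a create / check-quota / use / free lifecycle for callers such as the create and mkdir paths. The following is a hedged skeleton of that ordering, with every kernel type and routine stubbed so it compiles standalone; it shows the intended shape, not the actual caller code:

    #include <stdio.h>

    /* Stub stand-ins for the kernel structures; only the lifecycle matters. */
    typedef struct { void *z_aclp; void *z_fuidp; } zfs_acl_ids_t;

    static int
    zfs_acl_ids_create(zfs_acl_ids_t *ids)  /* compute mode, fuid, fgid, ACL */
    {
        ids->z_aclp = NULL;
        ids->z_fuidp = NULL;
        return (0);
    }

    static int
    zfs_acl_ids_overquota(zfs_acl_ids_t *ids)  /* owner or group over quota? */
    {
        (void) ids;
        return (0);
    }

    static void
    zfs_acl_ids_free(zfs_acl_ids_t *ids)  /* frees ACL and fuid info only */
    {
        ids->z_aclp = NULL;
        ids->z_fuidp = NULL;
    }

    int
    main(void)
    {
        zfs_acl_ids_t acl_ids;

        if (zfs_acl_ids_create(&acl_ids) != 0)
            return (1);
        if (zfs_acl_ids_overquota(&acl_ids)) {  /* reject before the tx */
            zfs_acl_ids_free(&acl_ids);
            return (1);
        }
        /* ... assign tx, create the znode using acl_ids, commit ... */
        zfs_acl_ids_free(&acl_ids);
        return (0);
    }

Hoisting ACL and FUID computation out of the transaction is what lets the quota check reject a create before any dmu_tx work is started.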
/* /*
@ -2018,7 +2005,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
int int
zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
vsecattr_t *vsecp, zfs_acl_t **zaclp) vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
{ {
zfs_acl_t *aclp; zfs_acl_t *aclp;
zfs_acl_node_t *aclnode; zfs_acl_node_t *aclnode;
@ -2041,9 +2028,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
return (error); return (error);
} }
} else { } else {
if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
&aclnode->z_size)) != 0) { &aclnode->z_size, fuidp, cr)) != 0) {
zfs_acl_free(aclp); zfs_acl_free(aclp);
zfs_acl_node_free(aclnode); zfs_acl_node_free(aclnode);
return (error); return (error);
@ -2084,6 +2071,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
int error; int error;
zfs_acl_t *aclp; zfs_acl_t *aclp;
zfs_fuid_info_t *fuidp = NULL; zfs_fuid_info_t *fuidp = NULL;
boolean_t fuid_dirtied;
if (mask == 0) if (mask == 0)
return (ENOSYS); return (ENOSYS);
@ -2094,7 +2082,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
return (error); return (error);
error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
&aclp);
if (error) if (error)
return (error); return (error);
@ -2135,18 +2124,9 @@ top:
} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
} }
if (aclp->z_has_fuids) { fuid_dirtied = zfsvfs->z_fuid_dirty;
if (zfsvfs->z_fuid_obj == 0) { if (fuid_dirtied)
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); zfs_fuid_txhold(zfsvfs, tx);
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
FUID_SIZE_ESTIMATE(zfsvfs));
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
} else {
dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
FUID_SIZE_ESTIMATE(zfsvfs));
}
}
error = dmu_tx_assign(tx, TXG_NOWAIT); error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) { if (error) {
@ -2163,9 +2143,13 @@ top:
return (error); return (error);
} }
error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); error = zfs_aclset_common(zp, aclp, cr, tx);
ASSERT(error == 0); ASSERT(error == 0);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
zfs_log_acl(zilog, tx, zp, vsecp, fuidp); zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
if (fuidp) if (fuidp)
@ -2180,45 +2164,17 @@ done:
} }
/*
- * working_mode returns the permissions that were not granted
 * Check accesses of interest (AoI) against attributes of the dataset
 * such as read-only. Returns zero if no AoI conflict with dataset
 * attributes, otherwise an appropriate errno is returned.
 */
static int
-zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
-    boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
{
-	zfs_acl_t *aclp;
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	int error;
-	uid_t uid = crgetuid(cr);
-	uint64_t who;
-	uint16_t type, iflags;
-	uint16_t entry_type;
-	uint32_t access_mask;
-	uint32_t deny_mask = 0;
-	zfs_ace_hdr_t *acep = NULL;
-	boolean_t checkit;
-	uid_t fowner;
-	uid_t gowner;
-
-	/*
-	 * Short circuit empty requests
-	 */
-	if (v4_mode == 0)
-		return (0);
-
-	*check_privs = B_TRUE;
-
-	if (zfsvfs->z_replay) {
-		*working_mode = 0;
-		return (0);
-	}
-
-	*working_mode = v4_mode;
-
	if ((v4_mode & WRITE_MASK) &&
	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
-	    (!IS_DEVVP(ZTOV(zp)))) {
-		*check_privs = B_FALSE;
	    (!IS_DEVVP(ZTOV(zp)) ||
	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
		return (EROFS);
	}
@@ -2230,31 +2186,64 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
	    (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
	    (ZTOV(zp)->v_type == VDIR &&
	    (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
-		*check_privs = B_FALSE;
		return (EPERM);
	}

	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
	    (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
-		*check_privs = B_FALSE;
		return (EPERM);
	}

	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
	    (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
-		*check_privs = B_FALSE;
		return (EACCES);
	}

-	/*
-	 * The caller requested that the ACL check be skipped. This
-	 * would only happen if the caller checked VOP_ACCESS() with a
-	 * 32 bit ACE mask and already had the appropriate permissions.
-	 */
-	if (skipaclchk) {
-		*working_mode = 0;
-		return (0);
-	}
	return (0);
}

/*
 * The primary usage of this function is to loop through all of the
 * ACEs in the znode, determining what accesses of interest (AoI) to
 * the caller are allowed or denied. The AoI are expressed as bits in
 * the working_mode parameter. As each ACE is processed, bits covered
 * by that ACE are removed from the working_mode. This removal
 * facilitates two things. The first is that when the working mode is
 * empty (= 0), we know we've looked at all the AoI. The second is
 * that the ACE interpretation rules don't allow a later ACE to undo
 * something granted or denied by an earlier ACE. Removing the
 * discovered access or denial enforces this rule. At the end of
 * processing the ACEs, all AoI that were found to be denied are
 * placed into the working_mode, giving the caller a mask of denied
 * accesses. Returns:
 *	0		if all AoI granted
 *	EACCESS		if the denied mask is non-zero
 *	other error	if abnormal failure (e.g., IO error)
 *
 * A secondary usage of the function is to determine if any of the
 * AoI are granted. If an ACE grants any access in
 * the working_mode, we immediately short circuit out of the function.
 * This mode is chosen by setting anyaccess to B_TRUE. The
 * working_mode is not a denied access mask upon exit if the function
 * is used in this manner.
 */
static int
zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
    boolean_t anyaccess, cred_t *cr)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zfs_acl_t *aclp;
	int error;
	uid_t uid = crgetuid(cr);
	uint64_t who;
	uint16_t type, iflags;
	uint16_t entry_type;
	uint32_t access_mask;
	uint32_t deny_mask = 0;
	zfs_ace_hdr_t *acep = NULL;
	boolean_t checkit;
	uid_t fowner;
	uid_t gowner;

	zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
@@ -2268,6 +2257,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
	while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
	    &iflags, &type)) {
		uint32_t mask_matched;

		if (!zfs_acl_valid_ace_type(type, iflags))
			continue;
@@ -2275,6 +2265,11 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
		if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
			continue;

		/* Skip ACE if it does not affect any AoI */
		mask_matched = (access_mask & *working_mode);
		if (!mask_matched)
			continue;

		entry_type = (iflags & ACE_TYPE_FLAGS);
		checkit = B_FALSE;
@@ -2313,14 +2308,24 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
		}

		if (checkit) {
-			uint32_t mask_matched = (access_mask & *working_mode);
-
-			if (mask_matched) {
-				if (type == DENY)
-					deny_mask |= mask_matched;
-
-				*working_mode &= ~mask_matched;
-			}
			if (type == DENY) {
				DTRACE_PROBE3(zfs__ace__denies,
				    znode_t *, zp,
				    zfs_ace_hdr_t *, acep,
				    uint32_t, mask_matched);
				deny_mask |= mask_matched;
			} else {
				DTRACE_PROBE3(zfs__ace__allows,
				    znode_t *, zp,
				    zfs_ace_hdr_t *, acep,
				    uint32_t, mask_matched);
				if (anyaccess) {
					mutex_exit(&zp->z_acl_lock);
					zfs_acl_free(aclp);
					return (0);
				}
			}
			*working_mode &= ~mask_matched;
		}

		/* Are we done? */
@@ -2342,6 +2347,69 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
	return (0);
}
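The working_mode bookkeeping described in the comment above is easy to model outside the kernel. The following is a minimal userland sketch, not the kernel code: the ace struct, the mode bits, and the two-entry list are invented for the demo; only the clear-as-you-match and deny-accumulation logic mirrors zfs_zaccess_aces_check().

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct ace { uint32_t mask; int deny; };	/* hypothetical ACE */

static int
aces_check(const struct ace *aces, int naces, uint32_t *working_mode)
{
	uint32_t deny_mask = 0;

	for (int i = 0; i < naces && *working_mode != 0; i++) {
		/* Skip ACEs that cover none of the accesses of interest. */
		uint32_t mask_matched = aces[i].mask & *working_mode;

		if (mask_matched == 0)
			continue;
		if (aces[i].deny)
			deny_mask |= mask_matched;
		/* A later ACE can't undo what this one granted or denied. */
		*working_mode &= ~mask_matched;
	}
	*working_mode |= deny_mask;	/* hand back the denied bits */
	return (*working_mode == 0 ? 0 : EACCES);
}

int
main(void)
{
	enum { READ = 1, WRITE = 2, EXEC = 4 };	/* made-up mode bits */
	struct ace aces[] = {
		{ WRITE, 1 },			/* deny write */
		{ READ | WRITE | EXEC, 0 },	/* allow everything else */
	};
	uint32_t wm = READ | WRITE;
	int err = aces_check(aces, 2, &wm);

	/* prints err=EACCES with WRITE left in the denied mask */
	printf("err=%d denied=%#x\n", err, (unsigned)wm);
	return (0);
}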
/*
* Return true if any access whatsoever granted, we don't actually
* care what access is granted.
*/
boolean_t
zfs_has_access(znode_t *zp, cred_t *cr)
{
uint32_t have = ACE_ALL_PERMS;
if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
uid_t owner;
owner = zfs_fuid_map_id(zp->z_zfsvfs,
zp->z_phys->zp_uid, cr, ZFS_OWNER);
return (
secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
secpolicy_vnode_chown(cr, B_TRUE) == 0 ||
secpolicy_vnode_chown(cr, B_FALSE) == 0 ||
secpolicy_vnode_setdac(cr, owner) == 0 ||
secpolicy_vnode_remove(cr) == 0);
}
return (B_TRUE);
}
static int
zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
int err;
*working_mode = v4_mode;
*check_privs = B_TRUE;
/*
* Short circuit empty requests
*/
if (v4_mode == 0 || zfsvfs->z_replay) {
*working_mode = 0;
return (0);
}
if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
*check_privs = B_FALSE;
return (err);
}
/*
* The caller requested that the ACL check be skipped. This
* would only happen if the caller checked VOP_ACCESS() with a
* 32 bit ACE mask and already had the appropriate permissions.
*/
if (skipaclchk) {
*working_mode = 0;
return (0);
}
return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
}
static int
zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
    cred_t *cr)

===== zfs_ctldir.c =====

@@ -114,12 +114,16 @@ snapentry_compare(const void *a, const void *b)
vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;
vnodeops_t *zfsctl_ops_shares;
vnodeops_t *zfsctl_ops_shares_dir;

static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];
static const fs_operation_def_t zfsctl_tops_shares[];

static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_mknode_shares(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);

@@ -127,14 +131,18 @@ static gfs_opsvec_t zfsctl_opsvec[] = {
	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
	{ ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
	{ ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
	{ NULL }
};

/*
- * Root directory elements. We have only a single static entry, 'snapshot'.
 * Root directory elements. We only have two entries
 * snapshot and shares.
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
	{ NULL }
};

@@ -166,21 +174,34 @@ zfsctl_fini(void)
		vn_freevnodeops(zfsctl_ops_snapdir);
	if (zfsctl_ops_snapshot)
		vn_freevnodeops(zfsctl_ops_snapshot);
	if (zfsctl_ops_shares)
		vn_freevnodeops(zfsctl_ops_shares);
	if (zfsctl_ops_shares_dir)
		vn_freevnodeops(zfsctl_ops_shares_dir);

	zfsctl_ops_root = NULL;
	zfsctl_ops_snapdir = NULL;
	zfsctl_ops_snapshot = NULL;
	zfsctl_ops_shares = NULL;
	zfsctl_ops_shares_dir = NULL;
}

/*
- * Return the inode number associated with the 'snapshot' directory.
 * Return the inode number associated with the 'snapshot' or
 * 'shares' directory.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
-	ASSERT(index == 0);
-	return (ZFSCTL_INO_SNAPDIR);
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	ASSERT(index <= 2);

	if (index == 0)
		return (ZFSCTL_INO_SNAPDIR);

	return (zfsvfs->z_shares_dir);
}

/*
@@ -348,6 +369,30 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
	return (0);
}
/*ARGSUSED*/
static int
zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
znode_t *dzp;
int error;
ZFS_ENTER(zfsvfs);
if (zfsvfs->z_shares_dir == 0) {
ZFS_EXIT(zfsvfs);
return (ENOTSUP);
}
if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
error = VOP_FID(ZTOV(dzp), fidp, ct);
VN_RELE(ZTOV(dzp));
}
ZFS_EXIT(zfsvfs);
return (error);
}
/*
 * .zfs inode namespace
 *
@@ -478,7 +523,7 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
		VN_RELE(svp);
		return (error);
	}
	VFS_RELE(svp->v_vfsp);

	/*
	 * We can't use VN_RELE(), as that will try to invoke
	 * zfsctl_snapdir_inactive(), which would cause us to destroy
@@ -691,7 +736,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
		return (err);

	if (err == 0) {
-		err = dmu_objset_snapshot(name, dirname, B_FALSE);
		err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
		if (err)
			return (err);
		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
@@ -732,9 +777,6 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,

	ASSERT(dvp->v_type == VDIR);

-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
-		return (0);

	/*
	 * If we get a recursive call, that means we got called
	 * from the domount() code while it was trying to look up the
@@ -746,6 +788,11 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,

	ZFS_ENTER(zfsvfs);

	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	if (flags & FIGNORECASE) {
		boolean_t conflict = B_FALSE;

@@ -844,7 +891,7 @@ domount:
	 * Return the mounted root rather than the covered mount point.
	 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
	 * the ZFS vnode mounted on top of the GFS node. This ZFS
-	 * vnode is the root the newly created vfsp.
	 * vnode is the root of the newly created vfsp.
	 */
	VFS_RELE(vfsp);
	err = traverse(vpp);
@@ -877,6 +924,37 @@ domount:
	return (err);
}
/* ARGSUSED */
static int
zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
int *direntflags, pathname_t *realpnp)
{
zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
znode_t *dzp;
int error;
ZFS_ENTER(zfsvfs);
if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
ZFS_EXIT(zfsvfs);
return (0);
}
if (zfsvfs->z_shares_dir == 0) {
ZFS_EXIT(zfsvfs);
return (ENOTSUP);
}
if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp,
flags, rdir, cr, ct, direntflags, realpnp);
VN_RELE(ZTOV(dzp));
ZFS_EXIT(zfsvfs);
return (error);
}
/* ARGSUSED */
static int
zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
@@ -921,6 +999,33 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
	return (0);
}
/* ARGSUSED */
static int
zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
caller_context_t *ct, int flags)
{
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
znode_t *dzp;
int error;
ZFS_ENTER(zfsvfs);
if (zfsvfs->z_shares_dir == 0) {
ZFS_EXIT(zfsvfs);
return (ENOTSUP);
}
if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags);
VN_RELE(ZTOV(dzp));
} else {
*eofp = 1;
error = ENOENT;
}
ZFS_EXIT(zfsvfs);
return (error);
}
/*
 * pvp is the '.zfs' directory (zfsctl_node_t).
 * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
@@ -946,6 +1051,45 @@ zfsctl_mknode_snapdir(vnode_t *pvp)
	return (vp);
}
vnode_t *
zfsctl_mknode_shares(vnode_t *pvp)
{
vnode_t *vp;
zfsctl_node_t *sdp;
vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
NULL, NULL);
sdp = vp->v_data;
sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
return (vp);
}
/* ARGSUSED */
static int
zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
caller_context_t *ct)
{
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
znode_t *dzp;
int error;
ZFS_ENTER(zfsvfs);
if (zfsvfs->z_shares_dir == 0) {
ZFS_EXIT(zfsvfs);
return (ENOTSUP);
}
if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct);
VN_RELE(ZTOV(dzp));
}
ZFS_EXIT(zfsvfs);
return (error);
}
/* ARGSUSED */
static int
zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
@@ -996,6 +1140,20 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ NULL }
};
static const fs_operation_def_t zfsctl_tops_shares[] = {
{ VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
{ VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
{ VOPNAME_IOCTL, { .error = fs_inval } },
{ VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } },
{ VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
{ VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } },
{ VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } },
{ VOPNAME_SEEK, { .vop_seek = fs_seek } },
{ VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
{ VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } },
{ NULL }
};
/*
 * pvp is the GFS vnode '.zfs/snapshot'.
 *
@@ -1013,7 +1171,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = objset;
-	VFS_HOLD(vp->v_vfsp);

	return (vp);
}

@@ -1052,7 +1209,6 @@ zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);
-	VFS_RELE(vp->v_vfsp);

	/*
	 * Dispose of the vnode for the snapshot mount point.

===== zfs_dir.c =====

@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

@@ -805,44 +805,49 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
	znode_t *xzp;
	dmu_tx_t *tx;
	int error;
-	zfs_fuid_info_t *fuidp = NULL;
	zfs_acl_ids_t acl_ids;
	boolean_t fuid_dirtied;

	*xvpp = NULL;

	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
		return (error);

	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
	    &acl_ids)) != 0)
		return (error);
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		return (EDQUOT);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
-	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
-		if (zfsvfs->z_fuid_obj == 0) {
-			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-		} else {
-			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-			    FUID_SIZE_ESTIMATE(zfsvfs));
-		}
-	}
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		if (error == ERESTART)
			dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		return (error);
	}

-	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp);
	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
	dmu_buf_will_dirty(zp->z_dbuf, tx);
	zp->z_phys->zp_xattr = xzp->z_id;

	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
-	    xzp, "", NULL, fuidp, vap);
	    xzp, "", NULL, acl_ids.z_fuidp, vap);
-	if (fuidp)
-		zfs_fuid_info_free(fuidp);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	*xvpp = ZTOV(xzp);

===== zfs_fm.c =====

@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

@@ -96,7 +96,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
	nvlist_t *ereport, *detector;
	uint64_t ena;
	char class[64];
-	int state;

	/*
	 * If we are doing a spa_tryimport(), ignore errors.
@@ -130,15 +129,39 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
			return;

		/*
-		 * If the vdev has already been marked as failing due to a
-		 * failed probe, then ignore any subsequent I/O errors, as the
-		 * DE will automatically fault the vdev on the first such
-		 * failure.
		 * If this I/O is not a retry I/O, don't post an ereport.
		 * Otherwise, we risk making bad diagnoses based on B_FAILFAST
		 * I/Os.
		 */
-		if (vd != NULL &&
-		    (!vdev_readable(vd) || !vdev_writeable(vd)) &&
-		    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
		if (zio->io_error == EIO &&
		    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
			return;
if (vd != NULL) {
/*
* If the vdev has already been marked as failing due
* to a failed probe, then ignore any subsequent I/O
* errors, as the DE will automatically fault the vdev
* on the first such failure. This also catches cases
* where vdev_remove_wanted is set and the device has
* not yet been asynchronously placed into the REMOVED
* state.
*/
if (zio->io_vd == vd &&
!vdev_accessible(vd, zio) &&
strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0)
return;
/*
* Ignore checksum errors for reads from DTL regions of
* leaf vdevs.
*/
if (zio->io_type == ZIO_TYPE_READ &&
zio->io_error == ECKSUM &&
vd->vdev_ops->vdev_op_leaf &&
vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
return;
}
		}
	}

	if ((ereport = fm_nvlist_create(NULL)) == NULL)
@@ -188,14 +211,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
	 * passed in.
	 */

-	/*
-	 * If we are importing a faulted pool, then we treat it like an open,
-	 * not an import. Otherwise, the DE will ignore all faults during
-	 * import, since the default behavior is to mark the devices as
-	 * persistently unavailable, not leave them in the faulted state.
-	 */
-	state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state;

	/*
	 * Generic payload members common to all ereports.
	 */
@@ -203,7 +218,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
	    DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
	    DATA_TYPE_UINT64, spa_guid(spa),
	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
-	    state, NULL);
	    spa->spa_load_state, NULL);

	if (spa != NULL) {
		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
@@ -222,14 +237,18 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
		    DATA_TYPE_UINT64, vd->vdev_guid,
		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
-		if (vd->vdev_path)
		if (vd->vdev_path != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
			    DATA_TYPE_STRING, vd->vdev_path, NULL);
-		if (vd->vdev_devid)
		if (vd->vdev_devid != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
		if (vd->vdev_fru != NULL)
			fm_payload_set(ereport,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
			    DATA_TYPE_STRING, vd->vdev_fru, NULL);

		if (pvd != NULL) {
			fm_payload_set(ereport,

===== zfs_fuid.c =====

@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

@@ -47,8 +47,10 @@
 * During file system initialization the nvlist(s) are read and
 * two AVL trees are created. One tree is keyed by the index number
 * and the other by the domain string. Nodes are never removed from
- * trees, but new entries may be added. If a new entry is added then the
- * on-disk packed nvlist will also be updated.
 * trees, but new entries may be added. If a new entry is added then
 * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
 * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
 *
 */
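The protocol this comment describes shows up at every call site touched by this commit. As a consolidated sketch, pieced together from zfs_setacl(), zfs_make_xattrdir(), and zfs_set_userquota() elsewhere in this diff (schematic only, not a compilable unit on its own):

	/*
	 * Schematic caller of the dirty/sync protocol; tx, error, and
	 * zfsvfs are assumed to be in scope as at the real call sites.
	 */
	boolean_t fuid_dirtied = zfsvfs->z_fuid_dirty;

	tx = dmu_tx_create(zfsvfs->z_os);
	/* ... dmu_tx_hold_*() for the caller's own writes ... */
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);	/* reserve room for the table */

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		if (error == ERESTART)
			dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		return (error);
	}

	/* ... the caller's own work under the same tx ... */
	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);	/* write the nvlist, clear dirty */
	dmu_tx_commit(tx);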
#define FUID_IDX "fuid_idx"
@@ -97,6 +99,15 @@ domain_compare(const void *arg1, const void *arg2)
	return (val > 0 ? 1 : -1);
}
void
zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
{
avl_create(idx_tree, idx_compare,
sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
avl_create(domain_tree, domain_compare,
sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
}
/*
 * load initial fuid domain and idx trees. This function is used by
 * both the kernel and zdb.
@@ -108,12 +119,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
	dmu_buf_t *db;
	uint64_t fuid_size;

-	avl_create(idx_tree, idx_compare,
-	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
-	avl_create(domain_tree, domain_compare,
-	    sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
-	VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db));
	ASSERT(fuid_obj != 0);
	VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
	    FTAG, &db));
	fuid_size = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

@@ -125,7 +133,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
		int i;

		packed = kmem_alloc(fuid_size, KM_SLEEP);
-		VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0);
		VERIFY(dmu_read(os, fuid_obj, 0,
		    fuid_size, packed, DMU_READ_PREFETCH) == 0);
		VERIFY(nvlist_unpack(packed, fuid_size,
		    &nvp, 0) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
@@ -189,10 +198,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
 * Load the fuid table(s) into memory.
 */
static void
-zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
zfs_fuid_init(zfsvfs_t *zfsvfs)
{
-	int error = 0;
	rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);

	if (zfsvfs->z_fuid_loaded) {
@@ -200,41 +207,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
		return;
	}

-	if (zfsvfs->z_fuid_obj == 0) {
-		/* first make sure we need to allocate object */
-		error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
-		    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
-		if (error == ENOENT && tx != NULL) {
-			zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
-			    DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
-			    sizeof (uint64_t), tx);
-			VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
-			    ZFS_FUID_TABLES, sizeof (uint64_t), 1,
-			    &zfsvfs->z_fuid_obj, tx) == 0);
-		}
-	}
	zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);

	(void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
	if (zfsvfs->z_fuid_obj != 0) {
		zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
		    zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
		    &zfsvfs->z_fuid_domain);
-		zfsvfs->z_fuid_loaded = B_TRUE;
	}

	zfsvfs->z_fuid_loaded = B_TRUE;
	rw_exit(&zfsvfs->z_fuid_lock);
}
/*
* sync out AVL trees to persistent storage.
*/
void
zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
nvlist_t *nvp;
nvlist_t **fuids;
size_t nvsize = 0;
char *packed;
dmu_buf_t *db;
fuid_domain_t *domnode;
int numnodes;
int i;
if (!zfsvfs->z_fuid_dirty) {
return;
}
rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
/*
* First see if table needs to be created?
*/
if (zfsvfs->z_fuid_obj == 0) {
zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
sizeof (uint64_t), tx);
VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
ZFS_FUID_TABLES, sizeof (uint64_t), 1,
&zfsvfs->z_fuid_obj, tx) == 0);
}
VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
domnode->f_idx) == 0);
VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
domnode->f_ksid->kd_name) == 0);
}
VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
fuids, numnodes) == 0);
for (i = 0; i != numnodes; i++)
nvlist_free(fuids[i]);
kmem_free(fuids, numnodes * sizeof (void *));
VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
packed = kmem_alloc(nvsize, KM_SLEEP);
VERIFY(nvlist_pack(nvp, &packed, &nvsize,
NV_ENCODE_XDR, KM_SLEEP) == 0);
nvlist_free(nvp);
zfsvfs->z_fuid_size = nvsize;
dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
zfsvfs->z_fuid_size, packed, tx);
kmem_free(packed, zfsvfs->z_fuid_size);
VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
FTAG, &db));
dmu_buf_will_dirty(db, tx);
*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
dmu_buf_rele(db, FTAG);
zfsvfs->z_fuid_dirty = B_FALSE;
	rw_exit(&zfsvfs->z_fuid_lock);
}
/*
 * Query domain table for a given domain.
 *
- * If domain isn't found it is added to AVL trees and
- * the results are pushed out to disk.
 * If domain isn't found and addok is set, it is added to AVL trees and
 * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be
 * necessary for the caller or another thread to detect the dirty table
 * and sync out the changes.
 */
int
-zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
-    dmu_tx_t *tx)
zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
    char **retdomain, boolean_t addok)
{
	fuid_domain_t searchnode, *findnode;
	avl_index_t loc;
@@ -246,16 +313,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain,
	 * for the user nobody.
	 */
	if (domain[0] == '\0') {
-		*retdomain = nulldomain;
		if (retdomain)
			*retdomain = nulldomain;
		return (0);
	}

	searchnode.f_ksid = ksid_lookupdomain(domain);
-	if (retdomain) {
	if (retdomain)
		*retdomain = searchnode.f_ksid->kd_name;
-	}
	if (!zfsvfs->z_fuid_loaded)
-		zfs_fuid_init(zfsvfs, tx);
		zfs_fuid_init(zfsvfs);

retry:
	rw_enter(&zfsvfs->z_fuid_lock, rw);
@@ -265,15 +332,9 @@ retry:
		rw_exit(&zfsvfs->z_fuid_lock);
		ksiddomain_rele(searchnode.f_ksid);
		return (findnode->f_idx);
-	} else {
	} else if (addok) {
		fuid_domain_t *domnode;
-		nvlist_t *nvp;
-		nvlist_t **fuids;
		uint64_t retidx;
-		size_t nvsize = 0;
-		char *packed;
-		dmu_buf_t *db;
-		int i = 0;

		if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
			rw_exit(&zfsvfs->z_fuid_lock);
@@ -288,46 +349,11 @@ retry:
		avl_add(&zfsvfs->z_fuid_domain, domnode);
		avl_add(&zfsvfs->z_fuid_idx, domnode);
		zfsvfs->z_fuid_dirty = B_TRUE;
-
-		/*
-		 * Now resync the on-disk nvlist.
-		 */
-		VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-		domnode = avl_first(&zfsvfs->z_fuid_domain);
-		fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP);
-		while (domnode) {
-			VERIFY(nvlist_alloc(&fuids[i],
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-			VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
-			    domnode->f_idx) == 0);
-			VERIFY(nvlist_add_uint64(fuids[i],
-			    FUID_OFFSET, 0) == 0);
-			VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN,
-			    domnode->f_ksid->kd_name) == 0);
-			domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode);
-		}
-		VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
-		    fuids, retidx) == 0);
-		for (i = 0; i != retidx; i++)
-			nvlist_free(fuids[i]);
-		kmem_free(fuids, retidx * sizeof (void *));
-		VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
-		packed = kmem_alloc(nvsize, KM_SLEEP);
-		VERIFY(nvlist_pack(nvp, &packed, &nvsize,
-		    NV_ENCODE_XDR, KM_SLEEP) == 0);
-		nvlist_free(nvp);
-		zfsvfs->z_fuid_size = nvsize;
-		dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
-		    zfsvfs->z_fuid_size, packed, tx);
-		kmem_free(packed, zfsvfs->z_fuid_size);
-		VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
-		    FTAG, &db));
-		dmu_buf_will_dirty(db, tx);
-		*(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
-		dmu_buf_rele(db, FTAG);

		rw_exit(&zfsvfs->z_fuid_lock);
		return (retidx);
	} else {
		return (-1);
	}
}
@@ -337,7 +363,7 @@ retry:
 * Returns a pointer from an avl node of the domain string.
 *
 */
-static char *
const char *
zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
{
	char *domain;
@@ -346,7 +372,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
		return (NULL);

	if (!zfsvfs->z_fuid_loaded)
-		zfs_fuid_init(zfsvfs, NULL);
		zfs_fuid_init(zfsvfs);

	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
@@ -374,7 +400,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
    cred_t *cr, zfs_fuid_type_t type)
{
	uint32_t index = FUID_INDEX(fuid);
-	char *domain;
	const char *domain;
	uid_t id;

	if (index == 0)
@@ -439,6 +465,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
	}

	if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {

		/*
		 * Now allocate fuid entry and add it on the end of the list
		 */
@@ -463,7 +490,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
 */
uint64_t
zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
-    dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp)
    cred_t *cr, zfs_fuid_info_t **fuidp)
{
	uint64_t idx;
	ksid_t *ksid;
@@ -490,7 +517,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
	rid = ksid_getrid(ksid);
	domain = ksid_getdomain(ksid);

-	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);

	zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);

@@ -511,7 +538,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
 */
uint64_t
zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
-    zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp)
    zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
{
	const char *domain;
	char *kdomain;
@@ -581,10 +608,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
		}
	}

-	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx);
	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);

	if (!zfsvfs->z_replay)
-		zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type);
		zfs_fuid_node_add(fuidpp, kdomain,
		    rid, idx, id, type);
	else if (zfuid != NULL) {
		list_remove(&fuidp->z_fuids, zfuid);
		kmem_free(zfuid, sizeof (zfs_fuid_t));
@@ -658,16 +686,15 @@ boolean_t
zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
{
	ksid_t *ksid = crgetsid(cr, KSID_GROUP);
	ksidlist_t *ksidlist = crgetsidlist(cr);
	uid_t gid;

-	if (ksid) {
	if (ksid && ksidlist) {
		int i;
		ksid_t *ksid_groups;
-		ksidlist_t *ksidlist = crgetsidlist(cr);
		uint32_t idx = FUID_INDEX(id);
		uint32_t rid = FUID_RID(id);

-		ASSERT(ksidlist);
		ksid_groups = ksidlist->ksl_sids;

		for (i = 0; i != ksidlist->ksl_nsid; i++) {
@@ -677,7 +704,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
				return (B_TRUE);
			}
		} else {
-			char *domain;
			const char *domain;

			domain = zfs_fuid_find_by_idx(zfsvfs, idx);
			ASSERT(domain != NULL);
@@ -700,4 +727,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
	gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
	return (groupmember(gid, cr));
}
void
zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
if (zfsvfs->z_fuid_obj == 0) {
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
FUID_SIZE_ESTIMATE(zfsvfs));
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
} else {
dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
FUID_SIZE_ESTIMATE(zfsvfs));
}
}
#endif

(diff for one file suppressed because it is too large)

===== zfs_log.c =====

@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

@@ -467,9 +467,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 */
ssize_t zfs_immediate_write_sz = 32768;

-#define	ZIL_MAX_LOG_DATA	(SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \
-    sizeof (lr_write_t))

void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
@@ -483,29 +480,6 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,

	ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */

-	/*
-	 * Writes are handled in three different ways:
-	 *
-	 * WR_INDIRECT:
-	 *    In this mode, if we need to commit the write later, then the block
-	 *    is immediately written into the file system (using dmu_sync),
-	 *    and a pointer to the block is put into the log record.
-	 *    When the txg commits the block is linked in.
-	 *    This saves additionally writing the data into the log record.
-	 *    There are a few requirements for this to occur:
-	 *	- write is greater than zfs_immediate_write_sz
-	 *	- not using slogs (as slogs are assumed to always be faster
-	 *	  than writing into the main pool)
-	 *	- the write occupies only one block
-	 * WR_COPIED:
-	 *    If we know we'll immediately be committing the
-	 *    transaction (FSYNC or FDSYNC), the we allocate a larger
-	 *    log record here for the data and copy the data in.
-	 * WR_NEED_COPY:
-	 *    Otherwise we don't allocate a buffer, and *if* we need to
-	 *    flush the write later then a buffer is allocated and
-	 *    we retrieve the data using the dmu.
-	 */
	slogging = spa_has_slogs(zilog->zl_spa);
	if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
		write_state = WR_INDIRECT;
@@ -535,7 +509,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
-		    zp->z_id, off, len, lr + 1) != 0) {
		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			kmem_free(itx, offsetof(itx_t, itx_lr) +
			    itx->itx_lr.lrc_reclen);
			itx = zil_itx_create(txtype, sizeof (*lr));

===== zfs_vfsops.c =====

@@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

@@ -67,6 +67,8 @@ static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t zfs_dev_mtx;

extern int sys_shutdown;

static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
@@ -145,12 +147,24 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		dsl_pool_t *dp;

		ZFS_ENTER(zfsvfs);
		dp = dmu_objset_pool(zfsvfs->z_os);

		/*
		 * If the system is shutting down, then skip any
		 * filesystems which may exist on a suspended pool.
		 */
		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
			ZFS_EXIT(zfsvfs);
			return (0);
		}

		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
		else
-			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			txg_wait_synced(dp, 0);
		ZFS_EXIT(zfsvfs);
	} else {
		/*
@@ -554,6 +568,393 @@ unregister:
}
static void
uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
int64_t delta, dmu_tx_t *tx)
{
uint64_t used = 0;
char buf[32];
int err;
uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
if (delta == 0)
return;
(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
err = zap_lookup(os, obj, buf, 8, 1, &used);
ASSERT(err == 0 || err == ENOENT);
/* no underflow/overflow */
ASSERT(delta > 0 || used >= -delta);
ASSERT(delta < 0 || used + delta > used);
used += delta;
if (used == 0)
err = zap_remove(os, obj, buf, tx);
else
err = zap_update(os, obj, buf, 8, 1, &used, tx);
ASSERT(err == 0);
}
static void
zfs_space_delta_cb(objset_t *os, dmu_object_type_t bonustype,
void *oldbonus, void *newbonus,
uint64_t oldused, uint64_t newused, dmu_tx_t *tx)
{
znode_phys_t *oldznp = oldbonus;
znode_phys_t *newznp = newbonus;
if (bonustype != DMU_OT_ZNODE)
return;
/* We charge 512 for the dnode (if it's allocated). */
if (oldznp->zp_gen != 0)
oldused += DNODE_SIZE;
if (newznp->zp_gen != 0)
newused += DNODE_SIZE;
if (oldznp->zp_uid == newznp->zp_uid) {
uidacct(os, B_FALSE, oldznp->zp_uid, newused-oldused, tx);
} else {
uidacct(os, B_FALSE, oldznp->zp_uid, -oldused, tx);
uidacct(os, B_FALSE, newznp->zp_uid, newused, tx);
}
if (oldznp->zp_gid == newznp->zp_gid) {
uidacct(os, B_TRUE, oldznp->zp_gid, newused-oldused, tx);
} else {
uidacct(os, B_TRUE, oldznp->zp_gid, -oldused, tx);
uidacct(os, B_TRUE, newznp->zp_gid, newused, tx);
}
}
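The callback's delta handling is easiest to see in a toy model: when the owner is unchanged, a single signed delta is applied, and on an ownership change the old owner is debited and the new owner credited. A minimal userland sketch follows (the array stands in for the ZAP; the fuids and sizes are made up):

#include <stdint.h>
#include <stdio.h>

/* Userland stand-in for the {fuid -> bytes used} ZAP entries. */
static int64_t used[8];

static void
acct(unsigned fuid, int64_t delta)
{
	if (delta == 0)
		return;
	used[fuid] += delta;	/* zap_update()/zap_remove() stand-in */
}

int
main(void)
{
	/* A 2048-byte file appears, owned by fuid 1. */
	acct(1, 2048);
	/* It grows by 512 bytes; same owner, so one delta. */
	acct(1, 512);
	/* chown 1 -> 2 at the same size: -old, +new, as in the callback. */
	acct(1, -2560);
	acct(2, 2560);
	printf("fuid1=%lld fuid2=%lld\n",
	    (long long)used[1], (long long)used[2]);	/* 0 and 2560 */
	return (0);
}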
static void
fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
char *domainbuf, int buflen, uid_t *ridp)
{
extern uint64_t strtonum(const char *str, char **nptr);
uint64_t fuid;
const char *domain;
fuid = strtonum(fuidstr, NULL);
domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
if (domain)
(void) strlcpy(domainbuf, domain, buflen);
else
domainbuf[0] = '\0';
*ridp = FUID_RID(fuid);
}
static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
{
switch (type) {
case ZFS_PROP_USERUSED:
return (DMU_USERUSED_OBJECT);
case ZFS_PROP_GROUPUSED:
return (DMU_GROUPUSED_OBJECT);
case ZFS_PROP_USERQUOTA:
return (zfsvfs->z_userquota_obj);
case ZFS_PROP_GROUPQUOTA:
return (zfsvfs->z_groupquota_obj);
}
return (0);
}
int
zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
{
int error;
zap_cursor_t zc;
zap_attribute_t za;
zfs_useracct_t *buf = vbuf;
uint64_t obj;
if (!dmu_objset_userspace_present(zfsvfs->z_os))
return (ENOTSUP);
obj = zfs_userquota_prop_to_obj(zfsvfs, type);
if (obj == 0) {
*bufsizep = 0;
return (0);
}
for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
(error = zap_cursor_retrieve(&zc, &za)) == 0;
zap_cursor_advance(&zc)) {
if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
*bufsizep)
break;
fuidstr_to_sid(zfsvfs, za.za_name,
buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
buf->zu_space = za.za_first_integer;
buf++;
}
if (error == ENOENT)
error = 0;
ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
*cookiep = zap_cursor_serialize(&zc);
zap_cursor_fini(&zc);
return (error);
}
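zfs_userspace_many() is built to be called repeatedly: the ZAP cursor is serialized into *cookiep on each return, so a caller can walk an arbitrarily large used/quota table in fixed-size chunks. A schematic consumer might look like the following (kernel context assumed; the 16-entry buffer size is arbitrary):

	zfs_useracct_t buf[16];
	uint64_t cookie = 0;
	uint64_t bufsize;
	int error;

	do {
		bufsize = sizeof (buf);
		error = zfs_userspace_many(zfsvfs, ZFS_PROP_USERUSED,
		    &cookie, buf, &bufsize);
		/* ... consume bufsize / sizeof (zfs_useracct_t) entries ... */
	} while (error == 0 && bufsize != 0);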
/*
* buf must be big enough (eg, 32 bytes)
*/
static int
id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
char *buf, boolean_t addok)
{
uint64_t fuid;
int domainid = 0;
if (domain && domain[0]) {
domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
if (domainid == -1)
return (ENOENT);
}
fuid = FUID_ENCODE(domainid, rid);
(void) sprintf(buf, "%llx", (longlong_t)fuid);
return (0);
}
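The fuid strings built above are just the 64-bit FUID printed in hex. A small userland model round-trips one; the macro definitions here are assumptions (the usual split, with the domain index in the high 32 bits and the RID in the low 32, matching how FUID_INDEX()/FUID_RID() are used in zfs_fuid.c above):

#include <stdint.h>
#include <stdio.h>

/* Assumed definitions; mirror how FUID_INDEX/FUID_RID are used above. */
#define	FUID_INDEX(x)		((uint32_t)((x) >> 32))
#define	FUID_RID(x)		((uint32_t)((x) & 0xffffffff))
#define	FUID_ENCODE(idx, rid)	(((uint64_t)(idx) << 32) | (rid))

int
main(void)
{
	uint64_t fuid = FUID_ENCODE(3, 1234);	/* domain idx 3, RID 1234 */
	char buf[32];

	/* Same "%llx" key format as id_to_fuidstr() builds for the ZAP. */
	(void) snprintf(buf, sizeof (buf), "%llx", (long long)fuid);
	printf("key=%s idx=%u rid=%u\n",
	    buf, FUID_INDEX(fuid), FUID_RID(fuid));
	return (0);
}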
int
zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t *valp)
{
char buf[32];
int err;
uint64_t obj;
*valp = 0;
if (!dmu_objset_userspace_present(zfsvfs->z_os))
return (ENOTSUP);
obj = zfs_userquota_prop_to_obj(zfsvfs, type);
if (obj == 0)
return (0);
err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
if (err)
return (err);
err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
if (err == ENOENT)
err = 0;
return (err);
}
int
zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t quota)
{
char buf[32];
int err;
dmu_tx_t *tx;
uint64_t *objp;
boolean_t fuid_dirtied;
if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
return (EINVAL);
if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
return (ENOTSUP);
objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
&zfsvfs->z_groupquota_obj;
err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
if (err)
return (err);
fuid_dirtied = zfsvfs->z_fuid_dirty;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
if (*objp == 0) {
dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
zfs_userquota_prop_prefixes[type]);
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
dmu_tx_abort(tx);
return (err);
}
mutex_enter(&zfsvfs->z_lock);
if (*objp == 0) {
*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
DMU_OT_NONE, 0, tx);
VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
}
mutex_exit(&zfsvfs->z_lock);
if (quota == 0) {
err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
if (err == ENOENT)
err = 0;
} else {
err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
}
ASSERT(err == 0);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
dmu_tx_commit(tx);
return (err);
}
boolean_t
zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
char buf[32];
uint64_t used, quota, usedobj, quotaobj;
int err;
usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
if (quotaobj == 0 || zfsvfs->z_replay)
return (B_FALSE);
(void) sprintf(buf, "%llx", (longlong_t)fuid);
err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
if (err != 0)
return (B_FALSE);
err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
if (err != 0)
return (B_FALSE);
return (used >= quota);
}
int
zfsvfs_create(const char *osname, int mode, zfsvfs_t **zvp)
{
objset_t *os;
zfsvfs_t *zfsvfs;
uint64_t zval;
int i, error;
if (error = dsl_prop_get_integer(osname, "readonly", &zval, NULL))
return (error);
if (zval)
mode |= DS_MODE_READONLY;
error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
if (error == EROFS) {
mode |= DS_MODE_READONLY;
error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &os);
}
if (error)
return (error);
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
* and just bzero up to z_hold_mtx[].
*/
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zfsvfs->z_os = os;
error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
if (error) {
goto out;
} else if (zfsvfs->z_version > ZPL_VERSION) {
(void) printf("Mismatched versions: File system "
"is version %llu on-disk format, which is "
"incompatible with this software version %lld!",
(u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
error = ENOTSUP;
goto out;
}
if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
goto out;
zfsvfs->z_norm = (int)zval;
if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
goto out;
zfsvfs->z_utf8 = (zval != 0);
if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
goto out;
zfsvfs->z_case = (uint_t)zval;
/*
* Fold case on file systems that are always or sometimes case
* insensitive.
*/
if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
zfsvfs->z_case == ZFS_CASE_MIXED)
zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
if (error)
goto out;
ASSERT(zfsvfs->z_root != 0);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
&zfsvfs->z_unlinkedobj);
if (error)
goto out;
error = zap_lookup(os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
8, 1, &zfsvfs->z_userquota_obj);
if (error && error != ENOENT)
goto out;
error = zap_lookup(os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
8, 1, &zfsvfs->z_groupquota_obj);
if (error && error != ENOENT)
goto out;
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
&zfsvfs->z_fuid_obj);
if (error && error != ENOENT)
goto out;
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
&zfsvfs->z_shares_dir);
if (error && error != ENOENT)
goto out;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
rrw_init(&zfsvfs->z_teardown_lock);
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
*zvp = zfsvfs;
return (0);
out:
dmu_objset_close(os);
*zvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
@@ -570,6 +971,12 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);

		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
		if (zil_disable) {
			zil_destroy(zfsvfs->z_log, 0);
			zfsvfs->z_log = NULL;
		}

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
@@ -588,11 +995,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
		else
			zfs_unlinked_drain(zfsvfs);

-		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-		if (zil_disable) {
-			zil_destroy(zfsvfs->z_log, 0);
-			zfsvfs->z_log = NULL;
-		} else {
		if (zfsvfs->z_log) {
			/*
			 * Parse and replay the intent log.
			 *
@@ -630,49 +1033,63 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
	return (0);
}
-static void
-zfs_freezfsvfs(zfsvfs_t *zfsvfs)
void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
	int i;
	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */

	/*
	 * This is a barrier to prevent the filesystem from going away in
	 * zfs_znode_move() until we can safely ensure that the filesystem is
	 * not unmounted. We consider the filesystem valid before the barrier
	 * and invalid after the barrier.
	 */
	rw_enter(&zfsvfs_lock, RW_READER);
	rw_exit(&zfsvfs_lock);

	zfs_fuid_destroy(zfsvfs);

	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_online_recv_lock);
	mutex_destroy(&zfsvfs->z_lock);
	list_destroy(&zfsvfs->z_all_znodes);
	rrw_destroy(&zfsvfs->z_teardown_lock);
	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
	rw_destroy(&zfsvfs->z_fuid_lock);
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
}
}
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	dev_t mount_dev;
-	uint64_t recordsize, readonly;
	uint64_t recordsize, fsid_guid;
	int error = 0;
-	int mode;
	zfsvfs_t *zfsvfs;
-	znode_t *zp = NULL;

	ASSERT(vfsp);
	ASSERT(osname);

-	/*
-	 * Initialize the zfs-specific filesystem structure.
-	 * Should probably make this a kmem cache, shuffle fields,
-	 * and just bzero up to z_hold_mtx[].
-	 */
-	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	error = zfsvfs_create(osname, DS_MODE_OWNER, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;
-	zfsvfs->z_parent = zfsvfs;
-	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
-	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
-
-	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
-	    offsetof(znode_t, z_link_node));
-	rrw_init(&zfsvfs->z_teardown_lock);
-	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);

	/* Initialize the generic filesystem structure. */
	vfsp->vfs_bcount = 0;
@@ -694,39 +1111,24 @@ zfs_domount(vfs_t *vfsp, char *osname)
	vfsp->vfs_flag |= VFS_NOTRUNC;
	vfsp->vfs_data = zfsvfs;

-	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
-		goto out;
-
-	mode = DS_MODE_OWNER;
-	if (readonly)
-		mode |= DS_MODE_READONLY;
-
-	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
-	if (error == EROFS) {
-		mode = DS_MODE_OWNER | DS_MODE_READONLY;
-		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
-		    &zfsvfs->z_os);
-	}
-	if (error)
-		goto out;
-
-	if (error = zfs_init_fs(zfsvfs, &zp))
-		goto out;
-
-	/* The call to zfs_init_fs leaves the vnode held, release it here. */
-	VN_RELE(ZTOV(zp));
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID. The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
	    zfsfstype & 0xFF;

	/*
	 * Set features for file system.
	 */
-	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
-	if (zfsvfs->z_use_fuids) {
-		vfs_set_feature(vfsp, VFSFT_XVATTR);
-		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
-		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
-		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
-	}
	zfs_set_fuid_feature(zfsvfs);

	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
@@ -739,13 +1141,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

-		ASSERT(mode & DS_MODE_READONLY);
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;

		mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
	} else {
		error = zfsvfs_setup(zfsvfs, B_TRUE);
	}

@@ -754,9 +1159,8 @@ zfs_domount(vfs_t *vfsp, char *osname)
		zfsctl_create(zfsvfs);
out:
	if (error) {
-		if (zfsvfs->z_os)
-			dmu_objset_close(zfsvfs->z_os);
-		zfs_freezfsvfs(zfsvfs);
		dmu_objset_close(zfsvfs->z_os);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_add_32(&zfs_active_fs_count, 1);
	}
@ -1067,6 +1471,13 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
error = zfs_domount(vfsp, osname); error = zfs_domount(vfsp, osname);
/*
* Add an extra VFS_HOLD on our parent vfs so that it can't
* disappear due to a forced unmount.
*/
if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
VFS_HOLD(mvp->v_vfsp);
out: out:
pn_free(&spn); pn_free(&spn);
return (error); return (error);
@ -1426,15 +1837,16 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
* 'z_teardown_inactive_lock' write held. * 'z_teardown_inactive_lock' write held.
*/ */
int int
zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *modep)
{ {
int error; int error;
if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error); return (error);
*mode = zfsvfs->z_os->os_mode; *modep = zfsvfs->z_os->os_mode;
dmu_objset_name(zfsvfs->z_os, name); if (name)
dmu_objset_name(zfsvfs->z_os, name);
dmu_objset_close(zfsvfs->z_os); dmu_objset_close(zfsvfs->z_os);
return (0); return (0);
@ -1493,13 +1905,15 @@ static void
zfs_freevfs(vfs_t *vfsp) zfs_freevfs(vfs_t *vfsp)
{ {
zfsvfs_t *zfsvfs = vfsp->vfs_data; zfsvfs_t *zfsvfs = vfsp->vfs_data;
int i;
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) /*
mutex_destroy(&zfsvfs->z_hold_mtx[i]); * If this is a snapshot, we have an extra VFS_HOLD on our parent
* from zfs_mount(). Release it here.
*/
if (zfsvfs->z_issnap)
VFS_RELE(zfsvfs->z_parent->z_vfs);
zfs_fuid_destroy(zfsvfs); zfsvfs_free(zfsvfs);
zfs_freezfsvfs(zfsvfs);
atomic_add_32(&zfs_active_fs_count, -1); atomic_add_32(&zfs_active_fs_count, -1);
} }
@ -1558,6 +1972,8 @@ zfs_init(void)
* Initialize znode cache, vnode ops, etc... * Initialize znode cache, vnode ops, etc...
*/ */
zfs_znode_init(); zfs_znode_init();
dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
} }
void void
@@ -1574,54 +1990,46 @@ zfs_busy(void)
 }

 int
-zfs_set_version(const char *name, uint64_t newvers)
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
     int error;
-    objset_t *os;
+    objset_t *os = zfsvfs->z_os;
     dmu_tx_t *tx;
-    uint64_t curvers;
-
-    /*
-     * XXX for now, require that the filesystem be unmounted.  Would
-     * be nice to find the zfsvfs_t and just update that if
-     * possible.
-     */

     if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
         return (EINVAL);

-    error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
-    if (error)
-        return (error);
-
-    error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
-        8, 1, &curvers);
-    if (error)
-        goto out;
-    if (newvers < curvers) {
-        error = EINVAL;
-        goto out;
-    }
+    if (newvers < zfsvfs->z_version)
+        return (EINVAL);

     tx = dmu_tx_create(os);
-    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
+    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
     error = dmu_tx_assign(tx, TXG_WAIT);
     if (error) {
         dmu_tx_abort(tx);
-        goto out;
+        return (error);
     }
-    error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
-        8, 1, &newvers, tx);
+
+    error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
+        &newvers, tx);
+
+    if (error) {
+        dmu_tx_commit(tx);
+        return (error);
+    }

     spa_history_internal_log(LOG_DS_UPGRADE,
         dmu_objset_spa(os), tx, CRED(),
-        "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
-        dmu_objset_id(os));
+        "oldver=%llu newver=%llu dataset = %llu",
+        zfsvfs->z_version, newvers, dmu_objset_id(os));

     dmu_tx_commit(tx);

-out:
-    dmu_objset_close(os);
-    return (error);
+    zfsvfs->z_version = newvers;
+
+    if (zfsvfs->z_version >= ZPL_VERSION_FUID)
+        zfs_set_fuid_feature(zfsvfs);
+
+    return (0);
 }

 /*

==== zfs_vnops.c ====
@@ -101,6 +101,7 @@
  * pushing cached pages (which acquires range locks) and syncing out
  * cached atime changes.  Third, zfs_zinactive() may require a new tx,
  * which could deadlock the system if you were already holding one.
+ * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  * (3) All range locks must be grabbed before calling dmu_tx_assign(),
  *     as they can span dmu_tx_assign() calls.
@@ -363,7 +364,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
             caddr_t va;

             va = zfs_map_page(pp, S_WRITE);
-            (void) dmu_read(os, oid, start+off, nbytes, va+off);
+            (void) dmu_read(os, oid, start+off, nbytes, va+off,
+                DMU_READ_PREFETCH);
             zfs_unmap_page(pp, va);
             page_unlock(pp);
         }
@@ -567,6 +569,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
     int max_blksz = zfsvfs->z_max_blksz;
     uint64_t pflags;
     int error;
+    arc_buf_t *abuf;

     /*
      * Fasttrack empty write
@@ -663,10 +666,46 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
      * and allows us to do more fine-grained space accounting.
      */
     while (n > 0) {
+        abuf = NULL;
+        woff = uio->uio_loffset;
+again:
+        if (zfs_usergroup_overquota(zfsvfs,
+            B_FALSE, zp->z_phys->zp_uid) ||
+            zfs_usergroup_overquota(zfsvfs,
+            B_TRUE, zp->z_phys->zp_gid)) {
+            if (abuf != NULL)
+                dmu_return_arcbuf(abuf);
+            error = EDQUOT;
+            break;
+        }
+
+        /*
+         * If dmu_assign_arcbuf() is expected to execute with minimum
+         * overhead loan an arc buffer and copy user data to it before
+         * we enter a txg.  This avoids holding a txg forever while we
+         * pagefault on a hanging NFS server mapping.
+         */
+        if (abuf == NULL && n >= max_blksz &&
+            woff >= zp->z_phys->zp_size &&
+            P2PHASE(woff, max_blksz) == 0 &&
+            zp->z_blksz == max_blksz) {
+            size_t cbytes;
+
+            abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+            ASSERT(abuf != NULL);
+            ASSERT(arc_buf_size(abuf) == max_blksz);
+            if (error = uiocopy(abuf->b_data, max_blksz,
+                UIO_WRITE, uio, &cbytes)) {
+                dmu_return_arcbuf(abuf);
+                break;
+            }
+            ASSERT(cbytes == max_blksz);
+        }
+
         /*
          * Start a transaction.
          */
-        woff = uio->uio_loffset;
         tx = dmu_tx_create(zfsvfs->z_os);
         dmu_tx_hold_bonus(tx, zp->z_id);
         dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
@@ -675,9 +714,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
             if (error == ERESTART) {
                 dmu_tx_wait(tx);
                 dmu_tx_abort(tx);
-                continue;
+                goto again;
             }
             dmu_tx_abort(tx);
+            if (abuf != NULL)
+                dmu_return_arcbuf(abuf);
             break;
         }
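
The ERESTART handling above follows the usual DMU pattern: on a TXG_NOWAIT failure, wait for the next open txg, abort, and retry from the again: label so the quota checks rerun and any loaned buffer stays accounted for. Below is a self-contained model of that control flow; tx_assign(), tx_wait(), tx_abort(), and the ERESTART_STUB value are stand-ins for illustration, not the real DMU API.

#include <stdio.h>

#define ERESTART_STUB 91  /* stand-in errno value; assumed, not the DMU's */

static int attempts;

/* dmu_tx_assign(TXG_NOWAIT) stand-in: fail twice, then succeed. */
static int
tx_assign(void)
{
    return (attempts++ < 2 ? ERESTART_STUB : 0);
}

static void tx_wait(void)  { (void) printf("  wait for open txg\n"); }
static void tx_abort(void) { (void) printf("  abort tx\n"); }

int
main(void)
{
    int error;

again:
    /* the pre-checks (quota, buffer loan) would re-run here */
    error = tx_assign();
    if (error) {
        if (error == ERESTART_STUB) {
            tx_wait();
            tx_abort();
            goto again;     /* retry with a fresh tx */
        }
        tx_abort();         /* hard failure: clean up and bail */
        return (1);
    }
    (void) printf("assigned after %d attempt(s)\n", attempts);
    return (0);
}
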
@@ -706,12 +747,22 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
          */
         nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

-        tx_bytes = uio->uio_resid;
-        error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, nbytes, tx);
-        tx_bytes -= uio->uio_resid;
-        if (tx_bytes && vn_has_cached_data(vp))
+        if (abuf == NULL) {
+            tx_bytes = uio->uio_resid;
+            error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
+                nbytes, tx);
+            tx_bytes -= uio->uio_resid;
+        } else {
+            tx_bytes = nbytes;
+            ASSERT(tx_bytes == max_blksz);
+            dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+            ASSERT(tx_bytes <= uio->uio_resid);
+            uioskip(uio, tx_bytes);
+        }
+        if (tx_bytes && vn_has_cached_data(vp)) {
             update_pages(vp, woff,
                 tx_bytes, zfsvfs->z_os, zp->z_id);
+        }

         /*
          * If we made no progress, we're done.  If we made even
@@ -791,10 +842,15 @@ zfs_get_done(dmu_buf_t *db, void *vzgd)
     zgd_t *zgd = (zgd_t *)vzgd;
     rl_t *rl = zgd->zgd_rl;
     vnode_t *vp = ZTOV(rl->r_zp);
+    objset_t *os = rl->r_zp->z_zfsvfs->z_os;

     dmu_buf_rele(db, vzgd);
     zfs_range_unlock(rl);
-    VN_RELE(vp);
+    /*
+     * Release the vnode asynchronously as we currently have the
+     * txg stopped from syncing.
+     */
+    VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
     zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
     kmem_free(zgd, sizeof (zgd_t));
 }
@@ -824,7 +880,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
     if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
         return (ENOENT);
     if (zp->z_unlinked) {
-        VN_RELE(ZTOV(zp));
+        /*
+         * Release the vnode asynchronously as we currently have the
+         * txg stopped from syncing.
+         */
+        VN_RELE_ASYNC(ZTOV(zp),
+            dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
         return (ENOENT);
     }
@@ -842,7 +903,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
             error = ENOENT;
             goto out;
         }
-        VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
+        VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
+            DMU_READ_NO_PREFETCH));
     } else { /* indirect write */
         uint64_t boff; /* block starting offset */
@@ -896,7 +958,11 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
     }
 out:
     zfs_range_unlock(rl);
-    VN_RELE(ZTOV(zp));
+    /*
+     * Release the vnode asynchronously as we currently have the
+     * txg stopped from syncing.
+     */
+    VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
     return (error);
 }
@@ -1074,11 +1140,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
     zfs_dirlock_t *dl;
     dmu_tx_t *tx;
     int error;
-    zfs_acl_t *aclp = NULL;
-    zfs_fuid_info_t *fuidp = NULL;
     ksid_t *ksid;
     uid_t uid;
     gid_t gid = crgetgid(cr);
+    zfs_acl_ids_t acl_ids;
+    boolean_t fuid_dirtied;

     /*
      * If we have an ephemeral id, ACL, or XVATTR then
@@ -1141,21 +1207,9 @@ top:
             if (strcmp(name, "..") == 0)
                 error = EISDIR;
             ZFS_EXIT(zfsvfs);
-            if (aclp)
-                zfs_acl_free(aclp);
             return (error);
         }
     }
-    if (vsecp && aclp == NULL) {
-        error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
-        if (error) {
-            ZFS_EXIT(zfsvfs);
-            if (dl)
-                zfs_dirent_unlock(dl);
-            return (error);
-        }
-    }

     if (zp == NULL) {
         uint64_t txtype;
@@ -1177,30 +1231,28 @@ top:
             goto out;
         }

+        if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
+            &acl_ids)) != 0)
+            goto out;
+        if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+            error = EDQUOT;
+            goto out;
+        }
+
         tx = dmu_tx_create(os);
         dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-        if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
-            IS_EPHEMERAL(gid)) {
-            if (zfsvfs->z_fuid_obj == 0) {
-                dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-                dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-                    FUID_SIZE_ESTIMATE(zfsvfs));
-                dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
-                    FALSE, NULL);
-            } else {
-                dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-                dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-                    FUID_SIZE_ESTIMATE(zfsvfs));
-            }
-        }
+        fuid_dirtied = zfsvfs->z_fuid_dirty;
+        if (fuid_dirtied)
+            zfs_fuid_txhold(zfsvfs, tx);
         dmu_tx_hold_bonus(tx, dzp->z_id);
         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-        if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
+        if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
             dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
                 0, SPA_MAXBLOCKSIZE);
         }
         error = dmu_tx_assign(tx, TXG_NOWAIT);
         if (error) {
+            zfs_acl_ids_free(&acl_ids);
             zfs_dirent_unlock(dl);
             if (error == ERESTART) {
                 dmu_tx_wait(tx);
@@ -1209,19 +1261,21 @@ top:
             }
             dmu_tx_abort(tx);
             ZFS_EXIT(zfsvfs);
-            if (aclp)
-                zfs_acl_free(aclp);
             return (error);
         }
-        zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
+        zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+
+        if (fuid_dirtied)
+            zfs_fuid_sync(zfsvfs, tx);
+
         (void) zfs_link_create(dl, zp, tx, ZNEW);
+
         txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
         if (flag & FIGNORECASE)
             txtype |= TX_CI;
         zfs_log_create(zilog, tx, txtype, dzp, zp, name,
-            vsecp, fuidp, vap);
-        if (fuidp)
-            zfs_fuid_info_free(fuidp);
+            vsecp, acl_ids.z_fuidp, vap);
+        zfs_acl_ids_free(&acl_ids);
         dmu_tx_commit(tx);
     } else {
         int aflags = (flag & FAPPEND) ? V_APPEND : 0;
@@ -1292,8 +1346,6 @@ out:
             *vpp = svp;
         }
     }
-    if (aclp)
-        zfs_acl_free(aclp);

     ZFS_EXIT(zfsvfs);
     return (error);
@@ -1528,12 +1580,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
     uint64_t txtype;
     dmu_tx_t *tx;
     int error;
-    zfs_acl_t *aclp = NULL;
-    zfs_fuid_info_t *fuidp = NULL;
     int zf = ZNEW;
     ksid_t *ksid;
     uid_t uid;
     gid_t gid = crgetgid(cr);
+    zfs_acl_ids_t acl_ids;
+    boolean_t fuid_dirtied;

     ASSERT(vap->va_type == VDIR);
@@ -1594,38 +1646,33 @@ top:
         return (error);
     }

-    if (vsecp && aclp == NULL) {
-        error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
-        if (error) {
-            zfs_dirent_unlock(dl);
-            ZFS_EXIT(zfsvfs);
-            return (error);
-        }
+    if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
+        &acl_ids)) != 0) {
+        zfs_dirent_unlock(dl);
+        ZFS_EXIT(zfsvfs);
+        return (error);
     }
+    if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+        zfs_dirent_unlock(dl);
+        ZFS_EXIT(zfsvfs);
+        return (EDQUOT);
+    }

     /*
      * Add a new entry to the directory.
      */
     tx = dmu_tx_create(zfsvfs->z_os);
     dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
     dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
-    if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) ||
-        IS_EPHEMERAL(gid)) {
-        if (zfsvfs->z_fuid_obj == 0) {
-            dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-            dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-                FUID_SIZE_ESTIMATE(zfsvfs));
-            dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-        } else {
-            dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-            dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-                FUID_SIZE_ESTIMATE(zfsvfs));
-        }
-    }
-    if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
+    fuid_dirtied = zfsvfs->z_fuid_dirty;
+    if (fuid_dirtied)
+        zfs_fuid_txhold(zfsvfs, tx);
+    if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
             0, SPA_MAXBLOCKSIZE);
     error = dmu_tx_assign(tx, TXG_NOWAIT);
     if (error) {
+        zfs_acl_ids_free(&acl_ids);
         zfs_dirent_unlock(dl);
         if (error == ERESTART) {
             dmu_tx_wait(tx);
@@ -1634,19 +1681,16 @@ top:
         }
         dmu_tx_abort(tx);
         ZFS_EXIT(zfsvfs);
-        if (aclp)
-            zfs_acl_free(aclp);
         return (error);
     }

     /*
      * Create new node.
      */
-    zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
+    zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);

-    if (aclp)
-        zfs_acl_free(aclp);
+    if (fuid_dirtied)
+        zfs_fuid_sync(zfsvfs, tx);

     /*
      * Now put new name in parent dir.
      */
@@ -1657,10 +1701,10 @@ top:
     txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
     if (flags & FIGNORECASE)
         txtype |= TX_CI;
-    zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
+    zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+        acl_ids.z_fuidp, vap);

-    if (fuidp)
-        zfs_fuid_info_free(fuidp);
+    zfs_acl_ids_free(&acl_ids);
     dmu_tx_commit(tx);

     zfs_dirent_unlock(dl);
@@ -1969,6 +2013,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
             }
         }

+        if (flags & V_RDDIR_ACCFILTER) {
+            /*
+             * If we have no access at all, don't include
+             * this entry in the returned information
+             */
+            znode_t *ezp;
+            if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+                goto skip_entry;
+            if (!zfs_has_access(ezp, cr)) {
+                VN_RELE(ZTOV(ezp));
+                goto skip_entry;
+            }
+            VN_RELE(ZTOV(ezp));
+        }
+
         if (flags & V_RDDIR_ENTFLAGS)
             reclen = EDIRENT_RECLEN(strlen(zap.za_name));
         else
@@ -2020,6 +2079,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
         if (prefetch)
             dmu_prefetch(os, objnum, 0, 0);

+    skip_entry:
         /*
          * Move to the next entry, fill in the previous offset.
          */
@@ -2120,8 +2180,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     ZFS_VERIFY_ZP(zp);
     pzp = zp->z_phys;

-    mutex_enter(&zp->z_lock);
-
     /*
      * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
      * Also, if we are the owner don't bother, since owner should
@@ -2131,7 +2189,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
         (pzp->zp_uid != crgetuid(cr))) {
         if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
             skipaclchk, cr)) {
-            mutex_exit(&zp->z_lock);
             ZFS_EXIT(zfsvfs);
             return (error);
         }
@@ -2142,6 +2199,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
      * than to determine whether we were asked the question.
      */

+    mutex_enter(&zp->z_lock);
     vap->va_type = vp->v_type;
     vap->va_mode = pzp->zp_mode & MODEMASK;
     zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
@@ -2312,6 +2370,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     uint_t saved_mask;
     int trim_mask = 0;
     uint64_t new_mode;
+    uint64_t new_uid, new_gid;
     znode_t *attrzp;
     int need_policy = FALSE;
     int err;
@@ -2320,6 +2379,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     xoptattr_t *xoap;
     zfs_acl_t *aclp = NULL;
     boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+    boolean_t fuid_dirtied = B_FALSE;

     if (mask == 0)
         return (0);
@@ -2610,30 +2670,14 @@ top:
     tx = dmu_tx_create(zfsvfs->z_os);
     dmu_tx_hold_bonus(tx, zp->z_id);

-    if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
-        ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
-        if (zfsvfs->z_fuid_obj == 0) {
-            dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-            dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-                FUID_SIZE_ESTIMATE(zfsvfs));
-            dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-        } else {
-            dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-            dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-                FUID_SIZE_ESTIMATE(zfsvfs));
-        }
-    }
-
     if (mask & AT_MODE) {
         uint64_t pmode = pzp->zp_mode;

         new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

-        if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
-            dmu_tx_abort(tx);
-            ZFS_EXIT(zfsvfs);
-            return (err);
-        }
+        if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
+            goto out;
+
         if (pzp->zp_acl.z_acl_extern_obj) {
             /* Are we upgrading ACL from old V0 format to new V1 */
             if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
@@ -2655,36 +2699,53 @@ top:
         }
     }

-    if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
-        err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
-        if (err) {
-            dmu_tx_abort(tx);
-            ZFS_EXIT(zfsvfs);
-            if (aclp)
-                zfs_acl_free(aclp);
-            return (err);
+    if (mask & (AT_UID | AT_GID)) {
+        if (pzp->zp_xattr) {
+            err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
+            if (err)
+                goto out;
+            dmu_tx_hold_bonus(tx, attrzp->z_id);
+        }
+        if (mask & AT_UID) {
+            new_uid = zfs_fuid_create(zfsvfs,
+                (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+            if (new_uid != pzp->zp_uid &&
+                zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
+                err = EDQUOT;
+                goto out;
+            }
+        }
+        if (mask & AT_GID) {
+            new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+                cr, ZFS_GROUP, &fuidp);
+            if (new_gid != pzp->zp_gid &&
+                zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
+                err = EDQUOT;
+                goto out;
+            }
+        }
+        fuid_dirtied = zfsvfs->z_fuid_dirty;
+        if (fuid_dirtied) {
+            if (zfsvfs->z_fuid_obj == 0) {
+                dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+                dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+                    FUID_SIZE_ESTIMATE(zfsvfs));
+                dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
+                    FALSE, NULL);
+            } else {
+                dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+                dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+                    FUID_SIZE_ESTIMATE(zfsvfs));
+            }
         }
-        dmu_tx_hold_bonus(tx, attrzp->z_id);
     }

     err = dmu_tx_assign(tx, TXG_NOWAIT);
     if (err) {
-        if (attrzp)
-            VN_RELE(ZTOV(attrzp));
-        if (aclp) {
-            zfs_acl_free(aclp);
-            aclp = NULL;
-        }
-        if (err == ERESTART) {
+        if (err == ERESTART)
             dmu_tx_wait(tx);
-            dmu_tx_abort(tx);
-            goto top;
-        }
-        dmu_tx_abort(tx);
-        ZFS_EXIT(zfsvfs);
-        return (err);
+        goto out;
     }

     dmu_buf_will_dirty(zp->z_dbuf, tx);
@@ -2702,7 +2763,7 @@ top:
     if (mask & AT_MODE) {
         mutex_enter(&zp->z_acl_lock);
         zp->z_phys->zp_mode = new_mode;
-        err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
+        err = zfs_aclset_common(zp, aclp, cr, tx);
         ASSERT3U(err, ==, 0);
         mutex_exit(&zp->z_acl_lock);
     }
@@ -2711,25 +2772,17 @@ top:
         mutex_enter(&attrzp->z_lock);

     if (mask & AT_UID) {
-        pzp->zp_uid = zfs_fuid_create(zfsvfs,
-            vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
-        if (attrzp) {
-            attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
-                vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
-        }
+        pzp->zp_uid = new_uid;
+        if (attrzp)
+            attrzp->z_phys->zp_uid = new_uid;
     }

     if (mask & AT_GID) {
-        pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
-            cr, ZFS_GROUP, tx, &fuidp);
+        pzp->zp_gid = new_gid;
         if (attrzp)
-            attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
-                vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
+            attrzp->z_phys->zp_gid = new_gid;
     }

-    if (aclp)
-        zfs_acl_free(aclp);
-
     if (attrzp)
         mutex_exit(&attrzp->z_lock);
@@ -2791,17 +2844,35 @@ top:
         zfs_xvattr_set(zp, xvap);
     }

+    if (fuid_dirtied)
+        zfs_fuid_sync(zfsvfs, tx);
+
     if (mask != 0)
         zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

-    if (fuidp)
-        zfs_fuid_info_free(fuidp);
     mutex_exit(&zp->z_lock);

+out:
     if (attrzp)
         VN_RELE(ZTOV(attrzp));

-    dmu_tx_commit(tx);
+    if (aclp) {
+        zfs_acl_free(aclp);
+        aclp = NULL;
+    }
+
+    if (fuidp) {
+        zfs_fuid_info_free(fuidp);
+        fuidp = NULL;
+    }
+
+    if (err)
+        dmu_tx_abort(tx);
+    else
+        dmu_tx_commit(tx);
+
+    if (err == ERESTART)
+        goto top;

     ZFS_EXIT(zfsvfs);
     return (err);
@@ -3232,7 +3303,8 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
     int len = strlen(link);
     int error;
     int zflg = ZNEW;
-    zfs_fuid_info_t *fuidp = NULL;
+    zfs_acl_ids_t acl_ids;
+    boolean_t fuid_dirtied;

     ASSERT(vap->va_type == VLNK);
@@ -3267,26 +3339,25 @@ top:
         return (error);
     }

+    VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
+    if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+        zfs_acl_ids_free(&acl_ids);
+        zfs_dirent_unlock(dl);
+        ZFS_EXIT(zfsvfs);
+        return (EDQUOT);
+    }
     tx = dmu_tx_create(zfsvfs->z_os);
+    fuid_dirtied = zfsvfs->z_fuid_dirty;
     dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
     dmu_tx_hold_bonus(tx, dzp->z_id);
     dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-    if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
+    if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
-    if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
-        if (zfsvfs->z_fuid_obj == 0) {
-            dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-            dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-                FUID_SIZE_ESTIMATE(zfsvfs));
-            dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
-        } else {
-            dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-            dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-                FUID_SIZE_ESTIMATE(zfsvfs));
-        }
-    }
+    if (fuid_dirtied)
+        zfs_fuid_txhold(zfsvfs, tx);
     error = dmu_tx_assign(tx, TXG_NOWAIT);
     if (error) {
+        zfs_acl_ids_free(&acl_ids);
         zfs_dirent_unlock(dl);
         if (error == ERESTART) {
             dmu_tx_wait(tx);
@@ -3306,13 +3377,16 @@ top:
      * otherwise, store it just like any other file data.
      */
     if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
-        zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
+        zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
         if (len != 0)
             bcopy(link, zp->z_phys + 1, len);
     } else {
         dmu_buf_t *dbp;

-        zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
+        zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+
+        if (fuid_dirtied)
+            zfs_fuid_sync(zfsvfs, tx);
         /*
          * Nothing can access the znode yet so no locking needed
          * for growing the znode's blocksize.
@@ -3333,15 +3407,14 @@ top:
      * Insert the new object into the directory.
      */
     (void) zfs_link_create(dl, zp, tx, ZNEW);
-out:
+
     if (error == 0) {
         uint64_t txtype = TX_SYMLINK;
         if (flags & FIGNORECASE)
             txtype |= TX_CI;
         zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
     }
-    if (fuidp)
-        zfs_fuid_info_free(fuidp);
+
+    zfs_acl_ids_free(&acl_ids);

     dmu_tx_commit(tx);
@@ -3618,6 +3691,12 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
         pvn_write_done(trunc, flags);
         len = filesz - off;
     }
+
+    if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) ||
+        zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) {
+        err = EDQUOT;
+        goto out;
+    }
 top:
     tx = dmu_tx_create(zfsvfs->z_os);
     dmu_tx_hold_write(tx, zp->z_id, off, len);
@@ -3705,7 +3784,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
     else
         io_off = 0;
     if (len > 0 && ISP2(blksz))
-        io_len = P2ROUNDUP_TYPED(len + (io_off - off), blksz, size_t);
+        io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
     else
         io_len = 0;
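
The one-character fix above matters: io_off is off rounded down to a block boundary, so the kluster length must grow by (off - io_off) rather than by its negation, which underflows for unsigned offsets. Below is a small user-space check of the corrected arithmetic, with the alignment macros written out as they appear in Solaris sysmacros.h; the sample offsets are arbitrary.

#include <stdio.h>

/* Alignment macros as in Solaris <sys/sysmacros.h>. */
#define P2ALIGN(x, align)               ((x) & -(align))
#define P2ROUNDUP_TYPED(x, align, type) (-(-(type)(x) & -(type)(align)))

int
main(void)
{
    size_t blksz = 4096;
    size_t off = 6000, len = 3000;          /* range the VM wants pushed */
    size_t io_off = P2ALIGN(off, blksz);    /* rounded down: 4096 */

    /* Grow len by how far we backed off, then round up to a block. */
    size_t io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);

    /* With (io_off - off) the length would underflow, since off > io_off. */
    (void) printf("io_off=%zu io_len=%zu\n", io_off, io_len);
    return (0);
}
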
@@ -3869,7 +3948,8 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
  * If we can't find a page in the cache, we will create a new page
  * and fill it with file data.  For efficiency, we may try to fill
  * multiple pages at once (klustering) to fill up the supplied page
- * list.
+ * list.  Note that the pages to be filled are held with an exclusive
+ * lock to prevent access by other threads while they are being filled.
  */
 static int
 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
@@ -3888,7 +3968,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
          */
         io_off = off;
         io_len = PAGESIZE;
-        pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr);
+        pp = page_create_va(vp, io_off, io_len,
+            PG_EXCL | PG_WAIT, seg, addr);
     } else {
         /*
          * Try to find enough pages to fill the page list
@@ -3913,7 +3994,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
         ASSERT3U(io_off, ==, cur_pp->p_offset);
         va = zfs_map_page(cur_pp, S_WRITE);
-        err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va);
+        err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+            DMU_READ_PREFETCH);
         zfs_unmap_page(cur_pp, va);
         if (err) {
             /* On error, toss the entire kluster */
@@ -3991,7 +4073,7 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
         *protp = PROT_ALL;

     /*
-     * Loop through the requested range [off, off + len] looking
+     * Loop through the requested range [off, off + len) looking
      * for pages.  If we don't find a page, we will need to create
      * a new page and fill it with data from the file.
      */
@@ -4337,6 +4419,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
             (vp->v_type == VREG || vp->v_type == VDIR);
         return (0);

+    case _PC_ACCESS_FILTERING:
+        *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
+            vp->v_type == VDIR;
+        return (0);
+
     case _PC_ACL_ENABLED:
         *valp = _ACL_ACE_ENABLED;
         return (0);
@@ -4488,6 +4575,22 @@ const fs_operation_def_t zfs_symvnodeops_template[] = {
     NULL,           NULL
 };

+/*
+ * special share hidden files vnode operations template
+ */
+vnodeops_t *zfs_sharevnodeops;
+const fs_operation_def_t zfs_sharevnodeops_template[] = {
+    VOPNAME_GETATTR,    { .vop_getattr = zfs_getattr },
+    VOPNAME_ACCESS,     { .vop_access = zfs_access },
+    VOPNAME_INACTIVE,   { .vop_inactive = zfs_inactive },
+    VOPNAME_FID,        { .vop_fid = zfs_fid },
+    VOPNAME_PATHCONF,   { .vop_pathconf = zfs_pathconf },
+    VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
+    VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
+    VOPNAME_VNEVENT,    { .vop_vnevent = fs_vnevent_support },
+    NULL,               NULL
+};
+
 /*
  * Extended attribute directory vnode operations template
  * This template is identical to the directory vnodes

==== zfs_znode.c ====
@@ -87,6 +87,12 @@
  * (such as VFS logic) that will not compile easily in userland.
  */
 #ifdef _KERNEL
+/*
+ * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
+ * be freed before it can be safely accessed.
+ */
+krwlock_t zfsvfs_lock;
+
 static kmem_cache_t *znode_cache = NULL;

 /*ARGSUSED*/
@@ -154,8 +160,9 @@ zfs_znode_cache_destructor(void *buf, void *arg)
 #ifdef ZNODE_STATS
 static struct {
     uint64_t zms_zfsvfs_invalid;
+    uint64_t zms_zfsvfs_recheck1;
     uint64_t zms_zfsvfs_unmounted;
-    uint64_t zms_zfsvfs_recheck_invalid;
+    uint64_t zms_zfsvfs_recheck2;
     uint64_t zms_obj_held;
     uint64_t zms_vnode_locked;
     uint64_t zms_not_only_dnlc;
@@ -206,17 +213,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
     POINTER_INVALIDATE(&ozp->z_zfsvfs);
 }

-/*
- * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise
- * returns a non-zero error code.
- */
-static int
-zfs_enter(zfsvfs_t *zfsvfs)
-{
-    ZFS_ENTER(zfsvfs);
-    return (0);
-}
-
 /*ARGSUSED*/
 static kmem_cbrc_t
 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
@@ -240,12 +236,32 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
     }

     /*
-     * Ensure that the filesystem is not unmounted during the move.
+     * Close a small window in which it's possible that the filesystem could
+     * be unmounted and freed, and zfsvfs, though valid in the previous
+     * statement, could point to unrelated memory by the time we try to
+     * prevent the filesystem from being unmounted.
      */
-    if (zfs_enter(zfsvfs) != 0) {       /* ZFS_ENTER */
+    rw_enter(&zfsvfs_lock, RW_WRITER);
+    if (zfsvfs != ozp->z_zfsvfs) {
+        rw_exit(&zfsvfs_lock);
+        ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
+        return (KMEM_CBRC_DONT_KNOW);
+    }
+
+    /*
+     * If the znode is still valid, then so is the file system. We know that
+     * no valid file system can be freed while we hold zfsvfs_lock, so we
+     * can safely ensure that the filesystem is not and will not be
+     * unmounted. The next statement is equivalent to ZFS_ENTER().
+     */
+    rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+    if (zfsvfs->z_unmounted) {
+        ZFS_EXIT(zfsvfs);
+        rw_exit(&zfsvfs_lock);
         ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
         return (KMEM_CBRC_DONT_KNOW);
     }
+    rw_exit(&zfsvfs_lock);

     mutex_enter(&zfsvfs->z_znodes_lock);
     /*
@@ -255,7 +271,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
     if (zfsvfs != ozp->z_zfsvfs) {
         mutex_exit(&zfsvfs->z_znodes_lock);
         ZFS_EXIT(zfsvfs);
-        ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid);
+        ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
         return (KMEM_CBRC_DONT_KNOW);
     }
@@ -311,6 +327,7 @@ zfs_znode_init(void)
     /*
      * Initialize zcache
      */
+    rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
     ASSERT(znode_cache == NULL);
     znode_cache = kmem_cache_create("zfs_znode_cache",
         sizeof (znode_t), 0, zfs_znode_cache_constructor,
@@ -332,6 +349,7 @@ zfs_znode_fini(void)
     if (znode_cache)
         kmem_cache_destroy(znode_cache);
     znode_cache = NULL;
+    rw_destroy(&zfsvfs_lock);
 }

 struct vnodeops *zfs_dvnodeops;
@@ -339,6 +357,7 @@ struct vnodeops *zfs_fvnodeops;
 struct vnodeops *zfs_symvnodeops;
 struct vnodeops *zfs_xdvnodeops;
 struct vnodeops *zfs_evnodeops;
+struct vnodeops *zfs_sharevnodeops;

 void
 zfs_remove_op_tables()
@@ -363,12 +382,15 @@ zfs_remove_op_tables()
         vn_freevnodeops(zfs_xdvnodeops);
     if (zfs_evnodeops)
         vn_freevnodeops(zfs_evnodeops);
+    if (zfs_sharevnodeops)
+        vn_freevnodeops(zfs_sharevnodeops);

     zfs_dvnodeops = NULL;
     zfs_fvnodeops = NULL;
     zfs_symvnodeops = NULL;
     zfs_xdvnodeops = NULL;
     zfs_evnodeops = NULL;
+    zfs_sharevnodeops = NULL;
 }

 extern const fs_operation_def_t zfs_dvnodeops_template[];
@@ -376,6 +398,7 @@ extern const fs_operation_def_t zfs_fvnodeops_template[];
 extern const fs_operation_def_t zfs_xdvnodeops_template[];
 extern const fs_operation_def_t zfs_symvnodeops_template[];
 extern const fs_operation_def_t zfs_evnodeops_template[];
+extern const fs_operation_def_t zfs_sharevnodeops_template[];

 int
 zfs_create_op_tables()
@@ -412,103 +435,58 @@ zfs_create_op_tables()
     error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
         &zfs_evnodeops);
+    if (error)
+        return (error);
+
+    error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
+        &zfs_sharevnodeops);

     return (error);
 }
-/*
- * zfs_init_fs - Initialize the zfsvfs struct and the file system
- *      incore "master" object.  Verify version compatibility.
- */
 int
-zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 {
-    extern int zfsfstype;
-
-    objset_t *os = zfsvfs->z_os;
-    int i, error;
-    uint64_t fsid_guid;
-    uint64_t zval;
-
-    *zpp = NULL;
-
-    error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
-    if (error) {
-        return (error);
-    } else if (zfsvfs->z_version > ZPL_VERSION) {
-        (void) printf("Mismatched versions:  File system "
-            "is version %llu on-disk format, which is "
-            "incompatible with this software version %lld!",
-            (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
-        return (ENOTSUP);
-    }
-
-    if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
-        return (error);
-    zfsvfs->z_norm = (int)zval;
-    if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
-        return (error);
-    zfsvfs->z_utf8 = (zval != 0);
-    if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
-        return (error);
-    zfsvfs->z_case = (uint_t)zval;
-
-    /*
-     * Fold case on file systems that are always or sometimes case
-     * insensitive.
-     */
-    if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
-        zfsvfs->z_case == ZFS_CASE_MIXED)
-        zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
-
-    /*
-     * The fsid is 64 bits, composed of an 8-bit fs type, which
-     * separates our fsid from any other filesystem types, and a
-     * 56-bit objset unique ID.  The objset unique ID is unique to
-     * all objsets open on this system, provided by unique_create().
-     * The 8-bit fs type must be put in the low bits of fsid[1]
-     * because that's where other Solaris filesystems put it.
-     */
-    fsid_guid = dmu_objset_fsid_guid(os);
-    ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
-    zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
-    zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
-        zfsfstype & 0xFF;
-
-    error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
-        &zfsvfs->z_root);
-    if (error)
-        return (error);
-    ASSERT(zfsvfs->z_root != 0);
-
-    error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
-        &zfsvfs->z_unlinkedobj);
-    if (error)
-        return (error);
-
-    /*
-     * Initialize zget mutex's
-     */
-    for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-        mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
-
-    error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
-    if (error) {
-        /*
-         * On error, we destroy the mutexes here since it's not
-         * possible for the caller to determine if the mutexes were
-         * initialized properly.
-         */
-        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-            mutex_destroy(&zfsvfs->z_hold_mtx[i]);
-        return (error);
-    }
-    ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
-    error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
-        &zfsvfs->z_fuid_obj);
-    if (error == ENOENT)
-        error = 0;
-
-    return (0);
+    zfs_acl_ids_t acl_ids;
+    vattr_t vattr;
+    znode_t *sharezp;
+    vnode_t *vp;
+    znode_t *zp;
+    int error;
+
+    vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
+    vattr.va_type = VDIR;
+    vattr.va_mode = S_IFDIR|0555;
+    vattr.va_uid = crgetuid(kcred);
+    vattr.va_gid = crgetgid(kcred);
+
+    sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+    sharezp->z_unlinked = 0;
+    sharezp->z_atime_dirty = 0;
+    sharezp->z_zfsvfs = zfsvfs;
+
+    vp = ZTOV(sharezp);
+    vn_reinit(vp);
+    vp->v_type = VDIR;
+
+    VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+        kcred, NULL, &acl_ids));
+    zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
+        &zp, 0, &acl_ids);
+    ASSERT3P(zp, ==, sharezp);
+    ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
+    POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+    error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+        ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+    zfsvfs->z_shares_dir = sharezp->z_id;
+
+    zfs_acl_ids_free(&acl_ids);
+    ZTOV(sharezp)->v_count = 0;
+    dmu_buf_rele(sharezp->z_dbuf, NULL);
+    sharezp->z_dbuf = NULL;
+    kmem_cache_free(znode_cache, sharezp);
+
+    return (error);
 }
 /*
@@ -676,7 +654,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
         break;
     case VREG:
         vp->v_flag |= VMODSORT;
-        vn_setops(vp, zfs_fvnodeops);
+        if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir)
+            vn_setops(vp, zfs_sharevnodeops);
+        else
+            vn_setops(vp, zfs_fvnodeops);
         break;
     case VLNK:
         vn_setops(vp, zfs_symvnodeops);
@@ -720,8 +701,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
-    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp,
-    zfs_fuid_info_t **fuidp)
+    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
 {
     dmu_buf_t *db;
     znode_phys_t *pzp;
@@ -846,7 +826,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
          */
         *zpp = dzp;
     }
-    zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp);
+    pzp->zp_uid = acl_ids->z_fuid;
+    pzp->zp_gid = acl_ids->z_fgid;
+    pzp->zp_mode = acl_ids->z_mode;
+    VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+    if (vap->va_mask & AT_XVATTR)
+        zfs_xvattr_set(*zpp, (xvattr_t *)vap);
 }

 void
@@ -1474,7 +1459,7 @@ void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
     zfsvfs_t zfsvfs;
-    uint64_t moid, doid, version;
+    uint64_t moid, obj, version;
     uint64_t sense = ZFS_CASE_SENSITIVE;
     uint64_t norm = 0;
     nvpair_t *elem;
@@ -1483,6 +1468,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
     vnode_t *vp;
     vattr_t vattr;
     znode_t *zp;
+    zfs_acl_ids_t acl_ids;

     /*
      * First attempt to create master node.
@@ -1499,12 +1485,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
     /*
      * Set starting attributes.
      */
-    if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+    if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
         version = ZPL_VERSION;
+    else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+        version = ZPL_VERSION_USERSPACE - 1;
     else
         version = ZPL_VERSION_FUID - 1;
-    error = zap_update(os, moid, ZPL_VERSION_STR,
-        8, 1, &version, tx);

     elem = NULL;
     while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
         /* For the moment we expect all zpl props to be uint64_ts */
@@ -1515,9 +1501,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
         VERIFY(nvpair_value_uint64(elem, &val) == 0);
         name = nvpair_name(elem);
         if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
-            version = val;
-            error = zap_update(os, moid, ZPL_VERSION_STR,
-                8, 1, &version, tx);
+            if (val < version)
+                version = val;
         } else {
             error = zap_update(os, moid, name, 8, 1, &val, tx);
         }
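
The version selection above picks the newest ZPL version the pool can support and then lets a caller-supplied version property only lower it. Below is a sketch of that mapping as a standalone function; the numeric constants mirror the on-disk versions of this era and are written out as assumptions for illustration, not authoritative definitions.

#include <stdint.h>
#include <stdio.h>

#define SPA_VERSION_FUID        9ULL    /* assumed */
#define SPA_VERSION_USERSPACE   15ULL   /* assumed */
#define ZPL_VERSION             4ULL    /* assumed */
#define ZPL_VERSION_FUID        3ULL    /* assumed */
#define ZPL_VERSION_USERSPACE   4ULL    /* assumed */

static uint64_t
initial_zpl_version(uint64_t spa_version)
{
    if (spa_version >= SPA_VERSION_USERSPACE)
        return (ZPL_VERSION);
    else if (spa_version >= SPA_VERSION_FUID)
        return (ZPL_VERSION_USERSPACE - 1);
    else
        return (ZPL_VERSION_FUID - 1);
}

int
main(void)
{
    uint64_t v;

    for (v = 8; v <= 16; v += 4)
        (void) printf("spa %llu -> zpl %llu\n",
            (unsigned long long)v,
            (unsigned long long)initial_zpl_version(v));
    return (0);
}
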
@@ -1528,13 +1513,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
             sense = val;
     }
     ASSERT(version != 0);
+    error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

     /*
      * Create a delete queue.
      */
-    doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
-    error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
+    obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+    error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
     ASSERT(error == 0);

     /*
@@ -1575,17 +1561,28 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
     ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
     rootzp->z_zfsvfs = &zfsvfs;

-    zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL);
+    VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+        cr, NULL, &acl_ids));
+    zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
     ASSERT3P(zp, ==, rootzp);
     ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */
     error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
     ASSERT(error == 0);
+    zfs_acl_ids_free(&acl_ids);
     POINTER_INVALIDATE(&rootzp->z_zfsvfs);

     ZTOV(rootzp)->v_count = 0;
     dmu_buf_rele(rootzp->z_dbuf, NULL);
     rootzp->z_dbuf = NULL;
     kmem_cache_free(znode_cache, rootzp);
+
+    /*
+     * Create shares directory
+     */
+    error = zfs_create_share_dir(&zfsvfs, tx);
+    ASSERT(error == 0);
 }
 #endif /* _KERNEL */

==== zil.c ====
@@ -19,12 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */

 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
@@ -471,34 +472,22 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 }

 /*
- * zil_rollback_destroy() is only called by the rollback code.
- * We already have a syncing tx.  Rollback has exclusive access to the
- * dataset, so we don't have to worry about concurrent zil access.
- * The actual freeing of any log blocks occurs in zil_sync() later in
- * this txg syncing phase.
+ * return true if the initial log block is not valid
  */
-void
-zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx)
+static boolean_t
+zil_empty(zilog_t *zilog)
 {
     const zil_header_t *zh = zilog->zl_header;
-    uint64_t txg;
+    arc_buf_t *abuf = NULL;

     if (BP_IS_HOLE(&zh->zh_log))
-        return;
+        return (B_TRUE);

-    txg = dmu_tx_get_txg(tx);
-    ASSERT3U(zilog->zl_destroy_txg, <, txg);
-    zilog->zl_destroy_txg = txg;
-    zilog->zl_keep_first = B_FALSE;
+    if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
+        return (B_TRUE);

-    /*
-     * Ensure there's no outstanding ZIL IO.  No lwbs or just the
-     * unused one that allocated in advance is ok.
-     */
-    ASSERT(zilog->zl_lwb_list.list_head.list_next ==
-        zilog->zl_lwb_list.list_head.list_prev);
-    (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record,
-        tx, zh->zh_claim_txg);
+    VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+    return (B_FALSE);
 }

 int
@@ -520,6 +509,30 @@ zil_claim(char *osname, void *txarg)
     zilog = dmu_objset_zil(os);
     zh = zil_header_in_syncing_context(zilog);

+    if (zilog->zl_spa->spa_log_state == SPA_LOG_CLEAR) {
+        if (!BP_IS_HOLE(&zh->zh_log))
+            zio_free_blk(zilog->zl_spa, &zh->zh_log, first_txg);
+        BP_ZERO(&zh->zh_log);
+        dsl_dataset_dirty(dmu_objset_ds(os), tx);
+    }
+
+    /*
+     * Record here whether the zil has any records to replay.
+     * If the header block pointer is null or the block points
+     * to the stubby then we know there are no valid log records.
+     * We use the header to store this state as the the zilog gets
+     * freed later in dmu_objset_close().
+     * The flags (and the rest of the header fields) are cleared in
+     * zil_sync() as a result of a zil_destroy(), after replaying the log.
+     *
+     * Note, the intent log can be empty but still need the
+     * stubby to be claimed.
+     */
+    if (!zil_empty(zilog)) {
+        zh->zh_flags |= ZIL_REPLAY_NEEDED;
+        dsl_dataset_dirty(dmu_objset_ds(os), tx);
+    }
+
     /*
      * Claim all log blocks if we haven't already done so, and remember
      * the highest claimed sequence number.  This ensures that if we can
@@ -587,36 +600,6 @@ zil_check_log_chain(char *osname, void *txarg)
     return (error);
 }

-/*
- * Clear a log chain
- */
-/* ARGSUSED */
-int
-zil_clear_log_chain(char *osname, void *txarg)
-{
-    zilog_t *zilog;
-    zil_header_t *zh;
-    objset_t *os;
-    dmu_tx_t *tx;
-    int error;
-
-    error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
-    if (error) {
-        cmn_err(CE_WARN, "can't open objset for %s", osname);
-        return (0);
-    }
-
-    zilog = dmu_objset_zil(os);
-    tx = dmu_tx_create(zilog->zl_os);
-    (void) dmu_tx_assign(tx, TXG_WAIT);
-    zh = zil_header_in_syncing_context(zilog);
-    BP_ZERO(&zh->zh_log);
-    dsl_dataset_dirty(dmu_objset_ds(os), tx);
-    dmu_tx_commit(tx);
-    dmu_objset_close(os);
-
-    return (0);
-}
-
 static int
 zil_vdev_compare(const void *x1, const void *x2)
 {
@@ -719,18 +702,26 @@ zil_lwb_write_done(zio_t *zio)
     ASSERT(zio->io_bp->blk_fill == 0);

     /*
-     * Now that we've written this log block, we have a stable pointer
-     * to the next block in the chain, so it's OK to let the txg in
-     * which we allocated the next block sync.
+     * Ensure the lwb buffer pointer is cleared before releasing
+     * the txg. If we have had an allocation failure and
+     * the txg is waiting to sync then we want want zil_sync()
+     * to remove the lwb so that it's not picked up as the next new
+     * one in zil_commit_writer(). zil_sync() will only remove
+     * the lwb if lwb_buf is null.
      */
-    txg_rele_to_sync(&lwb->lwb_txgh);
-
     zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
     mutex_enter(&zilog->zl_lock);
     lwb->lwb_buf = NULL;
     if (zio->io_error)
         zilog->zl_log_error = B_TRUE;
     mutex_exit(&zilog->zl_lock);
+
+    /*
+     * Now that we've written this log block, we have a stable pointer
+     * to the next block in the chain, so it's OK to let the txg in
+     * which we allocated the next block sync.
+     */
+    txg_rele_to_sync(&lwb->lwb_txgh);
 }

 /*
@@ -752,9 +743,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
     }
     if (lwb->lwb_zio == NULL) {
         lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
-            0, &lwb->lwb_blk, lwb->lwb_buf,
-            lwb->lwb_sz, zil_lwb_write_done, lwb,
-            ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb);
+            0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz,
+            zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
+            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb);
     }
 }
@@ -1040,7 +1031,7 @@ zil_clean(zilog_t *zilog)
     if ((itx != NULL) &&
         (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
         (void) taskq_dispatch(zilog->zl_clean_taskq,
-            (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+            (task_func_t *)zil_itx_clean, zilog, TQ_SLEEP);
     }
     mutex_exit(&zilog->zl_lock);
 }
@@ -1216,6 +1207,13 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
     spa_t *spa = zilog->zl_spa;
     lwb_t *lwb;

+    /*
+     * We don't zero out zl_destroy_txg, so make sure we don't try
+     * to destroy it twice.
+     */
+    if (spa_sync_pass(spa) != 1)
+        return;
+
     mutex_enter(&zilog->zl_lock);

     ASSERT(zilog->zl_stop_sync == 0);
@@ -1226,7 +1224,6 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
         blkptr_t blk = zh->zh_log;

         ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
-        ASSERT(spa_sync_pass(spa) == 1);

         bzero(zh, sizeof (zil_header_t));
         bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
@@ -1245,12 +1242,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx)
         }
     }

-    for (;;) {
-        lwb = list_head(&zilog->zl_lwb_list);
-        if (lwb == NULL) {
-            mutex_exit(&zilog->zl_lock);
-            return;
-        }
+    while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
         zh->zh_log = lwb->lwb_blk;
         if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
             break;
@@ -1343,25 +1335,6 @@ zil_free(zilog_t *zilog)
     kmem_free(zilog, sizeof (zilog_t));
 }

-/*
- * return true if the initial log block is not valid
- */
-static boolean_t
-zil_empty(zilog_t *zilog)
-{
-    const zil_header_t *zh = zilog->zl_header;
-    arc_buf_t *abuf = NULL;
-
-    if (BP_IS_HOLE(&zh->zh_log))
-        return (B_TRUE);
-
-    if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0)
-        return (B_TRUE);
-
-    VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
-    return (B_FALSE);
-}
-
 /*
  * Open an intent log.
  */
@@ -1417,7 +1390,7 @@ zil_suspend(zilog_t *zilog)
     const zil_header_t *zh = zilog->zl_header;

     mutex_enter(&zilog->zl_lock);
-    if (zh->zh_claim_txg != 0) {        /* unplayed log */
+    if (zh->zh_flags & ZIL_REPLAY_NEEDED) {     /* unplayed log */
         mutex_exit(&zilog->zl_lock);
         return (EBUSY);
     }
@@ -1601,7 +1574,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
     const zil_header_t *zh = zilog->zl_header;
     zil_replay_arg_t zr;

-    if (zil_empty(zilog)) {
+    if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
         zil_destroy(zilog, B_TRUE);
         return;
     }
@@ -1671,3 +1644,24 @@ out:
     mutex_exit(&zilog->zl_lock);
     return (ret);
 }
+
+/* ARGSUSED */
+int
+zil_vdev_offline(char *osname, void *arg)
+{
+    objset_t *os;
+    zilog_t *zilog;
+    int error;
+
+    error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
+    if (error)
+        return (error);
+
+    zilog = dmu_objset_zil(os);
+    if (zil_suspend(zilog) != 0)
+        error = EEXIST;
+    else
+        zil_resume(zilog);
+    dmu_objset_close(os);
+
+    return (error);
+}
