Merge commit 'refs/top-bases/linux-kernel-module' into linux-kernel-module

Conflicts: module/zcommon/zfs_comutil.c module/zfs/arc.c module/zfs/dmu_objset.c module/zfs/dmu_tx.c module/zfs/dsl_dataset.c module/zfs/dsl_dir.c module/zfs/spa.c module/zfs/spa_boot.c module/zfs/spa_misc.c module/zfs/vdev.c module/zfs/zil.c module/zfs/zio.c
2010-05-28 16:01:35 -07:00 · 2010-05-28 16:01:35 -07:00 · 97c49607ec
parent 65334acb7d 812761eac5
commit 97c49607ec
174 changed files with 35845 additions and 14621 deletions
--- a/ZFS.RELEASE
+++ b/ZFS.RELEASE
@ -1 +1 @@
-http://dlc.sun.com/osol/on/downloads/b121/on-src.tar.bz2
+ssh://anon@hg.opensolaris.org/hg/onnv/onnv-gate/onnv_141
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
--- a/cmd/zdb/zdb_il.c
+++ b/cmd/zdb/zdb_il.c
@ -40,12 +40,14 @@

 extern uint8_t dump_opt[256];

+static char prefix[4] = "\t\t\t";
+
 static void
 print_log_bp(const blkptr_t *bp, const char *prefix)
 {
 	char blkbuf[BP_SPRINTF_LEN];

-	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp);
+	sprintf_blkptr(blkbuf, bp);
 	(void) printf("%s%s\n", prefix, blkbuf);
 }

@ -54,19 +56,29 @@ static void
 zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
 {
 	time_t crtime = lr->lr_crtime[0];
-	char *name = (char *)(lr + 1);
-	char *link = name + strlen(name) + 1;
+	char *name, *link;
+	lr_attr_t *lrattr;

-	if (txtype == TX_SYMLINK)
-		(void) printf("\t\t\t%s -> %s\n", name, link);
-	else
-		(void) printf("\t\t\t%s\n", name);
+	name = (char *)(lr + 1);

-	(void) printf("\t\t\t%s", ctime(&crtime));
-	(void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n",
+	if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
+	    lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
+		lrattr = (lr_attr_t *)(lr + 1);
+		name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+	}
+
+	if (txtype == TX_SYMLINK) {
+		link = name + strlen(name) + 1;
+		(void) printf("%s%s -> %s\n", prefix, name, link);
+	} else if (txtype != TX_MKXATTR) {
+		(void) printf("%s%s\n", prefix, name);
+	}
+
+	(void) printf("%s%s", prefix, ctime(&crtime));
+	(void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix,
 	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid,
 	    (longlong_t)lr->lr_mode);
-	(void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+	(void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix,
 	    (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
 	    (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
 }
@ -75,7 +87,7 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr)
 static void
 zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
 {
-	(void) printf("\t\t\tdoid %llu, name %s\n",
+	(void) printf("%sdoid %llu, name %s\n", prefix,
 	    (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
 }

@ -83,7 +95,7 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr)
 static void
 zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr)
 {
-	(void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n",
+	(void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix,
 	    (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
 	    (char *)(lr + 1));
 }
@ -95,9 +107,9 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
 	char *snm = (char *)(lr + 1);
 	char *tnm = snm + strlen(snm) + 1;

-	(void) printf("\t\t\tsdoid %llu, tdoid %llu\n",
+	(void) printf("%ssdoid %llu, tdoid %llu\n", prefix,
 	    (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
-	(void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm);
+	(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
 }

 /* ARGSUSED */
@ -106,43 +118,48 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 {
 	char *data, *dlimit;
 	blkptr_t *bp = &lr->lr_blkptr;
+	zbookmark_t zb;
 	char buf[SPA_MAXBLOCKSIZE];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int error;

-	(void) printf("\t\t\tfoid %llu, offset 0x%llx,"
-	    " length 0x%llx, blkoff 0x%llx\n",
-	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
-	    (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff);
+	(void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);

-	if (verbose < 5)
+	if (txtype == TX_WRITE2 || verbose < 5)
 		return;

 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
-		(void) printf("\t\t\thas blkptr, %s\n",
+		(void) printf("%shas blkptr, %s\n", prefix,
 		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
-		print_log_bp(bp, "\t\t\t");
+		print_log_bp(bp, prefix);
+
+		if (BP_IS_HOLE(bp)) {
+			(void) printf("\t\t\tLSIZE 0x%llx\n",
+			    (u_longlong_t)BP_GET_LSIZE(bp));
+		}
 		if (bp->blk_birth == 0) {
 			bzero(buf, sizeof (buf));
-		} else {
-			zbookmark_t zb;
+			(void) printf("%s<hole>\n", prefix);
+			return;
+		}
+		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+			(void) printf("%s<block already committed>\n", prefix);
+			return;
+		}

-			ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==,
-			    dmu_objset_id(zilog->zl_os));
-
-			zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
-			zb.zb_object = 0;
-			zb.zb_level = -1;
-			zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));

 		error = zio_wait(zio_read(NULL, zilog->zl_spa,
 		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 		if (error)
 			return;
-		}
-		data = buf + lr->lr_blkoff;
+		data = buf;
 	} else {
 		data = (char *)(lr + 1);
 	}
@ -150,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 	dlimit = data + MIN(lr->lr_length,
 	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));

-	(void) printf("\t\t\t");
+	(void) printf("%s", prefix);
 	while (data < dlimit) {
 		if (isprint(*data))
 			(void) printf("%c ", *data);
@ -165,7 +182,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 static void
 zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr)
 {
-	(void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n",
+	(void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix,
 	    (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
 	    (u_longlong_t)lr->lr_length);
 }
@ -177,38 +194,38 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
 	time_t atime = (time_t)lr->lr_atime[0];
 	time_t mtime = (time_t)lr->lr_mtime[0];

-	(void) printf("\t\t\tfoid %llu, mask 0x%llx\n",
+	(void) printf("%sfoid %llu, mask 0x%llx\n", prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);

 	if (lr->lr_mask & AT_MODE) {
-		(void) printf("\t\t\tAT_MODE  %llo\n",
+		(void) printf("%sAT_MODE  %llo\n", prefix,
 		    (longlong_t)lr->lr_mode);
 	}

 	if (lr->lr_mask & AT_UID) {
-		(void) printf("\t\t\tAT_UID   %llu\n",
+		(void) printf("%sAT_UID   %llu\n", prefix,
 		    (u_longlong_t)lr->lr_uid);
 	}

 	if (lr->lr_mask & AT_GID) {
-		(void) printf("\t\t\tAT_GID   %llu\n",
+		(void) printf("%sAT_GID   %llu\n", prefix,
 		    (u_longlong_t)lr->lr_gid);
 	}

 	if (lr->lr_mask & AT_SIZE) {
-		(void) printf("\t\t\tAT_SIZE  %llu\n",
+		(void) printf("%sAT_SIZE  %llu\n", prefix,
 		    (u_longlong_t)lr->lr_size);
 	}

 	if (lr->lr_mask & AT_ATIME) {
-		(void) printf("\t\t\tAT_ATIME %llu.%09llu %s",
+		(void) printf("%sAT_ATIME %llu.%09llu %s", prefix,
 		    (u_longlong_t)lr->lr_atime[0],
 		    (u_longlong_t)lr->lr_atime[1],
 		    ctime(&atime));
 	}

 	if (lr->lr_mask & AT_MTIME) {
-		(void) printf("\t\t\tAT_MTIME %llu.%09llu %s",
+		(void) printf("%sAT_MTIME %llu.%09llu %s", prefix,
 		    (u_longlong_t)lr->lr_mtime[0],
 		    (u_longlong_t)lr->lr_mtime[1],
 		    ctime(&mtime));
@ -219,7 +236,7 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr)
 static void
 zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr)
 {
-	(void) printf("\t\t\tfoid %llu, aclcnt %llu\n",
+	(void) printf("%sfoid %llu, aclcnt %llu\n", prefix,
 	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
 }

@ -251,10 +268,11 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
 	{ (zil_prt_rec_func_t)zil_prt_rec_create,	"TX_MKDIR_ACL       " },
 	{ (zil_prt_rec_func_t)zil_prt_rec_create,	"TX_MKDIR_ATTR      " },
 	{ (zil_prt_rec_func_t)zil_prt_rec_create,	"TX_MKDIR_ACL_ATTR  " },
+	{ (zil_prt_rec_func_t)zil_prt_rec_write,	"TX_WRITE2          " },
 };

 /* ARGSUSED */
-static void
+static int
 print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
 {
 	int txtype;
@ -278,23 +296,24 @@ print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)

 	zil_rec_info[txtype].zri_count++;
 	zil_rec_info[0].zri_count++;
+
+	return (0);
 }

 /* ARGSUSED */
-static void
+static int
 print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
-	char blkbuf[BP_SPRINTF_LEN];
+	char blkbuf[BP_SPRINTF_LEN + 10];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	char *claim;

 	if (verbose <= 3)
-		return;
+		return (0);

 	if (verbose >= 5) {
 		(void) strcpy(blkbuf, ", ");
-		sprintf_blkptr(blkbuf + strlen(blkbuf),
-		    BP_SPRINTF_LEN - strlen(blkbuf), bp);
+		sprintf_blkptr(blkbuf + strlen(blkbuf), bp);
 	} else {
 		blkbuf[0] = '\0';
 	}
@ -308,6 +327,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)

 	(void) printf("\tBlock seqno %llu, %s%s\n",
 	    (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+	return (0);
 }

 static void
@ -340,17 +361,17 @@ dump_intent_log(zilog_t *zilog)
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int i;

-	if (zh->zh_log.blk_birth == 0 || verbose < 2)
+	if (zh->zh_log.blk_birth == 0 || verbose < 1)
 		return;

-	(void) printf("\n    ZIL header: claim_txg %llu, claim_seq %llu",
-	    (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_claim_seq);
+	(void) printf("\n    ZIL header: claim_txg %llu, "
+	    "claim_blk_seq %llu, claim_lr_seq %llu",
+	    (u_longlong_t)zh->zh_claim_txg,
+	    (u_longlong_t)zh->zh_claim_blk_seq,
+	    (u_longlong_t)zh->zh_claim_lr_seq);
 	(void) printf(" replay_seq %llu, flags 0x%llx\n",
 	    (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);

-	if (verbose >= 4)
-		print_log_bp(&zh->zh_log, "\n\tfirst block: ");
-
 	for (i = 0; i < TX_MAX_TYPE; i++)
 		zil_rec_info[i].zri_count = 0;

--- a/cmd/zfs/zfs_iter.c
+++ b/cmd/zfs/zfs_iter.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <libintl.h>
@ -107,7 +106,8 @@ zfs_callback(zfs_handle_t *zhp, void *data)
 					zfs_prune_proplist(zhp,
 					    cb->cb_props_table);

-				if (zfs_expand_proplist(zhp, cb->cb_proplist)
+				if (zfs_expand_proplist(zhp, cb->cb_proplist,
+				    (cb->cb_flags & ZFS_ITER_RECVD_PROPS))
 				    != 0) {
 					free(node);
 					return (-1);
@ -350,11 +350,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
 	avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
 	    offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);

-	if (avl_pool == NULL) {
-		(void) fprintf(stderr,
-		    gettext("internal error: out of memory\n"));
-		exit(1);
-	}
+	if (avl_pool == NULL)
+		nomem();

 	cb.cb_sortcol = sortcol;
 	cb.cb_flags = flags;
@ -399,11 +396,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
 		    sizeof (cb.cb_props_table));
 	}

-	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
-		(void) fprintf(stderr,
-		    gettext("internal error: out of memory\n"));
-		exit(1);
-	}
+	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();

 	if (argc == 0) {
 		/*
@ -453,11 +447,8 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
 	/*
 	 * Finally, clean up the AVL tree.
 	 */
-	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) {
-		(void) fprintf(stderr,
-		    gettext("internal error: out of memory"));
-		exit(1);
-	}
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();

 	while ((node = uu_avl_walk_next(walk)) != NULL) {
 		uu_avl_remove(cb.cb_avl, node);
--- a/cmd/zfs/zfs_iter.h
+++ b/cmd/zfs/zfs_iter.h
@ -42,6 +42,7 @@ typedef struct zfs_sort_column {
 #define	ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
 #define	ZFS_ITER_PROP_LISTSNAPS    (1 << 2)
 #define	ZFS_ITER_DEPTH_LIMIT	   (1 << 3)
+#define	ZFS_ITER_RECVD_PROPS	   (1 << 4)

 int zfs_for_each(int, char **, int options, zfs_type_t,
    zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
--- a/cmd/zfs/zfs_util.h
+++ b/cmd/zfs/zfs_util.h
@ -19,15 +19,12 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_ZFS_UTIL_H
 #define	_ZFS_UTIL_H

-
-
 #include <libzfs.h>

 #ifdef	__cplusplus
@ -35,6 +32,7 @@ extern "C" {
 #endif

 void * safe_malloc(size_t size);
+void nomem(void);
 libzfs_handle_t *g_zfs;

 #ifdef	__cplusplus
--- a/cmd/zinject/translate.c
+++ b/cmd/zinject/translate.c
@ -19,14 +19,11 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <libzfs.h>

-#undef verify	/* both libzfs.h and zfs_context.h want to define this */
-
 #include <sys/zfs_context.h>

 #include <errno.h>
@ -69,6 +66,18 @@ ziprintf(const char *fmt, ...)
 	va_end(ap);
 }

+static void
+compress_slashes(const char *src, char *dest)
+{
+	while (*src != '\0') {
+		*dest = *src++;
+		while (*dest == '/' && *src == '/')
+			++src;
+		++dest;
+	}
+	*dest = '\0';
+}
+
 /*
 * Given a full path to a file, translate into a dataset name and a relative
 * path within the dataset.  'dataset' must be at least MAXNAMELEN characters,
@ -76,13 +85,16 @@ ziprintf(const char *fmt, ...)
 * buffer, which we need later to get the object ID.
 */
 static int
-parse_pathname(const char *fullpath, char *dataset, char *relpath,
+parse_pathname(const char *inpath, char *dataset, char *relpath,
    struct stat64 *statbuf)
 {
 	struct extmnttab mp;
 	FILE *fp;
 	int match;
 	const char *rel;
+	char fullpath[MAXPATHLEN];
+
+	compress_slashes(inpath, fullpath);

 	if (fullpath[0] != '/') {
 		(void) fprintf(stderr, "invalid object '%s': must be full "
@ -162,8 +174,8 @@ object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
 	 */
 	sync();

-	if ((err = dmu_objset_open(dataset, DMU_OST_ZFS,
-	    DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) {
+	err = dmu_objset_own(dataset, DMU_OST_ZFS, B_TRUE, FTAG, &os);
+	if (err != 0) {
 		(void) fprintf(stderr, "cannot open dataset '%s': %s\n",
 		    dataset, strerror(err));
 		return (-1);
@ -172,7 +184,7 @@ object_from_path(const char *dataset, const char *path, struct stat64 *statbuf,
 	record->zi_objset = dmu_objset_id(os);
 	record->zi_object = statbuf->st_ino;

-	dmu_objset_close(os);
+	dmu_objset_disown(os, FTAG);

 	return (0);
 }
@ -249,17 +261,17 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
 	 * Get the dnode associated with object, so we can calculate the block
 	 * size.
 	 */
-	if ((err = dmu_objset_open(dataset, DMU_OST_ANY,
-	    DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) {
+	if ((err = dmu_objset_own(dataset, DMU_OST_ANY,
+	    B_TRUE, FTAG, &os)) != 0) {
 		(void) fprintf(stderr, "cannot open dataset '%s': %s\n",
 		    dataset, strerror(err));
 		goto out;
 	}

 	if (record->zi_object == 0) {
-		dn = os->os->os_meta_dnode;
+		dn = os->os_meta_dnode;
 	} else {
-		err = dnode_hold(os->os, record->zi_object, FTAG, &dn);
+		err = dnode_hold(os, record->zi_object, FTAG, &dn);
 		if (err != 0) {
 			(void) fprintf(stderr, "failed to hold dnode "
 			    "for object %llu\n",
@ -308,11 +320,11 @@ calculate_range(const char *dataset, err_type_t type, int level, char *range,
 	ret = 0;
 out:
 	if (dn) {
-		if (dn != os->os->os_meta_dnode)
+		if (dn != os->os_meta_dnode)
 			dnode_rele(dn, FTAG);
 	}
 	if (os)
-		dmu_objset_close(os);
+		dmu_objset_disown(os, FTAG);

 	return (ret);
 }
@ -351,8 +363,8 @@ translate_record(err_type_t type, const char *object, const char *range,
 		case TYPE_CONFIG:
 			record->zi_type = DMU_OT_PACKED_NVLIST;
 			break;
-		case TYPE_BPLIST:
-			record->zi_type = DMU_OT_BPLIST;
+		case TYPE_BPOBJ:
+			record->zi_type = DMU_OT_BPOBJ;
 			break;
 		case TYPE_SPACEMAP:
 			record->zi_type = DMU_OT_SPACE_MAP;
@ -475,6 +487,14 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
 		record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
 		record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
 		break;
+	case TYPE_LABEL_PAD1:
+		record->zi_start = offsetof(vdev_label_t, vl_pad1);
+		record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+		break;
+	case TYPE_LABEL_PAD2:
+		record->zi_start = offsetof(vdev_label_t, vl_pad2);
+		record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+		break;
 	}
 	return (0);
 }
--- a/cmd/zinject/zinject.c
+++ b/cmd/zinject/zinject.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -42,12 +41,12 @@
 * any attempt to read from the device will return EIO, but any attempt to
 * reopen the device will also return ENXIO.
 * For label faults, the -L option must be specified. This allows faults
- * to be injected into either the nvlist or uberblock region of all the labels
- * for the specified device.
+ * to be injected into either the nvlist, uberblock, pad1, or pad2 region
+ * of all the labels for the specified device.
 *
 * This form of the command looks like:
 *
- * 	zinject -d device [-e errno] [-L <uber | nvlist>] pool
+ * 	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
 *
 *
 * DATA FAULTS
@ -70,7 +69,7 @@
 * 		mos		Any data in the MOS
 * 		mosdir		object directory
 * 		config		pool configuration
- * 		bplist		blkptr list
+ * 		bpobj		blkptr list
 * 		spacemap	spacemap
 * 		metaslab	metaslab
 * 		errlog		persistent error log
@ -164,11 +163,13 @@ static const char *errtable[TYPE_INVAL] = {
 	"mosdir",
 	"metaslab",
 	"config",
-	"bplist",
+	"bpobj",
 	"spacemap",
 	"errlog",
 	"uber",
-	"nvlist"
+	"nvlist",
+	"pad1",
+	"pad2"
 };

 static err_type_t
@ -192,8 +193,8 @@ type_to_name(uint64_t type)
 		return ("metaslab");
 	case DMU_OT_PACKED_NVLIST:
 		return ("config");
-	case DMU_OT_BPLIST:
-		return ("bplist");
+	case DMU_OT_BPOBJ:
+		return ("bpobj");
 	case DMU_OT_SPACE_MAP:
 		return ("spacemap");
 	case DMU_OT_ERROR_LOG:
@ -222,11 +223,28 @@ usage(void)
 	    "\t\tClear the particular record (if given a numeric ID), or\n"
 	    "\t\tall records if 'all' is specificed.\n"
 	    "\n"
-	    "\tzinject -d device [-e errno] [-L <nvlist|uber>] [-F] pool\n"
+	    "\tzinject -p <function name> pool\n"
+	    "\t\tInject a panic fault at the specified function. Only \n"
+	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
+	    "\t\tspa_vdev_exit() will trigger a panic.\n"
+	    "\n"
+	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
+	    "\t    [-T <read|write|free|claim|all> pool\n"
 	    "\t\tInject a fault into a particular device or the device's\n"
-	    "\t\tlabel.  Label injection can either be 'nvlist' or 'uber'.\n"
+	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n "
+	    "\t\t'pad1', or 'pad2'.\n"
 	    "\t\t'errno' can either be 'nxio' (the default) or 'io'.\n"
 	    "\n"
+	    "\tzinject -d device -A <degrade|fault> pool\n"
+	    "\t\tPerform a specific action on a particular device\n"
+	    "\n"
+	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+	    "\t\tCause the pool to stop writing blocks yet not\n"
+	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
+	    "\t\tthat fails to honor cache flush requests.\n"
+	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
+	    "\t\tat the end of the duration.\n"
+	    "\n"
 	    "\tzinject -b objset:object:level:blkid pool\n"
 	    "\n"
 	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
@ -267,7 +285,7 @@ usage(void)
 	    "\t\t\ton a ZFS filesystem.\n"
 	    "\n"
 	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
-	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bplist,\n"
+	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
 	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
 	    "\t\t\tthe poolname.\n");
 }
@ -286,6 +304,12 @@ iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
 		    &zc.zc_inject_record, data)) != 0)
 			return (ret);

+	if (errno != ENOENT) {
+		(void) fprintf(stderr, "Unable to list handlers: %s\n",
+		    strerror(errno));
+		return (-1);
+	}
+
 	return (0);
 }

@ -295,7 +319,7 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
 {
 	int *count = data;

-	if (record->zi_guid != 0)
+	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
 		return (0);

 	if (*count == 0) {
@ -327,7 +351,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
 {
 	int *count = data;

-	if (record->zi_guid == 0)
+	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
 		return (0);

 	if (*count == 0) {
@ -343,6 +367,27 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
 	return (0);
 }

+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *count = data;
+
+	if (record->zi_func[0] == '\0')
+		return (0);
+
+	if (*count == 0) {
+		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
+		(void) printf("---  ---------------  ----------------\n");
+	}
+
+	*count += 1;
+
+	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
+
+	return (0);
+}
+
 /*
 * Print all registered error handlers.  Returns the number of handlers
 * registered.
@ -356,6 +401,9 @@ print_all_handlers(void)
 	(void) printf("\n");
 	count = 0;
 	(void) iter_handlers(print_data_handler, &count);
+	(void) printf("\n");
+	count = 0;
+	(void) iter_handlers(print_panic_handler, &count);

 	return (count);
 }
@ -386,6 +434,7 @@ cancel_all_handlers(void)
 {
 	int ret = iter_handlers(cancel_one_handler, NULL);

+	if (ret == 0)
 		(void) printf("removed all registered handlers\n");

 	return (ret);
@ -443,6 +492,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
 		if (record->zi_guid) {
 			(void) printf("  vdev: %llx\n",
 			    (u_longlong_t)record->zi_guid);
+		} else if (record->zi_func[0] != '\0') {
+			(void) printf("  panic function: %s\n",
+			    record->zi_func);
+		} else if (record->zi_duration > 0) {
+			(void) printf(" time: %lld seconds\n",
+			    (u_longlong_t)record->zi_duration);
+		} else if (record->zi_duration < 0) {
+			(void) printf(" txgs: %lld \n",
+			    (u_longlong_t)-record->zi_duration);
 		} else {
 			(void) printf("objset: %llu\n",
 			    (u_longlong_t)record->zi_objset);
@ -464,6 +522,22 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
 	return (0);
 }

+int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+	zfs_cmd_t zc;
+
+	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+	zc.zc_guid = record->zi_guid;
+	zc.zc_cookie = cmd;
+
+	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+		return (0);
+
+	return (1);
+}
+
 int
 main(int argc, char **argv)
 {
@ -477,12 +551,17 @@ main(int argc, char **argv)
 	int quiet = 0;
 	int error = 0;
 	int domount = 0;
+	int io_type = ZIO_TYPES;
+	int action = VDEV_STATE_UNKNOWN;
 	err_type_t type = TYPE_INVAL;
 	err_type_t label = TYPE_INVAL;
 	zinject_record_t record = { 0 };
 	char pool[MAXNAMELEN];
 	char dataset[MAXNAMELEN];
 	zfs_handle_t *zhp;
+	int nowrites = 0;
+	int dur_txg = 0;
+	int dur_secs = 0;
 	int ret;
 	int flags = 0;

@ -514,11 +593,24 @@ main(int argc, char **argv)
 		return (0);
 	}

-	while ((c = getopt(argc, argv, ":ab:d:f:Fqhc:t:l:mr:e:uL:")) != -1) {
+	while ((c = getopt(argc, argv,
+	    ":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
 		switch (c) {
 		case 'a':
 			flags |= ZINJECT_FLUSH_ARC;
 			break;
+		case 'A':
+			if (strcasecmp(optarg, "degrade") == 0) {
+				action = VDEV_STATE_DEGRADED;
+			} else if (strcasecmp(optarg, "fault") == 0) {
+				action = VDEV_STATE_FAULTED;
+			} else {
+				(void) fprintf(stderr, "invalid action '%s': "
+				    "must be 'degrade' or 'fault'\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 'b':
 			raw = optarg;
 			break;
@ -554,9 +646,27 @@ main(int argc, char **argv)
 		case 'F':
 			record.zi_failfast = B_TRUE;
 			break;
+		case 'g':
+			dur_txg = 1;
+			record.zi_duration = (int)strtol(optarg, &end, 10);
+			if (record.zi_duration <= 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid duration '%s': "
+				    "must be a positive integer\n", optarg);
+				usage();
+				return (1);
+			}
+			/* store duration of txgs as its negative */
+			record.zi_duration *= -1;
+			break;
 		case 'h':
 			usage();
 			return (0);
+		case 'I':
+			/* default duration, if one hasn't yet been defined */
+			nowrites = 1;
+			if (dur_secs == 0 && dur_txg == 0)
+				record.zi_duration = 30;
+			break;
 		case 'l':
 			level = (int)strtol(optarg, &end, 10);
 			if (*end != '\0') {
@ -569,12 +679,45 @@ main(int argc, char **argv)
 		case 'm':
 			domount = 1;
 			break;
+		case 'p':
+			(void) strlcpy(record.zi_func, optarg,
+			    sizeof (record.zi_func));
+			break;
 		case 'q':
 			quiet = 1;
 			break;
 		case 'r':
 			range = optarg;
 			break;
+		case 's':
+			dur_secs = 1;
+			record.zi_duration = (int)strtol(optarg, &end, 10);
+			if (record.zi_duration <= 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid duration '%s': "
+				    "must be a positive integer\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
+		case 'T':
+			if (strcasecmp(optarg, "read") == 0) {
+				io_type = ZIO_TYPE_READ;
+			} else if (strcasecmp(optarg, "write") == 0) {
+				io_type = ZIO_TYPE_WRITE;
+			} else if (strcasecmp(optarg, "free") == 0) {
+				io_type = ZIO_TYPE_FREE;
+			} else if (strcasecmp(optarg, "claim") == 0) {
+				io_type = ZIO_TYPE_CLAIM;
+			} else if (strcasecmp(optarg, "all") == 0) {
+				io_type = ZIO_TYPES;
+			} else {
+				(void) fprintf(stderr, "invalid I/O type "
+				    "'%s': must be 'read', 'write', 'free', "
+				    "'claim' or 'all'\n", optarg);
+				usage();
+				return (1);
+			}
+			break;
 		case 't':
 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
 			    !MOS_TYPE(type)) {
@ -617,7 +760,8 @@ main(int argc, char **argv)
 		 * '-c' is invalid with any other options.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0) {
+		    level != 0 || record.zi_func[0] != '\0' ||
+		    record.zi_duration != 0) {
 			(void) fprintf(stderr, "cancel (-c) incompatible with "
 			    "any other options\n");
 			usage();
@ -649,7 +793,8 @@ main(int argc, char **argv)
 		 * for doing injection, so handle it separately here.
 		 */
 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
-		    level != 0) {
+		    level != 0 || record.zi_func[0] != '\0' ||
+		    record.zi_duration != 0) {
 			(void) fprintf(stderr, "device (-d) incompatible with "
 			    "data error injection\n");
 			usage();
@ -672,12 +817,18 @@ main(int argc, char **argv)
 			return (1);
 		}

+		record.zi_iotype = io_type;
 		if (translate_device(pool, device, label, &record) != 0)
 			return (1);
 		if (!error)
 			error = ENXIO;
+
+		if (action != VDEV_STATE_UNKNOWN)
+			return (perform_action(pool, &record, action));
+
 	} else if (raw != NULL) {
-		if (range != NULL || type != TYPE_INVAL || level != 0) {
+		if (range != NULL || type != TYPE_INVAL || level != 0 ||
+		    record.zi_func[0] != '\0' || record.zi_duration != 0) {
 			(void) fprintf(stderr, "raw (-b) format with "
 			    "any other options\n");
 			usage();
@ -704,10 +855,52 @@ main(int argc, char **argv)
 			return (1);
 		if (!error)
 			error = EIO;
+	} else if (record.zi_func[0] != '\0') {
+		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+		    level != 0 || device != NULL || record.zi_duration != 0) {
+			(void) fprintf(stderr, "panic (-p) incompatible with "
+			    "other options\n");
+			usage();
+			return (2);
+		}
+
+		if (argc < 1 || argc > 2) {
+			(void) fprintf(stderr, "panic (-p) injection requires "
+			    "a single pool name and an optional id\n");
+			usage();
+			return (2);
+		}
+
+		(void) strcpy(pool, argv[0]);
+		if (argv[1] != NULL)
+			record.zi_type = atoi(argv[1]);
+		dataset[0] = '\0';
+	} else if (record.zi_duration != 0) {
+		if (nowrites == 0) {
+			(void) fprintf(stderr, "-s or -g meaningless "
+			    "without -I (ignore writes)\n");
+			usage();
+			return (2);
+		} else if (dur_secs && dur_txg) {
+			(void) fprintf(stderr, "choose a duration either "
+			    "in seconds (-s) or a number of txgs (-g) "
+			    "but not both\n");
+			usage();
+			return (2);
+		} else if (argc != 1) {
+			(void) fprintf(stderr, "ignore writes (-I) "
+			    "injection requires a single pool name\n");
+			usage();
+			return (2);
+		}
+
+		(void) strcpy(pool, argv[0]);
+		dataset[0] = '\0';
 	} else if (type == TYPE_INVAL) {
 		if (flags == 0) {
 			(void) fprintf(stderr, "at least one of '-b', '-d', "
-			    "'-t', '-a', or '-u' must be specified\n");
+			    "'-t', '-a', '-p', '-I' or '-u' "
+			    "must be specified\n");
 			usage();
 			return (2);
 		}
--- a/cmd/zinject/zinject.h
+++ b/cmd/zinject/zinject.h
@ -19,15 +19,12 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_ZINJECT_H
 #define	_ZINJECT_H

-
-
 #include <sys/zfs_ioctl.h>

 #ifdef	__cplusplus
@ -41,11 +38,13 @@ typedef enum {
 	TYPE_MOSDIR,		/* MOS object directory		*/
 	TYPE_METASLAB,		/* metaslab objects		*/
 	TYPE_CONFIG,		/* MOS config			*/
-	TYPE_BPLIST,		/* block pointer list		*/
+	TYPE_BPOBJ,		/* block pointer list		*/
 	TYPE_SPACEMAP,		/* space map objects		*/
 	TYPE_ERRLOG,		/* persistent error log		*/
 	TYPE_LABEL_UBERBLOCK,	/* label specific uberblock	*/
 	TYPE_LABEL_NVLIST,	/* label specific nvlist	*/
+	TYPE_LABEL_PAD1,	/* label specific 8K pad1 area	*/
+	TYPE_LABEL_PAD2,	/* label specific 8K pad2 area	*/
 	TYPE_INVAL
 } err_type_t;

--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
--- a/cmd/zpool/zpool_util.c
+++ b/cmd/zpool/zpool_util.c
@ -19,12 +19,10 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
 #include <errno.h>
 #include <libgen.h>
 #include <libintl.h>
@ -50,22 +48,6 @@ safe_malloc(size_t size)
 	return (data);
 }

-/*
- * Same as above, but for strdup()
- */
-char *
-safe_strdup(const char *str)
-{
-	char *ret;
-
-	if ((ret = strdup(str)) == NULL) {
-		(void) fprintf(stderr, "internal error: out of memory\n");
-		exit(1);
-	}
-
-	return (ret);
-}
-
 /*
 * Display an out of memory error message and abort the current program.
 */
--- a/cmd/zpool/zpool_util.h
+++ b/cmd/zpool/zpool_util.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	ZPOOL_UTIL_H
@ -37,7 +36,6 @@ extern "C" {
 * Basic utility functions
 */
 void *safe_malloc(size_t);
-char *safe_strdup(const char *);
 void zpool_no_memory(void);
 uint_t num_logs(nvlist_t *nv);

@ -46,7 +44,9 @@ uint_t num_logs(nvlist_t *nv);
 */

 nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
-    boolean_t isreplace, boolean_t dryrun, int argc, char **argv);
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv);
+nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
+    nvlist_t *props, splitflags_t flags, int argc, char **argv);

 /*
 * Pool list functions
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@ -20,8 +20,7 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -1004,8 +1003,8 @@ is_spare(nvlist_t *config, const char *path)
 		return (B_FALSE);
 	}
 	free(name);
-
 	(void) close(fd);
+
 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
 	nvlist_free(label);

@ -1029,8 +1028,8 @@ is_spare(nvlist_t *config, const char *path)
 * the majority of this task.
 */
 static int
-check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
-    int isspare)
+check_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+    boolean_t replacing, boolean_t isspare)
 {
 	nvlist_t **child;
 	uint_t c, children;
@ -1051,13 +1050,14 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
 		 * hot spare within the same pool.  If so, we allow it
 		 * regardless of what libdiskmgt or zpool_in_use() says.
 		 */
-		if (isreplacing) {
+		if (replacing) {
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 			    &wholedisk) == 0 && wholedisk)
 				(void) snprintf(buf, sizeof (buf), "%ss0",
 				    path);
 			else
 				(void) strlcpy(buf, path, sizeof (buf));
+
 			if (is_spare(config, buf))
 				return (0);
 		}
@ -1073,21 +1073,21 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,

 	for (c = 0; c < children; c++)
 		if ((ret = check_in_use(config, child[c], force,
-		    isreplacing, B_FALSE)) != 0)
+		    replacing, B_FALSE)) != 0)
 			return (ret);

 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
 			if ((ret = check_in_use(config, child[c], force,
-			    isreplacing, B_TRUE)) != 0)
+			    replacing, B_TRUE)) != 0)
 				return (ret);

 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
 			if ((ret = check_in_use(config, child[c], force,
-			    isreplacing, B_FALSE)) != 0)
+			    replacing, B_FALSE)) != 0)
 				return (ret);

 	return (0);
@ -1360,6 +1360,52 @@ construct_spec(int argc, char **argv)
 	return (nvroot);
 }

+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+    splitflags_t flags, int argc, char **argv)
+{
+	nvlist_t *newroot = NULL, **child;
+	uint_t c, children;
+
+	if (argc > 0) {
+		if ((newroot = construct_spec(argc, argv)) == NULL) {
+			(void) fprintf(stderr, gettext("Unable to build a "
+			    "pool from the specified devices\n"));
+			return (NULL);
+		}
+
+		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
+			nvlist_free(newroot);
+			return (NULL);
+		}
+
+		/* avoid any tricks in the spec */
+		verify(nvlist_lookup_nvlist_array(newroot,
+		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+		for (c = 0; c < children; c++) {
+			char *path;
+			const char *type;
+			int min, max;
+
+			verify(nvlist_lookup_string(child[c],
+			    ZPOOL_CONFIG_PATH, &path) == 0);
+			if ((type = is_grouping(path, &min, &max)) != NULL) {
+				(void) fprintf(stderr, gettext("Cannot use "
+				    "'%s' as a device for splitting\n"), type);
+				nvlist_free(newroot);
+				return (NULL);
+			}
+		}
+	}
+
+	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
+		if (newroot != NULL)
+			nvlist_free(newroot);
+		return (NULL);
+	}
+
+	return (newroot);
+}

 /*
 * Get and validate the contents of the given vdev specification.  This ensures
@ -1373,7 +1419,7 @@ construct_spec(int argc, char **argv)
 */
 nvlist_t *
 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
-    boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
 {
 	nvlist_t *newroot;
 	nvlist_t *poolconfig = NULL;
@ -1396,8 +1442,7 @@ make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
 	 * uses (such as a dedicated dump device) that even '-f' cannot
 	 * override.
 	 */
-	if (check_in_use(poolconfig, newroot, force, isreplacing,
-	    B_FALSE) != 0) {
+	if (check_in_use(poolconfig, newroot, force, replacing, B_FALSE) != 0) {
 		nvlist_free(newroot);
 		return (NULL);
 	}
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
--- a/lib/libnvpair/include/libnvpair.h
+++ b/lib/libnvpair/include/libnvpair.h
@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef	_LIBNVPAIR_H
 #define	_LIBNVPAIR_H

-
-
 #include <sys/nvpair.h>
 #include <stdlib.h>
 #include <stdio.h>
@ -40,6 +38,7 @@ extern "C" {
 void nvlist_print(FILE *, nvlist_t *);
 int nvpair_value_match(nvpair_t *, int, char *, char **);
 int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **);
+void dump_nvlist(nvlist_t *, int);

 #ifdef	__cplusplus
 }
--- a/lib/libnvpair/libnvpair.c
+++ b/lib/libnvpair/libnvpair.c
@ -19,14 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
 #include <unistd.h>
 #include <strings.h>
+#include <libintl.h>
 #include <sys/types.h>
 #include <sys/inttypes.h>
 #include "libnvpair.h"
@ -272,6 +271,156 @@ nvlist_print(FILE *fp, nvlist_t *nvl)
 	nvlist_print_with_indent(fp, nvl, 0);
 }

+
+#define	NVP(elem, type, vtype, ptype, format) { \
+	vtype	value; \
+\
+	(void) nvpair_value_##type(elem, &value); \
+	(void) printf("%*s%s: " format "\n", indent, "", \
+	    nvpair_name(elem), (ptype)value); \
+}
+
+#define	NVPA(elem, type, vtype, ptype, format) { \
+	uint_t	i, count; \
+	vtype	*value;  \
+\
+	(void) nvpair_value_##type(elem, &value, &count); \
+	for (i = 0; i < count; i++) { \
+		(void) printf("%*s%s[%d]: " format "\n", indent, "", \
+		    nvpair_name(elem), i, (ptype)value[i]); \
+	} \
+}
+
+/*
+ * Similar to nvlist_print() but handles arrays slightly differently.
+ */
+void
+dump_nvlist(nvlist_t *list, int indent)
+{
+	nvpair_t	*elem = NULL;
+	boolean_t	bool_value;
+	nvlist_t	*nvlist_value;
+	nvlist_t	**nvlist_array_value;
+	uint_t		i, count;
+
+	if (list == NULL) {
+		return;
+	}
+
+	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
+		switch (nvpair_type(elem)) {
+		case DATA_TYPE_BOOLEAN_VALUE:
+			(void) nvpair_value_boolean_value(elem, &bool_value);
+			(void) printf("%*s%s: %s\n", indent, "",
+			    nvpair_name(elem), bool_value ? "true" : "false");
+			break;
+
+		case DATA_TYPE_BYTE:
+			NVP(elem, byte, uchar_t, int, "%u");
+			break;
+
+		case DATA_TYPE_INT8:
+			NVP(elem, int8, int8_t, int, "%d");
+			break;
+
+		case DATA_TYPE_UINT8:
+			NVP(elem, uint8, uint8_t, int, "%u");
+			break;
+
+		case DATA_TYPE_INT16:
+			NVP(elem, int16, int16_t, int, "%d");
+			break;
+
+		case DATA_TYPE_UINT16:
+			NVP(elem, uint16, uint16_t, int, "%u");
+			break;
+
+		case DATA_TYPE_INT32:
+			NVP(elem, int32, int32_t, long, "%ld");
+			break;
+
+		case DATA_TYPE_UINT32:
+			NVP(elem, uint32, uint32_t, ulong_t, "%lu");
+			break;
+
+		case DATA_TYPE_INT64:
+			NVP(elem, int64, int64_t, longlong_t, "%lld");
+			break;
+
+		case DATA_TYPE_UINT64:
+			NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
+			break;
+
+		case DATA_TYPE_STRING:
+			NVP(elem, string, char *, char *, "'%s'");
+			break;
+
+		case DATA_TYPE_BYTE_ARRAY:
+			NVPA(elem, byte_array, uchar_t, int, "%u");
+			break;
+
+		case DATA_TYPE_INT8_ARRAY:
+			NVPA(elem, int8_array, int8_t, int, "%d");
+			break;
+
+		case DATA_TYPE_UINT8_ARRAY:
+			NVPA(elem, uint8_array, uint8_t, int, "%u");
+			break;
+
+		case DATA_TYPE_INT16_ARRAY:
+			NVPA(elem, int16_array, int16_t, int, "%d");
+			break;
+
+		case DATA_TYPE_UINT16_ARRAY:
+			NVPA(elem, uint16_array, uint16_t, int, "%u");
+			break;
+
+		case DATA_TYPE_INT32_ARRAY:
+			NVPA(elem, int32_array, int32_t, long, "%ld");
+			break;
+
+		case DATA_TYPE_UINT32_ARRAY:
+			NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu");
+			break;
+
+		case DATA_TYPE_INT64_ARRAY:
+			NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
+			break;
+
+		case DATA_TYPE_UINT64_ARRAY:
+			NVPA(elem, uint64_array, uint64_t, u_longlong_t,
+			    "%llu");
+			break;
+
+		case DATA_TYPE_STRING_ARRAY:
+			NVPA(elem, string_array, char *, char *, "'%s'");
+			break;
+
+		case DATA_TYPE_NVLIST:
+			(void) nvpair_value_nvlist(elem, &nvlist_value);
+			(void) printf("%*s%s:\n", indent, "",
+			    nvpair_name(elem));
+			dump_nvlist(nvlist_value, indent + 4);
+			break;
+
+		case DATA_TYPE_NVLIST_ARRAY:
+			(void) nvpair_value_nvlist_array(elem,
+			    &nvlist_array_value, &count);
+			for (i = 0; i < count; i++) {
+				(void) printf("%*s%s[%u]:\n", indent, "",
+				    nvpair_name(elem), i);
+				dump_nvlist(nvlist_array_value[i], indent + 4);
+			}
+			break;
+
+		default:
+			(void) printf(dgettext(TEXT_DOMAIN, "bad config type "
+			    "%d for %s\n"), nvpair_type(elem),
+			    nvpair_name(elem));
+		}
+	}
+}
+
 /*
 * Determine if string 'value' matches 'nvp' value.  The 'value' string is
 * converted, depending on the type of 'nvp', prior to match.  For numeric
--- a/lib/libzfs/include/libzfs.h
+++ b/lib/libzfs/include/libzfs.h
@ -20,8 +20,7 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_LIBZFS_H
@ -66,7 +65,6 @@ enum {
 	EZFS_BADSTREAM,		/* bad backup stream */
 	EZFS_DSREADONLY,	/* dataset is readonly */
 	EZFS_VOLTOOBIG,		/* volume is too large for 32-bit system */
-	EZFS_VOLHASDATA,	/* volume already contains data */
 	EZFS_INVALIDNAME,	/* invalid dataset name */
 	EZFS_BADRESTORE,	/* unable to restore to destination */
 	EZFS_BADBACKUP,		/* backup failed */
@ -85,17 +83,15 @@ enum {
 	EZFS_UMOUNTFAILED,	/* failed to unmount dataset */
 	EZFS_UNSHARENFSFAILED,	/* unshare(1M) failed */
 	EZFS_SHARENFSFAILED,	/* share(1M) failed */
-	EZFS_DEVLINKS,		/* failed to create zvol links */
 	EZFS_PERM,		/* permission denied */
 	EZFS_NOSPC,		/* out of space */
+	EZFS_FAULT,		/* bad address */
 	EZFS_IO,		/* I/O error */
 	EZFS_INTR,		/* signal received */
 	EZFS_ISSPARE,		/* device is a hot spare */
 	EZFS_INVALCONFIG,	/* invalid vdev configuration */
 	EZFS_RECURSIVE,		/* recursive dependency */
 	EZFS_NOHISTORY,		/* no history object */
-	EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */
-	EZFS_SHAREISCSIFAILED,	/* iscsitgtd failed request to share */
 	EZFS_POOLPROPS,		/* couldn't retrieve pool props */
 	EZFS_POOL_NOTSUP,	/* ops not supported for this type of pool */
 	EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
@ -103,7 +99,6 @@ enum {
 	EZFS_OPENFAILED,	/* open of device failed */
 	EZFS_NOCAP,		/* couldn't get capacity */
 	EZFS_LABELFAILED,	/* write of label failed */
-	EZFS_ISCSISVCUNAVAIL,	/* iscsi service unavailable */
 	EZFS_BADWHO,		/* invalid permission who */
 	EZFS_BADPERM,		/* invalid permission */
 	EZFS_BADPERMSET,	/* invalid permission set name */
@ -119,6 +114,12 @@ enum {
 	EZFS_UNPLAYED_LOGS,	/* log device has unplayed logs */
 	EZFS_REFTAG_RELE,	/* snapshot release: tag not found */
 	EZFS_REFTAG_HOLD,	/* snapshot hold: tag already exists */
+	EZFS_TAGTOOLONG,	/* snapshot hold/rele: tag too long */
+	EZFS_PIPEFAILED,	/* pipe create failed */
+	EZFS_THREADCREATEFAILED, /* thread create failed */
+	EZFS_POSTSPLIT_ONLINE,	/* onlining a disk after splitting it */
+	EZFS_SCRUBBING,		/* currently scrubbing */
+	EZFS_NO_SCRUB,		/* no active scrub */
 	EZFS_UNKNOWN
 };

@ -213,11 +214,19 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
 extern int zpool_destroy(zpool_handle_t *);
 extern int zpool_add(zpool_handle_t *, nvlist_t *);

+typedef struct splitflags {
+	/* do not split, but return the config that would be split off */
+	int dryrun : 1;
+
+	/* after splitting, import the pool */
+	int import : 1;
+} splitflags_t;
+
 /*
 * Functions to manipulate pool and vdev state
 */
-extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
-extern int zpool_clear(zpool_handle_t *, const char *);
+extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
+extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);

 extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
    vdev_state_t *);
@ -226,9 +235,11 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *,
    const char *, nvlist_t *, int);
 extern int zpool_vdev_detach(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove(zpool_handle_t *, const char *);
+extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *,
+    splitflags_t);

-extern int zpool_vdev_fault(zpool_handle_t *, uint64_t);
-extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t);
+extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t);
+extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t);
 extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);

 extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
@ -298,6 +309,7 @@ typedef enum {

 extern zpool_status_t zpool_get_status(zpool_handle_t *, char **);
 extern zpool_status_t zpool_import_status(nvlist_t *, char **);
+extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh);

 /*
 * Statistics and configuration functions.
@ -319,23 +331,38 @@ extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
 /*
 * Search for pools to import
 */
+
+typedef struct importargs {
+	char **path;		/* a list of paths to search		*/
+	int paths;		/* number of paths to search		*/
+	char *poolname;		/* name of a pool to find		*/
+	uint64_t guid;		/* guid of a pool to find		*/
+	char *cachefile;	/* cachefile to use for import		*/
+	int can_be_active : 1;	/* can the pool be active?		*/
+	int unique : 1;		/* does 'poolname' already exist?	*/
+	int exists : 1;		/* set on return if pool already exists	*/
+} importargs_t;
+
+extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *);
+
+/* legacy pool search routines */
 extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **);
 extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *,
    char *, uint64_t);
-extern nvlist_t *zpool_find_import_byname(libzfs_handle_t *, int, char **,
-    char *);
-extern nvlist_t *zpool_find_import_byguid(libzfs_handle_t *, int, char **,
-    uint64_t);
-extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **);

 /*
 * Miscellaneous pool functions
 */
 struct zfs_cmd;

-extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *);
+extern const char *zfs_history_event_names[LOG_END];
+
+extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
+    boolean_t verbose);
 extern int zpool_upgrade(zpool_handle_t *, uint64_t);
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
+extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
+    nvlist_t ***, uint_t *);
 extern void zpool_set_history_str(const char *subcommand, int argc,
    char **argv, char *history_str);
 extern int zpool_stage_history(libzfs_handle_t *, const char *);
@ -343,6 +370,8 @@ extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
    size_t len);
 extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
 extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
+extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
+    nvlist_t *);

 /*
 * Basic handle manipulations.  These functions do not create or destroy the
@ -374,6 +403,8 @@ extern const char *zfs_prop_to_name(zfs_prop_t);
 extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
 extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
    zprop_source_t *, char *, size_t, boolean_t);
+extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
+    boolean_t);
 extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *,
    zprop_source_t *, char *, size_t);
 extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
@ -381,10 +412,11 @@ extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
 extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
    char *propbuf, int proplen, boolean_t literal);
 extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
-extern int zfs_prop_inherit(zfs_handle_t *, const char *);
+extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
 extern const char *zfs_prop_values(zfs_prop_t);
 extern int zfs_prop_is_string(zfs_prop_t prop);
 extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
+extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);

 typedef struct zprop_list {
 	int		pl_prop;
@ -392,10 +424,11 @@ typedef struct zprop_list {
 	struct zprop_list *pl_next;
 	boolean_t	pl_all;
 	size_t		pl_width;
+	size_t		pl_recvd_width;
 	boolean_t	pl_fixed;
 } zprop_list_t;

-extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **);
+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t);
 extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);

 #define	ZFS_MOUNTPOINT_NONE	"none"
@ -419,13 +452,24 @@ extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **,
    zfs_type_t);
 extern void zprop_free_list(zprop_list_t *);

+#define	ZFS_GET_NCOLS	5
+
+typedef enum {
+	GET_COL_NONE,
+	GET_COL_NAME,
+	GET_COL_PROPERTY,
+	GET_COL_VALUE,
+	GET_COL_RECVD,
+	GET_COL_SOURCE
+} zfs_get_column_t;
+
 /*
 * Functions for printing zfs or zpool properties
 */
 typedef struct zprop_get_cbdata {
 	int cb_sources;
-	int cb_columns[4];
-	int cb_colwidths[5];
+	zfs_get_column_t cb_columns[ZFS_GET_NCOLS];
+	int cb_colwidths[ZFS_GET_NCOLS + 1];
 	boolean_t cb_scripted;
 	boolean_t cb_literal;
 	boolean_t cb_first;
@ -434,12 +478,8 @@ typedef struct zprop_get_cbdata {
 } zprop_get_cbdata_t;

 void zprop_print_one_property(const char *, zprop_get_cbdata_t *,
-    const char *, const char *, zprop_source_t, const char *);
-
-#define	GET_COL_NAME		1
-#define	GET_COL_PROPERTY	2
-#define	GET_COL_VALUE		3
-#define	GET_COL_SOURCE		4
+    const char *, const char *, zprop_source_t, const char *,
+    const char *);

 /*
 * Iterator functions.
@ -450,6 +490,7 @@ extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);

 /*
 * Functions to create and destroy datasets.
@ -463,11 +504,42 @@ extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
 extern int zfs_rename(zfs_handle_t *, const char *, boolean_t);
-extern int zfs_send(zfs_handle_t *, const char *, const char *,
-    boolean_t, boolean_t, boolean_t, boolean_t, int);
+
+typedef struct sendflags {
+	/* print informational messages (ie, -v was specified) */
+	int verbose : 1;
+
+	/* recursive send  (ie, -R) */
+	int replicate : 1;
+
+	/* for incrementals, do all intermediate snapshots */
+	int doall : 1; /* (ie, -I) */
+
+	/* if dataset is a clone, do incremental from its origin */
+	int fromorigin : 1;
+
+	/* do deduplication */
+	int dedup : 1;
+
+	/* send properties (ie, -p) */
+	int props : 1;
+} sendflags_t;
+
+typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
+
+extern int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
+    sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
+    void *cb_arg, nvlist_t **debugnvp);
+
 extern int zfs_promote(zfs_handle_t *);
-extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
+    boolean_t, boolean_t);
+extern int zfs_hold_range(zfs_handle_t *, const char *, const char *,
+    const char *, boolean_t, boolean_t, snapfilter_cb_t, void *);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
+extern int zfs_release_range(zfs_handle_t *, const char *, const char *,
+    const char *, boolean_t);
+extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);

 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
    uid_t rid, uint64_t space);
@ -482,6 +554,12 @@ typedef struct recvflags {
 	/* the destination is a prefix, not the exact fs (ie, -d) */
 	int isprefix : 1;

+	/*
+	 * Only the tail of the sent snapshot path is appended to the
+	 * destination to determine the received snapshot name (ie, -e).
+	 */
+	int istail : 1;
+
 	/* do not actually do the recv, just check if it would work (ie, -n) */
 	int dryrun : 1;

@ -542,10 +620,6 @@ extern int zfs_unshareall_nfs(zfs_handle_t *);
 extern int zfs_unshareall_smb(zfs_handle_t *);
 extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
 extern int zfs_unshareall(zfs_handle_t *);
-extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
-extern int zfs_share_iscsi(zfs_handle_t *);
-extern int zfs_unshare_iscsi(zfs_handle_t *);
-extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *);
 extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
    void *, void *, int, zfs_share_op_t);

@ -571,15 +645,10 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **,
    boolean_t *);

 /*
- * ftyp special.  Read the label from a given device.
+ * Label manipulation.
 */
 extern int zpool_read_label(int, nvlist_t **);
-
-/*
- * Create and remove zvol /dev links.
- */
-extern int zpool_create_zvol_links(zpool_handle_t *);
-extern int zpool_remove_zvol_links(zpool_handle_t *);
+extern int zpool_clear_label(int);

 /* is this zvol valid for use as a dump device? */
 extern int zvol_check_dump_config(char *);
@ -600,6 +669,17 @@ int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *);
 extern int zpool_enable_datasets(zpool_handle_t *, const char *, int);
 extern int zpool_disable_datasets(zpool_handle_t *, boolean_t);

+/*
+ * Mappings between vdev and FRU.
+ */
+extern void libzfs_fru_refresh(libzfs_handle_t *);
+extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *);
+extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *);
+extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *,
+    const char *);
+extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
+extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/lib/libzfs/include/libzfs_impl.h
+++ b/lib/libzfs/include/libzfs_impl.h
@ -20,7 +20,7 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -30,7 +30,6 @@
 #include <sys/dmu.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/zfs_acl.h>
 #include <sys/spa.h>
 #include <sys/nvpair.h>

@ -38,6 +37,8 @@
 #include <libzfs.h>
 #include <libshare.h>

+#include <fm/libtopo.h>
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@ -47,6 +48,13 @@ extern "C" {
 #endif
 #define	VERIFY	verify

+typedef struct libzfs_fru {
+	char *zf_device;
+	char *zf_fru;
+	struct libzfs_fru *zf_chain;
+	struct libzfs_fru *zf_next;
+} libzfs_fru_t;
+
 struct libzfs_handle {
 	int libzfs_error;
 	int libzfs_fd;
@ -65,7 +73,13 @@ struct libzfs_handle {
 	uint_t libzfs_shareflags;
 	boolean_t libzfs_mnttab_enable;
 	avl_tree_t libzfs_mnttab_cache;
+	int libzfs_pool_iter;
+	topo_hdl_t *libzfs_topo_hdl;
+	libzfs_fru_t **libzfs_fru_hash;
+	libzfs_fru_t *libzfs_fru_list;
+	char libzfs_chassis_id[256];
 };
+
 #define	ZFSSHARE_MISS	0x01	/* Didn't find entry in cache */

 struct zfs_handle {
@ -77,6 +91,7 @@ struct zfs_handle {
 	dmu_objset_stats_t zfs_dmustats;
 	nvlist_t *zfs_props;
 	nvlist_t *zfs_user_props;
+	nvlist_t *zfs_recvd_props;
 	boolean_t zfs_mntcheck;
 	char *zfs_mntopts;
 	uint8_t *zfs_props_table;
@ -112,7 +127,6 @@ typedef  enum {
 */
 typedef enum {
 	SHARED_NOT_SHARED = 0x0,
-	SHARED_ISCSI = 0x1,
 	SHARED_NFS = 0x2,
 	SHARED_SMB = 0x4
 } zfs_share_type_t;
@ -172,9 +186,6 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *);

 int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);

-int zvol_create_link(libzfs_handle_t *, const char *);
-int zvol_remove_link(libzfs_handle_t *, const char *);
-int zpool_iter_zvol(zpool_handle_t *, int (*)(const char *, void *), void *);
 boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);

 void namespace_clear(libzfs_handle_t *);
@ -189,6 +200,9 @@ extern int zfs_parse_options(char *, zfs_share_proto_t);

 extern int zfs_unshare_proto(zfs_handle_t *,
    const char *, zfs_share_proto_t *);
+
+extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/lib/libzfs/libzfs_changelist.c
+++ b/lib/libzfs/libzfs_changelist.c
@ -20,7 +20,7 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Portions Copyright 2007 Ramprakash Jelari
@ -116,34 +116,7 @@ changelist_prefix(prop_changelist_t *clp)
 		if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
 			continue;

-		if (ZFS_IS_VOLUME(cn->cn_handle)) {
-			switch (clp->cl_realprop) {
-			case ZFS_PROP_NAME:
-				/*
-				 * If this was a rename, unshare the zvol, and
-				 * remove the /dev/zvol links.
-				 */
-				(void) zfs_unshare_iscsi(cn->cn_handle);
-
-				if (zvol_remove_link(cn->cn_handle->zfs_hdl,
-				    cn->cn_handle->zfs_name) != 0) {
-					ret = -1;
-					cn->cn_needpost = B_FALSE;
-					(void) zfs_share_iscsi(cn->cn_handle);
-				}
-				break;
-
-			case ZFS_PROP_VOLSIZE:
-				/*
-				 * If this was a change to the volume size, we
-				 * need to unshare and reshare the volume.
-				 */
-				(void) zfs_unshare_iscsi(cn->cn_handle);
-				break;
-			default:
-				break;
-			}
-		} else {
+		if (!ZFS_IS_VOLUME(cn->cn_handle)) {
 			/*
 			 * Do the property specific processing.
 			 */
@ -238,32 +211,8 @@ changelist_postfix(prop_changelist_t *clp)

 		zfs_refresh_properties(cn->cn_handle);

-		if (ZFS_IS_VOLUME(cn->cn_handle)) {
-			/*
-			 * If we're doing a rename, recreate the /dev/zvol
-			 * links.
-			 */
-			if (clp->cl_realprop == ZFS_PROP_NAME &&
-			    zvol_create_link(cn->cn_handle->zfs_hdl,
-			    cn->cn_handle->zfs_name) != 0) {
-				errors++;
-			} else if (cn->cn_shared ||
-			    clp->cl_prop == ZFS_PROP_SHAREISCSI) {
-				if (zfs_prop_get(cn->cn_handle,
-				    ZFS_PROP_SHAREISCSI, shareopts,
-				    sizeof (shareopts), NULL, NULL, 0,
-				    B_FALSE) == 0 &&
-				    strcmp(shareopts, "off") == 0) {
-					errors +=
-					    zfs_unshare_iscsi(cn->cn_handle);
-				} else {
-					errors +=
-					    zfs_share_iscsi(cn->cn_handle);
-				}
-			}
-
+		if (ZFS_IS_VOLUME(cn->cn_handle))
 			continue;
-		}

 		/*
 		 * Remount if previously mounted or mountpoint was legacy,
@ -662,8 +611,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags,

 	if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
 	    clp->cl_prop != ZFS_PROP_SHARENFS &&
-	    clp->cl_prop != ZFS_PROP_SHARESMB &&
-	    clp->cl_prop != ZFS_PROP_SHAREISCSI)
+	    clp->cl_prop != ZFS_PROP_SHARESMB)
 		return (clp);

 	/*
--- a/lib/libzfs/libzfs_config.c
+++ b/lib/libzfs/libzfs_config.c
@ -19,12 +19,10 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
 /*
 * The pool configuration repository is stored in /etc/zfs/zpool.cache as a
 * single packed nvlist.  While it would be nice to just read in this
@ -313,21 +311,33 @@ zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data)
 	zpool_handle_t *zhp;
 	int ret;

-	if (namespace_reload(hdl) != 0)
+	/*
+	 * If someone makes a recursive call to zpool_iter(), we want to avoid
+	 * refreshing the namespace because that will invalidate the parent
+	 * context.  We allow recursive calls, but simply re-use the same
+	 * namespace AVL tree.
+	 */
+	if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0)
 		return (-1);

+	hdl->libzfs_pool_iter++;
 	for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
 	    cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {

-		if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0)
+		if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) {
+			hdl->libzfs_pool_iter--;
 			return (-1);
+		}

 		if (zhp == NULL)
 			continue;

-		if ((ret = func(zhp, data)) != 0)
+		if ((ret = func(zhp, data)) != 0) {
+			hdl->libzfs_pool_iter--;
 			return (ret);
 		}
+	}
+	hdl->libzfs_pool_iter--;

 	return (0);
 }
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
--- a/lib/libzfs/libzfs_fru.c
+++ b/lib/libzfs/libzfs_fru.c
@ -0,0 +1,452 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <dlfcn.h>
+#include <errno.h>
+#include <libintl.h>
+#include <link.h>
+#include <pthread.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <libzfs.h>
+
+#include <fm/libtopo.h>
+#include <sys/fm/protocol.h>
+#include <sys/systeminfo.h>
+
+#include "libzfs_impl.h"
+
+/*
+ * This file is responsible for determining the relationship between I/O
+ * devices paths and physical locations.  In the world of MPxIO and external
+ * enclosures, the device path is not synonymous with the physical location.
+ * If you remove a drive and insert it into a different slot, it will end up
+ * with the same path under MPxIO.  If you recable storage enclosures, the
+ * device paths may change.  All of this makes it difficult to implement the
+ * 'autoreplace' property, which is supposed to automatically manage disk
+ * replacement based on physical slot.
+ *
+ * In order to work around these limitations, we have a per-vdev FRU property
+ * that is the libtopo path (minus disk-specific authority information) to the
+ * physical location of the device on the system.  This is an optional
+ * property, and is only needed when using the 'autoreplace' property or when
+ * generating FMA faults against vdevs.
+ */
+
+/*
+ * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case
+ * it is not present.  We only need this once per library instance, so it is
+ * not part of the libzfs handle.
+ */
+static void *_topo_dlhandle;
+static topo_hdl_t *(*_topo_open)(int, const char *, int *);
+static void (*_topo_close)(topo_hdl_t *);
+static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *);
+static void (*_topo_snap_release)(topo_hdl_t *);
+static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *,
+    topo_walk_cb_t, void *, int *);
+static int (*_topo_walk_step)(topo_walk_t *, int);
+static void (*_topo_walk_fini)(topo_walk_t *);
+static void (*_topo_hdl_strfree)(topo_hdl_t *, char *);
+static char *(*_topo_node_name)(tnode_t *);
+static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *,
+    char **, int *);
+static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *);
+static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *);
+static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *,
+    const char *);
+
+#define	ZFS_FRU_HASH_SIZE	257
+
+static size_t
+fru_strhash(const char *key)
+{
+	ulong_t g, h = 0;
+	const char *p;
+
+	for (p = key; *p != '\0'; p++) {
+		h = (h << 4) + *p;
+
+		if ((g = (h & 0xf0000000)) != 0) {
+			h ^= (g >> 24);
+			h ^= g;
+		}
+	}
+
+	return (h % ZFS_FRU_HASH_SIZE);
+}
+
+static int
+libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg)
+{
+	libzfs_handle_t *hdl = arg;
+	nvlist_t *fru;
+	char *devpath, *frustr;
+	int err;
+	libzfs_fru_t *frup;
+	size_t idx;
+
+	/*
+	 * If this is the chassis node, and we don't yet have the system
+	 * chassis ID, then fill in this value now.
+	 */
+	if (hdl->libzfs_chassis_id[0] == '\0' &&
+	    strcmp(_topo_node_name(tn), "chassis") == 0) {
+		if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY,
+		    FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0)
+			(void) strlcpy(hdl->libzfs_chassis_id, devpath,
+			    sizeof (hdl->libzfs_chassis_id));
+	}
+
+	/*
+	 * Skip non-disk nodes.
+	 */
+	if (strcmp(_topo_node_name(tn), "disk") != 0)
+		return (TOPO_WALK_NEXT);
+
+	/*
+	 * Get the devfs path and FRU.
+	 */
+	if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0)
+		return (TOPO_WALK_NEXT);
+
+	if (libzfs_fru_lookup(hdl, devpath) != NULL) {
+		_topo_hdl_strfree(thp, devpath);
+		return (TOPO_WALK_NEXT);
+	}
+
+	if (_topo_node_fru(tn, &fru, NULL, &err) != 0) {
+		_topo_hdl_strfree(thp, devpath);
+		return (TOPO_WALK_NEXT);
+	}
+
+	/*
+	 * Convert the FRU into a string.
+	 */
+	if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) {
+		nvlist_free(fru);
+		_topo_hdl_strfree(thp, devpath);
+		return (TOPO_WALK_NEXT);
+	}
+
+	nvlist_free(fru);
+
+	/*
+	 * Finally, we have a FRU string and device path.  Add it to the hash.
+	 */
+	if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) {
+		_topo_hdl_strfree(thp, devpath);
+		_topo_hdl_strfree(thp, frustr);
+		return (TOPO_WALK_NEXT);
+	}
+
+	if ((frup->zf_device = strdup(devpath)) == NULL ||
+	    (frup->zf_fru = strdup(frustr)) == NULL) {
+		free(frup->zf_device);
+		free(frup);
+		_topo_hdl_strfree(thp, devpath);
+		_topo_hdl_strfree(thp, frustr);
+		return (TOPO_WALK_NEXT);
+	}
+
+	_topo_hdl_strfree(thp, devpath);
+	_topo_hdl_strfree(thp, frustr);
+
+	idx = fru_strhash(frup->zf_device);
+	frup->zf_chain = hdl->libzfs_fru_hash[idx];
+	hdl->libzfs_fru_hash[idx] = frup;
+	frup->zf_next = hdl->libzfs_fru_list;
+	hdl->libzfs_fru_list = frup;
+
+	return (TOPO_WALK_NEXT);
+}
+
+/*
+ * Called during initialization to setup the dynamic libtopo connection.
+ */
+#pragma init(libzfs_init_fru)
+static void
+libzfs_init_fru(void)
+{
+	char path[MAXPATHLEN];
+	char isa[257];
+
+#if defined(_LP64)
+	if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0)
+		isa[0] = '\0';
+#else
+	isa[0] = '\0';
+#endif
+	(void) snprintf(path, sizeof (path),
+	    "/usr/lib/fm/%s/libtopo.so", isa);
+
+	if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL)
+		return;
+
+	_topo_open = (topo_hdl_t *(*)())
+	    dlsym(_topo_dlhandle, "topo_open");
+	_topo_close = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_close");
+	_topo_snap_hold = (char *(*)())
+	    dlsym(_topo_dlhandle, "topo_snap_hold");
+	_topo_snap_release = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_snap_release");
+	_topo_walk_init = (topo_walk_t *(*)())
+	    dlsym(_topo_dlhandle, "topo_walk_init");
+	_topo_walk_step = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_walk_step");
+	_topo_walk_fini = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_walk_fini");
+	_topo_hdl_strfree = (void (*)())
+	    dlsym(_topo_dlhandle, "topo_hdl_strfree");
+	_topo_node_name = (char *(*)())
+	    dlsym(_topo_dlhandle, "topo_node_name");
+	_topo_prop_get_string = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_prop_get_string");
+	_topo_node_fru = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_node_fru");
+	_topo_fmri_nvl2str = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_fmri_nvl2str");
+	_topo_fmri_strcmp_noauth = (int (*)())
+	    dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth");
+
+	if (_topo_open == NULL || _topo_close == NULL ||
+	    _topo_snap_hold == NULL || _topo_snap_release == NULL ||
+	    _topo_walk_init == NULL || _topo_walk_step == NULL ||
+	    _topo_walk_fini == NULL || _topo_hdl_strfree == NULL ||
+	    _topo_node_name == NULL || _topo_prop_get_string == NULL ||
+	    _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL ||
+	    _topo_fmri_strcmp_noauth == NULL) {
+		(void) dlclose(_topo_dlhandle);
+		_topo_dlhandle = NULL;
+	}
+}
+
+/*
+ * Refresh the mappings from device path -> FMRI.  We do this by walking the
+ * hc topology looking for disk nodes, and recording the io/devfs-path and FRU.
+ * Note that we strip out the disk-specific authority information (serial,
+ * part, revision, etc) so that we are left with only the identifying
+ * characteristics of the slot (hc path and chassis-id).
+ */
+void
+libzfs_fru_refresh(libzfs_handle_t *hdl)
+{
+	int err;
+	char *uuid;
+	topo_hdl_t *thp;
+	topo_walk_t *twp;
+
+	if (_topo_dlhandle == NULL)
+		return;
+
+	/*
+	 * Clear the FRU hash and initialize our basic structures.
+	 */
+	libzfs_fru_clear(hdl, B_FALSE);
+
+	if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION,
+	    NULL, &err)) == NULL)
+		return;
+
+	thp = hdl->libzfs_topo_hdl;
+
+	if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL)
+		return;
+
+	_topo_hdl_strfree(thp, uuid);
+
+	if (hdl->libzfs_fru_hash == NULL &&
+	    (hdl->libzfs_fru_hash =
+	    calloc(ZFS_FRU_HASH_SIZE * sizeof (void *), 1)) == NULL)
+		return;
+
+	/*
+	 * We now have a topo snapshot, so iterate over the hc topology looking
+	 * for disks to add to the hash.
+	 */
+	twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC,
+	    libzfs_fru_gather, hdl, &err);
+	if (twp != NULL) {
+		(void) _topo_walk_step(twp, TOPO_WALK_CHILD);
+		_topo_walk_fini(twp);
+	}
+}
+
+/*
+ * Given a devfs path, return the FRU for the device, if known.  This will
+ * automatically call libzfs_fru_refresh() if it hasn't already been called by
+ * the consumer.  The string returned is valid until the next call to
+ * libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath)
+{
+	size_t idx = fru_strhash(devpath);
+	libzfs_fru_t *frup;
+
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_fru_hash == NULL)
+		return (NULL);
+
+	for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+	    frup = frup->zf_chain) {
+		if (strcmp(devpath, frup->zf_device) == 0)
+			return (frup->zf_fru);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Given a fru path, return the device path.  This will automatically call
+ * libzfs_fru_refresh() if it hasn't already been called by the consumer.  The
+ * string returned is valid until the next call to libzfs_fru_refresh().
+ */
+const char *
+libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru)
+{
+	libzfs_fru_t *frup;
+	size_t idx;
+
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_fru_hash == NULL)
+		return (NULL);
+
+	for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) {
+		for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL;
+		    frup = frup->zf_next) {
+			if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl,
+			    fru, frup->zf_fru))
+				return (frup->zf_device);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Change the stored FRU for the given vdev.
+ */
+int
+zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru)
+{
+	zfs_cmd_t zc = { 0 };
+
+	(void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	(void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value));
+	zc.zc_guid = vdev_guid;
+
+	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0)
+		return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
+		    dgettext(TEXT_DOMAIN, "cannot set FRU")));
+
+	return (0);
+}
+
+/*
+ * Compare to two FRUs, ignoring any authority information.
+ */
+boolean_t
+libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b)
+{
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_fru_hash == NULL)
+		return (strcmp(a, b) == 0);
+
+	return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b));
+}
+
+/*
+ * This special function checks to see whether the FRU indicates it's supposed
+ * to be in the system chassis, but the chassis-id doesn't match.  This can
+ * happen in a clustered case, where both head nodes have the same logical
+ * disk, but opening the device on the other head node is meaningless.
+ */
+boolean_t
+libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru)
+{
+	const char *chassisid;
+	size_t len;
+
+	if (hdl->libzfs_fru_hash == NULL)
+		libzfs_fru_refresh(hdl);
+
+	if (hdl->libzfs_chassis_id[0] == '\0')
+		return (B_FALSE);
+
+	if (strstr(fru, "/chassis=0/") == NULL)
+		return (B_FALSE);
+
+	if ((chassisid = strstr(fru, ":chassis-id=")) == NULL)
+		return (B_FALSE);
+
+	chassisid += 12;
+	len = strlen(hdl->libzfs_chassis_id);
+	if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 &&
+	    (chassisid[len] == '/' || chassisid[len] == ':'))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Clear memory associated with the FRU hash.
+ */
+void
+libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final)
+{
+	libzfs_fru_t *frup;
+
+	while ((frup = hdl->libzfs_fru_list) != NULL) {
+		hdl->libzfs_fru_list = frup->zf_next;
+		free(frup->zf_device);
+		free(frup->zf_fru);
+		free(frup);
+	}
+
+	hdl->libzfs_fru_list = NULL;
+
+	if (hdl->libzfs_topo_hdl != NULL) {
+		_topo_snap_release(hdl->libzfs_topo_hdl);
+		_topo_close(hdl->libzfs_topo_hdl);
+		hdl->libzfs_topo_hdl = NULL;
+	}
+
+	if (final) {
+		free(hdl->libzfs_fru_hash);
+	} else if (hdl->libzfs_fru_hash != NULL) {
+		bzero(hdl->libzfs_fru_hash,
+		    ZFS_FRU_HASH_SIZE * sizeof (void *));
+	}
+}
--- a/lib/libzfs/libzfs_import.c
+++ b/lib/libzfs/libzfs_import.c
@ -19,12 +19,10 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
 /*
 * Pool import support functions.
 *
@ -41,15 +39,21 @@
 * using our derived config, and record the results.
 */

+#include <ctype.h>
 #include <devid.h>
 #include <dirent.h>
 #include <errno.h>
 #include <libintl.h>
+#include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <sys/vtoc.h>
+#include <sys/dktp/fdisk.h>
+#include <sys/efi_partition.h>
+#include <thread_pool.h>

 #include <sys/vdev_impl.h>

@ -388,8 +392,6 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
 	}

 	if (err) {
-		(void) zpool_standard_error(hdl, errno,
-		    dgettext(TEXT_DOMAIN, "cannot discover pools"));
 		zcmd_free_nvlists(&zc);
 		return (NULL);
 	}
@ -403,6 +405,23 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config)
 	return (nvl);
 }

+/*
+ * Determine if the vdev id is a hole in the namespace.
+ */
+boolean_t
+vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
+{
+	int c;
+
+	for (c = 0; c < holes; c++) {
+
+		/* Top-level is a hole */
+		if (hole_array[c] == id)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
 /*
 * Convert our list of pools into the definitive set of configurations.  We
 * start by picking the best config for each toplevel vdev.  Once that's done,
@ -425,17 +444,20 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 	uint64_t version, guid;
 	uint_t children = 0;
 	nvlist_t **child = NULL;
+	uint_t holes;
+	uint64_t *hole_array, max_id;
 	uint_t c;
 	boolean_t isactive;
 	uint64_t hostid;
 	nvlist_t *nvl;
 	boolean_t found_one = B_FALSE;
+	boolean_t valid_top_config = B_FALSE;

 	if (nvlist_alloc(&ret, 0, 0) != 0)
 		goto nomem;

 	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
-		uint64_t id;
+		uint64_t id, max_txg = 0;

 		if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
 			goto nomem;
@ -463,6 +485,42 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 				}
 			}

+			/*
+			 * We rely on the fact that the max txg for the
+			 * pool will contain the most up-to-date information
+			 * about the valid top-levels in the vdev namespace.
+			 */
+			if (best_txg > max_txg) {
+				(void) nvlist_remove(config,
+				    ZPOOL_CONFIG_VDEV_CHILDREN,
+				    DATA_TYPE_UINT64);
+				(void) nvlist_remove(config,
+				    ZPOOL_CONFIG_HOLE_ARRAY,
+				    DATA_TYPE_UINT64_ARRAY);
+
+				max_txg = best_txg;
+				hole_array = NULL;
+				holes = 0;
+				max_id = 0;
+				valid_top_config = B_FALSE;
+
+				if (nvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
+					verify(nvlist_add_uint64(config,
+					    ZPOOL_CONFIG_VDEV_CHILDREN,
+					    max_id) == 0);
+					valid_top_config = B_TRUE;
+				}
+
+				if (nvlist_lookup_uint64_array(tmp,
+				    ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
+				    &holes) == 0) {
+					verify(nvlist_add_uint64_array(config,
+					    ZPOOL_CONFIG_HOLE_ARRAY,
+					    hole_array, holes) == 0);
+				}
+			}
+
 			if (!config_seen) {
 				/*
 				 * Copy the relevant pieces of data to the pool
@ -522,6 +580,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 			    ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
 			verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
 			    &id) == 0);
+
 			if (id >= children) {
 				nvlist_t **newchild;

@ -542,9 +601,74 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)

 		}

+		/*
+		 * If we have information about all the top-levels then
+		 * clean up the nvlist which we've constructed. This
+		 * means removing any extraneous devices that are
+		 * beyond the valid range or adding devices to the end
+		 * of our array which appear to be missing.
+		 */
+		if (valid_top_config) {
+			if (max_id < children) {
+				for (c = max_id; c < children; c++)
+					nvlist_free(child[c]);
+				children = max_id;
+			} else if (max_id > children) {
+				nvlist_t **newchild;
+
+				newchild = zfs_alloc(hdl, (max_id) *
+				    sizeof (nvlist_t *));
+				if (newchild == NULL)
+					goto nomem;
+
+				for (c = 0; c < children; c++)
+					newchild[c] = child[c];
+
+				free(child);
+				child = newchild;
+				children = max_id;
+			}
+		}
+
 		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 		    &guid) == 0);

+		/*
+		 * The vdev namespace may contain holes as a result of
+		 * device removal. We must add them back into the vdev
+		 * tree before we process any missing devices.
+		 */
+		if (holes > 0) {
+			ASSERT(valid_top_config);
+
+			for (c = 0; c < children; c++) {
+				nvlist_t *holey;
+
+				if (child[c] != NULL ||
+				    !vdev_is_hole(hole_array, holes, c))
+					continue;
+
+				if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
+				    0) != 0)
+					goto nomem;
+
+				/*
+				 * Holes in the namespace are treated as
+				 * "hole" top-level vdevs and have a
+				 * special flag set on them.
+				 */
+				if (nvlist_add_string(holey,
+				    ZPOOL_CONFIG_TYPE,
+				    VDEV_TYPE_HOLE) != 0 ||
+				    nvlist_add_uint64(holey,
+				    ZPOOL_CONFIG_ID, c) != 0 ||
+				    nvlist_add_uint64(holey,
+				    ZPOOL_CONFIG_GUID, 0ULL) != 0)
+					goto nomem;
+				child[c] = holey;
+			}
+		}
+
 		/*
 		 * Look for any missing top-level vdevs.  If this is the case,
 		 * create a faked up 'missing' vdev as a placeholder.  We cannot
@ -552,7 +676,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 		 * certain checks to make sure the vdev IDs match their location
 		 * in the configuration.
 		 */
-		for (c = 0; c < children; c++)
+		for (c = 0; c < children; c++) {
 			if (child[c] == NULL) {
 				nvlist_t *missing;
 				if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
@ -570,6 +694,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 				}
 				child[c] = missing;
 			}
+		}

 		/*
 		 * Put all of this pool's top-level vdevs into a root vdev.
@ -636,8 +761,11 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok)
 			continue;
 		}

-		if ((nvl = refresh_config(hdl, config)) == NULL)
-			goto error;
+		if ((nvl = refresh_config(hdl, config)) == NULL) {
+			nvlist_free(config);
+			config = NULL;
+			continue;
+		}

 		nvlist_free(config);
 		config = nvl;
@ -777,6 +905,212 @@ zpool_read_label(int fd, nvlist_t **config)
 	return (0);
 }

+typedef struct rdsk_node {
+	char *rn_name;
+	int rn_dfd;
+	libzfs_handle_t *rn_hdl;
+	nvlist_t *rn_config;
+	avl_tree_t *rn_avl;
+	avl_node_t rn_node;
+	boolean_t rn_nozpool;
+} rdsk_node_t;
+
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+	const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+	const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+	char *nm1slice, *nm2slice;
+	int rv;
+
+	/*
+	 * slices zero and two are the most likely to provide results,
+	 * so put those first
+	 */
+	nm1slice = strstr(nm1, "s0");
+	nm2slice = strstr(nm2, "s0");
+	if (nm1slice && !nm2slice) {
+		return (-1);
+	}
+	if (!nm1slice && nm2slice) {
+		return (1);
+	}
+	nm1slice = strstr(nm1, "s2");
+	nm2slice = strstr(nm2, "s2");
+	if (nm1slice && !nm2slice) {
+		return (-1);
+	}
+	if (!nm1slice && nm2slice) {
+		return (1);
+	}
+
+	rv = strcmp(nm1, nm2);
+	if (rv == 0)
+		return (0);
+	return (rv > 0 ? 1 : -1);
+}
+
+static void
+check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
+    diskaddr_t size, uint_t blksz)
+{
+	rdsk_node_t tmpnode;
+	rdsk_node_t *node;
+	char sname[MAXNAMELEN];
+
+	tmpnode.rn_name = &sname[0];
+	(void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
+	    diskname, partno);
+	/*
+	 * protect against division by zero for disk labels that
+	 * contain a bogus sector size
+	 */
+	if (blksz == 0)
+		blksz = DEV_BSIZE;
+	/* too small to contain a zpool? */
+	if ((size < (SPA_MINDEVSIZE / blksz)) &&
+	    (node = avl_find(r, &tmpnode, NULL)))
+		node->rn_nozpool = B_TRUE;
+}
+
+static void
+nozpool_all_slices(avl_tree_t *r, const char *sname)
+{
+	char diskname[MAXNAMELEN];
+	char *ptr;
+	int i;
+
+	(void) strncpy(diskname, sname, MAXNAMELEN);
+	if (((ptr = strrchr(diskname, 's')) == NULL) &&
+	    ((ptr = strrchr(diskname, 'p')) == NULL))
+		return;
+	ptr[0] = 's';
+	ptr[1] = '\0';
+	for (i = 0; i < NDKMAP; i++)
+		check_one_slice(r, diskname, i, 0, 1);
+	ptr[0] = 'p';
+	for (i = 0; i <= FD_NUMPART; i++)
+		check_one_slice(r, diskname, i, 0, 1);
+}
+
+static void
+check_slices(avl_tree_t *r, int fd, const char *sname)
+{
+	struct extvtoc vtoc;
+	struct dk_gpt *gpt;
+	char diskname[MAXNAMELEN];
+	char *ptr;
+	int i;
+
+	(void) strncpy(diskname, sname, MAXNAMELEN);
+	if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
+		return;
+	ptr[1] = '\0';
+
+	if (read_extvtoc(fd, &vtoc) >= 0) {
+		for (i = 0; i < NDKMAP; i++)
+			check_one_slice(r, diskname, i,
+			    vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+	} else if (efi_alloc_and_read(fd, &gpt) >= 0) {
+		/*
+		 * on x86 we'll still have leftover links that point
+		 * to slices s[9-15], so use NDKMAP instead
+		 */
+		for (i = 0; i < NDKMAP; i++)
+			check_one_slice(r, diskname, i,
+			    gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+		/* nodes p[1-4] are never used with EFI labels */
+		ptr[0] = 'p';
+		for (i = 1; i <= FD_NUMPART; i++)
+			check_one_slice(r, diskname, i, 0, 1);
+		efi_free(gpt);
+	}
+}
+
+static void
+zpool_open_func(void *arg)
+{
+	rdsk_node_t *rn = arg;
+	struct stat64 statbuf;
+	nvlist_t *config;
+	int fd;
+
+	if (rn->rn_nozpool)
+		return;
+	if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+		/* symlink to a device that's no longer there */
+		if (errno == ENOENT)
+			nozpool_all_slices(rn->rn_avl, rn->rn_name);
+		return;
+	}
+	/*
+	 * Ignore failed stats.  We only want regular
+	 * files, character devs and block devs.
+	 */
+	if (fstat64(fd, &statbuf) != 0 ||
+	    (!S_ISREG(statbuf.st_mode) &&
+	    !S_ISCHR(statbuf.st_mode) &&
+	    !S_ISBLK(statbuf.st_mode))) {
+		(void) close(fd);
+		return;
+	}
+	/* this file is too small to hold a zpool */
+	if (S_ISREG(statbuf.st_mode) &&
+	    statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	} else if (!S_ISREG(statbuf.st_mode)) {
+		/*
+		 * Try to read the disk label first so we don't have to
+		 * open a bunch of minor nodes that can't have a zpool.
+		 */
+		check_slices(rn->rn_avl, fd, rn->rn_name);
+	}
+
+	if ((zpool_read_label(fd, &config)) != 0) {
+		(void) close(fd);
+		(void) no_memory(rn->rn_hdl);
+		return;
+	}
+	(void) close(fd);
+
+
+	rn->rn_config = config;
+	if (config != NULL) {
+		assert(rn->rn_nozpool == B_FALSE);
+	}
+}
+
+/*
+ * Given a file descriptor, clear (zero) the label information.  This function
+ * is currently only used in the appliance stack as part of the ZFS sysevent
+ * module.
+ */
+int
+zpool_clear_label(int fd)
+{
+	struct stat64 statbuf;
+	int l;
+	vdev_label_t *label;
+	uint64_t size;
+
+	if (fstat64(fd, &statbuf) == -1)
+		return (0);
+	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
+
+	if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL)
+		return (-1);
+
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (pwrite64(fd, label, sizeof (vdev_label_t),
+		    label_offset(size, l)) != sizeof (vdev_label_t))
+			return (-1);
+	}
+
+	free(label);
+	return (0);
+}
+
 /*
 * Given a list of directories to search, find all pools stored on disk.  This
 * includes partial pools which are not available to import.  If no args are
@ -785,30 +1119,28 @@ zpool_read_label(int fd, nvlist_t **config)
 * to import a specific pool.
 */
 static nvlist_t *
-zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
-    boolean_t active_ok, char *poolname, uint64_t guid)
+zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 {
-	int i;
+	int i, dirs = iarg->paths;
 	DIR *dirp = NULL;
 	struct dirent64 *dp;
 	char path[MAXPATHLEN];
-	char *end;
+	char *end, **dir = iarg->path;
 	size_t pathleft;
-	struct stat64 statbuf;
-	nvlist_t *ret = NULL, *config;
+	nvlist_t *ret = NULL;
 	static char *default_dir = "/dev/dsk";
-	int fd;
 	pool_list_t pools = { 0 };
 	pool_entry_t *pe, *penext;
 	vdev_entry_t *ve, *venext;
 	config_entry_t *ce, *cenext;
 	name_entry_t *ne, *nenext;
+	avl_tree_t slice_cache;
+	rdsk_node_t *slice;
+	void *cookie;

-	verify(poolname == NULL || guid == 0);
-
-	if (argc == 0) {
-		argc = 1;
-		argv = &default_dir;
+	if (dirs == 0) {
+		dirs = 1;
+		dir = &default_dir;
 	}

 	/*
@ -816,15 +1148,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
 	 * possible device, organizing the information according to pool GUID
 	 * and toplevel GUID.
 	 */
-	for (i = 0; i < argc; i++) {
+	for (i = 0; i < dirs; i++) {
+		tpool_t *t;
 		char *rdsk;
 		int dfd;

 		/* use realpath to normalize the path */
-		if (realpath(argv[i], path) == 0) {
+		if (realpath(dir[i], path) == 0) {
 			(void) zfs_error_fmt(hdl, EZFS_BADPATH,
-			    dgettext(TEXT_DOMAIN, "cannot open '%s'"),
-			    argv[i]);
+			    dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]);
 			goto error;
 		}
 		end = &path[strlen(path)];
@ -851,6 +1183,8 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
 			goto error;
 		}

+		avl_create(&slice_cache, slice_cache_compare,
+		    sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
 		/*
 		 * This is not MT-safe, but we have no MT consumers of libzfs
 		 */
@ -860,46 +1194,53 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
 			    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
 				continue;

-			if ((fd = openat64(dfd, name, O_RDONLY)) < 0)
-				continue;
-
+			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+			slice->rn_name = zfs_strdup(hdl, name);
+			slice->rn_avl = &slice_cache;
+			slice->rn_dfd = dfd;
+			slice->rn_hdl = hdl;
+			slice->rn_nozpool = B_FALSE;
+			avl_add(&slice_cache, slice);
+		}
 		/*
-			 * Ignore failed stats.  We only want regular
-			 * files, character devs and block devs.
+		 * create a thread pool to do all of this in parallel;
+		 * rn_nozpool is not protected, so this is racy in that
+		 * multiple tasks could decide that the same slice can
+		 * not hold a zpool, which is benign.  Also choose
+		 * double the number of processors; we hold a lot of
+		 * locks in the kernel, so going beyond this doesn't
+		 * buy us much.
 		 */
-			if (fstat64(fd, &statbuf) != 0 ||
-			    (!S_ISREG(statbuf.st_mode) &&
-			    !S_ISCHR(statbuf.st_mode) &&
-			    !S_ISBLK(statbuf.st_mode))) {
-				(void) close(fd);
-				continue;
-			}
+		t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
+		    0, NULL);
+		for (slice = avl_first(&slice_cache); slice;
+		    (slice = avl_walk(&slice_cache, slice,
+		    AVL_AFTER)))
+			(void) tpool_dispatch(t, zpool_open_func, slice);
+		tpool_wait(t);
+		tpool_destroy(t);

-			if ((zpool_read_label(fd, &config)) != 0) {
-				(void) close(fd);
-				(void) no_memory(hdl);
-				goto error;
-			}
-
-			(void) close(fd);
-
-			if (config != NULL) {
+		cookie = NULL;
+		while ((slice = avl_destroy_nodes(&slice_cache,
+		    &cookie)) != NULL) {
+			if (slice->rn_config != NULL) {
+				nvlist_t *config = slice->rn_config;
 				boolean_t matched = B_TRUE;

-				if (poolname != NULL) {
+				if (iarg->poolname != NULL) {
 					char *pname;

 					matched = nvlist_lookup_string(config,
 					    ZPOOL_CONFIG_POOL_NAME,
 					    &pname) == 0 &&
-					    strcmp(poolname, pname) == 0;
-				} else if (guid != 0) {
+					    strcmp(iarg->poolname, pname) == 0;
+				} else if (iarg->guid != 0) {
 					uint64_t this_guid;

 					matched = nvlist_lookup_uint64(config,
 					    ZPOOL_CONFIG_POOL_GUID,
 					    &this_guid) == 0 &&
-					    guid == this_guid;
+					    iarg->guid == this_guid;
 				}
 				if (!matched) {
 					nvlist_free(config);
@ -907,17 +1248,20 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
 					continue;
 				}
 				/* use the non-raw path for the config */
-				(void) strlcpy(end, name, pathleft);
+				(void) strlcpy(end, slice->rn_name, pathleft);
 				if (add_config(hdl, &pools, path, config) != 0)
 					goto error;
 			}
+			free(slice->rn_name);
+			free(slice);
 		}
+		avl_destroy(&slice_cache);

 		(void) closedir(dirp);
 		dirp = NULL;
 	}

-	ret = get_configs(hdl, &pools, active_ok);
+	ret = get_configs(hdl, &pools, iarg->can_be_active);

 error:
 	for (pe = pools.pools; pe != NULL; pe = penext) {
@ -951,27 +1295,12 @@ error:
 nvlist_t *
 zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv)
 {
-	return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, 0));
-}
+	importargs_t iarg = { 0 };

-nvlist_t *
-zpool_find_import_byname(libzfs_handle_t *hdl, int argc, char **argv,
-    char *pool)
-{
-	return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, pool, 0));
-}
+	iarg.paths = argc;
+	iarg.path = argv;

-nvlist_t *
-zpool_find_import_byguid(libzfs_handle_t *hdl, int argc, char **argv,
-    uint64_t guid)
-{
-	return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, guid));
-}
-
-nvlist_t *
-zpool_find_import_activeok(libzfs_handle_t *hdl, int argc, char **argv)
-{
-	return (zpool_find_import_impl(hdl, argc, argv, B_TRUE, NULL, 0));
+	return (zpool_find_import_impl(hdl, &iarg));
 }

 /*
@ -1093,6 +1422,46 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile,
 	return (pools);
 }

+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+	importargs_t *import = data;
+	int found = 0;
+
+	if (import->poolname != NULL) {
+		char *pool_name;
+
+		verify(nvlist_lookup_string(zhp->zpool_config,
+		    ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0);
+		if (strcmp(pool_name, import->poolname) == 0)
+			found = 1;
+	} else {
+		uint64_t pool_guid;
+
+		verify(nvlist_lookup_uint64(zhp->zpool_config,
+		    ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0);
+		if (pool_guid == import->guid)
+			found = 1;
+	}
+
+	zpool_close(zhp);
+	return (found);
+}
+
+nvlist_t *
+zpool_search_import(libzfs_handle_t *hdl, importargs_t *import)
+{
+	verify(import->poolname == NULL || import->guid == 0);
+
+	if (import->unique)
+		import->exists = zpool_iter(hdl, name_or_guid_exists, import);
+
+	if (import->cachefile != NULL)
+		return (zpool_find_import_cached(hdl, import->cachefile,
+		    import->poolname, import->guid));
+
+	return (zpool_find_import_impl(hdl, import));
+}

 boolean_t
 find_guid(nvlist_t *nv, uint64_t guid)
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@ -20,8 +20,7 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -44,17 +43,14 @@
 *
 * 	zfs_is_shared_nfs()
 * 	zfs_is_shared_smb()
- * 	zfs_is_shared_iscsi()
 * 	zfs_share_proto()
 * 	zfs_shareall();
- * 	zfs_share_iscsi()
 * 	zfs_unshare_nfs()
 * 	zfs_unshare_smb()
 * 	zfs_unshareall_nfs()
 *	zfs_unshareall_smb()
 *	zfs_unshareall()
 *	zfs_unshareall_bypath()
- * 	zfs_unshare_iscsi()
 *
 * The following functions are available for pool consumers, and will
 * mount/unmount and share/unshare all datasets within pool:
@ -89,11 +85,6 @@ static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
    zfs_share_proto_t);

-static int (*iscsitgt_zfs_share)(const char *);
-static int (*iscsitgt_zfs_unshare)(const char *);
-static int (*iscsitgt_zfs_is_shared)(const char *);
-static int (*iscsitgt_svc_online)(void);
-
 /*
 * The share protocols table must be in the same order as the zfs_share_prot_t
 * enum in libzfs_impl.h
@ -125,35 +116,6 @@ zfs_share_proto_t share_all_proto[] = {
 	PROTO_END
 };

-#ifdef __GNUC__
-static void
-zfs_iscsi_init(void) __attribute__((constructor));
-#else
-#pragma init(zfs_iscsi_init)
-#endif
-
-static void
-zfs_iscsi_init(void)
-{
-	void *libiscsitgt;
-
-	if ((libiscsitgt = dlopen("/lib/libiscsitgt.so.1",
-	    RTLD_LAZY | RTLD_GLOBAL)) == NULL ||
-	    (iscsitgt_zfs_share = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_zfs_share")) == NULL ||
-	    (iscsitgt_zfs_unshare = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_zfs_unshare")) == NULL ||
-	    (iscsitgt_zfs_is_shared = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_zfs_is_shared")) == NULL ||
-	    (iscsitgt_svc_online = (int (*)(void))dlsym(libiscsitgt,
-	    "iscsitgt_svc_online")) == NULL) {
-		iscsitgt_zfs_share = NULL;
-		iscsitgt_zfs_unshare = NULL;
-		iscsitgt_zfs_is_shared = NULL;
-		iscsitgt_svc_online = NULL;
-	}
-}
-
 /*
 * Search the sharetab for the given mountpoint and protocol, returning
 * a zfs_share_type_t value.
@ -351,6 +313,18 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
 		} else if (errno == EPERM) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "Insufficient privileges"));
+		} else if (errno == ENOTSUP) {
+			char buf[256];
+			int spa_version;
+
+			VERIFY(zfs_spa_version(zhp, &spa_version) == 0);
+			(void) snprintf(buf, sizeof (buf),
+			    dgettext(TEXT_DOMAIN, "Can't mount a version %lld "
+			    "file system on a version %d pool. Pool must be"
+			    " upgraded to mount this file system."),
+			    (u_longlong_t)zfs_prop_get_int(zhp,
+			    ZFS_PROP_VERSION), spa_version);
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf));
 		} else {
 			zfs_error_aux(hdl, strerror(errno));
 		}
@ -451,7 +425,7 @@ zfs_is_shared(zfs_handle_t *zhp)
 	zfs_share_proto_t *curr_proto;

 	if (ZFS_IS_VOLUME(zhp))
-		return (zfs_is_shared_iscsi(zhp));
+		return (B_FALSE);

 	for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
 	    curr_proto++)
@ -464,7 +438,7 @@ int
 zfs_share(zfs_handle_t *zhp)
 {
 	if (ZFS_IS_VOLUME(zhp))
-		return (zfs_share_iscsi(zhp));
+		return (0);

 	return (zfs_share_proto(zhp, share_all_proto));
 }
@ -473,7 +447,7 @@ int
 zfs_unshare(zfs_handle_t *zhp)
 {
 	if (ZFS_IS_VOLUME(zhp))
-		return (zfs_unshare_iscsi(zhp));
+		return (0);

 	return (zfs_unshareall(zhp));
 }
@ -1009,81 +983,6 @@ remove_mountpoint(zfs_handle_t *zhp)
 	}
 }

-boolean_t
-zfs_is_shared_iscsi(zfs_handle_t *zhp)
-{
-
-	/*
-	 * If iscsi deamon isn't running then we aren't shared
-	 */
-	if (iscsitgt_svc_online && iscsitgt_svc_online() == 1)
-		return (B_FALSE);
-	else
-		return (iscsitgt_zfs_is_shared != NULL &&
-		    iscsitgt_zfs_is_shared(zhp->zfs_name) != 0);
-}
-
-int
-zfs_share_iscsi(zfs_handle_t *zhp)
-{
-	char shareopts[ZFS_MAXPROPLEN];
-	const char *dataset = zhp->zfs_name;
-	libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-	/*
-	 * Return success if there are no share options.
-	 */
-	if (zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
-	    sizeof (shareopts), NULL, NULL, 0, B_FALSE) != 0 ||
-	    strcmp(shareopts, "off") == 0)
-		return (0);
-
-	if (iscsitgt_zfs_share == NULL || iscsitgt_zfs_share(dataset) != 0) {
-		int error = EZFS_SHAREISCSIFAILED;
-
-		/*
-		 * If service isn't availabele and EPERM was
-		 * returned then use special error.
-		 */
-		if (iscsitgt_svc_online && errno == EPERM &&
-		    (iscsitgt_svc_online() != 0))
-			error = EZFS_ISCSISVCUNAVAIL;
-
-		return (zfs_error_fmt(hdl, error,
-		    dgettext(TEXT_DOMAIN, "cannot share '%s'"), dataset));
-	}
-
-	return (0);
-}
-
-int
-zfs_unshare_iscsi(zfs_handle_t *zhp)
-{
-	const char *dataset = zfs_get_name(zhp);
-	libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-	/*
-	 * Return if the volume is not shared
-	 */
-	if (zfs_is_shared_iscsi(zhp) != SHARED_ISCSI)
-		return (0);
-
-	/*
-	 * If this fails with ENODEV it indicates that zvol wasn't shared so
-	 * we should return success in that case.
-	 */
-	if (iscsitgt_zfs_unshare == NULL ||
-	    (iscsitgt_zfs_unshare(dataset) != 0 && errno != ENODEV)) {
-		if (errno == EPERM)
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "Insufficient privileges to unshare iscsi"));
-		return (zfs_error_fmt(hdl, EZFS_UNSHAREISCSIFAILED,
-		    dgettext(TEXT_DOMAIN, "cannot unshare '%s'"), dataset));
-	}
-
-	return (0);
-}
-
 typedef struct mount_cbdata {
 	zfs_handle_t	**cb_datasets;
 	int 		cb_used;
@ -1225,28 +1124,6 @@ out:
 	return (ret);
 }

-
-static int
-zvol_cb(const char *dataset, void *data)
-{
-	libzfs_handle_t *hdl = data;
-	zfs_handle_t *zhp;
-
-	/*
-	 * Ignore snapshots and ignore failures from non-existant datasets.
-	 */
-	if (strchr(dataset, '@') != NULL ||
-	    (zhp = zfs_open(hdl, dataset, ZFS_TYPE_VOLUME)) == NULL)
-		return (0);
-
-	if (zfs_unshare_iscsi(zhp) != 0)
-		return (-1);
-
-	zfs_close(zhp);
-
-	return (0);
-}
-
 static int
 mountpoint_compare(const void *a, const void *b)
 {
@ -1256,6 +1133,8 @@ mountpoint_compare(const void *a, const void *b)
 	return (strcmp(mountb, mounta));
 }

+/* alias for 2002/240 */
+#pragma weak zpool_unmount_datasets = zpool_disable_datasets
 /*
 * Unshare and unmount all datasets within the given pool.  We don't want to
 * rely on traversing the DSL to discover the filesystems within the pool,
@ -1263,7 +1142,6 @@ mountpoint_compare(const void *a, const void *b)
 * arbitrarily (on I/O error, for example).  Instead, we walk /etc/mnttab and
 * gather all the filesystems that are currently mounted.
 */
-#pragma weak zpool_unmount_datasets = zpool_disable_datasets
 int
 zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 {
@ -1277,12 +1155,6 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 	int ret = -1;
 	int flags = (force ? MS_FORCE : 0);

-	/*
-	 * First unshare all zvols.
-	 */
-	if (zpool_iter_zvol(zhp, zvol_cb, hdl) != 0)
-		return (-1);
-
 	namelen = strlen(zhp->zpool_name);

 	rewind(hdl->libzfs_mnttab);
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -138,7 +137,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
 			if (find_vdev_problem(child[c], func))
 				return (B_TRUE);
 	} else {
-		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
+		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t **)&vs, &c) == 0);

 		if (func(vs->vs_state, vs->vs_aux,
@ -173,7 +172,8 @@ check_status(nvlist_t *config, boolean_t isimport)
 {
 	nvlist_t *nvroot;
 	vdev_stat_t *vs;
-	uint_t vsc;
+	pool_scan_stat_t *ps = NULL;
+	uint_t vsc, psc;
 	uint64_t nerr;
 	uint64_t version;
 	uint64_t stateval;
@ -184,15 +184,24 @@ check_status(nvlist_t *config, boolean_t isimport)
 	    &version) == 0);
 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
-	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &vsc) == 0);
 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &stateval) == 0);
-	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
+
+	/*
+	 * Currently resilvering a vdev
+	 */
+	(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+	    (uint64_t **)&ps, &psc);
+	if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+	    ps->pss_state == DSS_SCANNING)
+		return (ZPOOL_STATUS_RESILVERING);

 	/*
 	 * Pool last accessed by another system.
 	 */
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
 	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
 	    stateval == POOL_STATE_ACTIVE)
 		return (ZPOOL_STATUS_HOSTID_MISMATCH);
@ -288,12 +297,6 @@ check_status(nvlist_t *config, boolean_t isimport)
 	if (find_vdev_problem(nvroot, vdev_removed))
 		return (ZPOOL_STATUS_REMOVED_DEV);

-	/*
-	 * Currently resilvering
-	 */
-	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
-		return (ZPOOL_STATUS_RESILVERING);
-
 	/*
 	 * Outdated, but usable, version
 	 */
@ -328,3 +331,68 @@ zpool_import_status(nvlist_t *config, char **msgid)

 	return (ret);
 }
+
+static void
+dump_ddt_stat(const ddt_stat_t *dds, int h)
+{
+	char refcnt[6];
+	char blocks[6], lsize[6], psize[6], dsize[6];
+	char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6];
+
+	if (dds == NULL || dds->dds_blocks == 0)
+		return;
+
+	if (h == -1)
+		(void) strcpy(refcnt, "Total");
+	else
+		zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt));
+
+	zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks));
+	zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize));
+	zfs_nicenum(dds->dds_psize, psize, sizeof (psize));
+	zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize));
+	zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks));
+	zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize));
+	zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize));
+	zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize));
+
+	(void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+	    refcnt,
+	    blocks, lsize, psize, dsize,
+	    ref_blocks, ref_lsize, ref_psize, ref_dsize);
+}
+
+/*
+ * Print the DDT histogram and the column totals.
+ */
+void
+zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh)
+{
+	int h;
+
+	(void) printf("\n");
+
+	(void) printf("bucket   "
+	    "           allocated             "
+	    "          referenced          \n");
+	(void) printf("______   "
+	    "______________________________   "
+	    "______________________________\n");
+
+	(void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+	    "refcnt",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE",
+	    "blocks", "LSIZE", "PSIZE", "DSIZE");
+
+	(void) printf("%6s   %6s   %5s   %5s   %5s   %6s   %5s   %5s   %5s\n",
+	    "------",
+	    "------", "-----", "-----", "-----",
+	    "------", "-----", "-----", "-----");
+
+	for (h = 0; h < 64; h++)
+		dump_ddt_stat(&ddh->ddh_stat[h], h);
+
+	dump_ddt_stat(dds_total, -1);
+
+	(void) printf("\n");
+}
--- a/lib/libzfs/libzfs_util.c
+++ b/lib/libzfs/libzfs_util.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -94,8 +93,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
 	case EZFS_VOLTOOBIG:
 		return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
 		    "this system"));
-	case EZFS_VOLHASDATA:
-		return (dgettext(TEXT_DOMAIN, "volume has data"));
 	case EZFS_INVALIDNAME:
 		return (dgettext(TEXT_DOMAIN, "invalid name"));
 	case EZFS_BADRESTORE:
@ -138,16 +135,12 @@ libzfs_error_description(libzfs_handle_t *hdl)
 		return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
 	case EZFS_SHARESMBFAILED:
 		return (dgettext(TEXT_DOMAIN, "smb add share failed"));
-	case EZFS_ISCSISVCUNAVAIL:
-		return (dgettext(TEXT_DOMAIN,
-		    "iscsitgt service need to be enabled by "
-		    "a privileged user"));
-	case EZFS_DEVLINKS:
-		return (dgettext(TEXT_DOMAIN, "failed to create /dev links"));
 	case EZFS_PERM:
 		return (dgettext(TEXT_DOMAIN, "permission denied"));
 	case EZFS_NOSPC:
 		return (dgettext(TEXT_DOMAIN, "out of space"));
+	case EZFS_FAULT:
+		return (dgettext(TEXT_DOMAIN, "bad address"));
 	case EZFS_IO:
 		return (dgettext(TEXT_DOMAIN, "I/O error"));
 	case EZFS_INTR:
@ -161,12 +154,6 @@ libzfs_error_description(libzfs_handle_t *hdl)
 		return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
 	case EZFS_NOHISTORY:
 		return (dgettext(TEXT_DOMAIN, "no history available"));
-	case EZFS_UNSHAREISCSIFAILED:
-		return (dgettext(TEXT_DOMAIN,
-		    "iscsitgtd failed request to unshare"));
-	case EZFS_SHAREISCSIFAILED:
-		return (dgettext(TEXT_DOMAIN,
-		    "iscsitgtd failed request to share"));
 	case EZFS_POOLPROPS:
 		return (dgettext(TEXT_DOMAIN, "failed to retrieve "
 		    "pool properties"));
@ -218,6 +205,20 @@ libzfs_error_description(libzfs_handle_t *hdl)
 	case EZFS_REFTAG_HOLD:
 		return (dgettext(TEXT_DOMAIN, "tag already exists on this "
 		    "dataset"));
+	case EZFS_TAGTOOLONG:
+		return (dgettext(TEXT_DOMAIN, "tag too long"));
+	case EZFS_PIPEFAILED:
+		return (dgettext(TEXT_DOMAIN, "pipe create failed"));
+	case EZFS_THREADCREATEFAILED:
+		return (dgettext(TEXT_DOMAIN, "thread create failed"));
+	case EZFS_POSTSPLIT_ONLINE:
+		return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
+		    "into a new one"));
+	case EZFS_SCRUBBING:
+		return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
+		    "use 'zpool scrub -s' to cancel current scrub"));
+	case EZFS_NO_SCRUB:
+		return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
 	case EZFS_UNKNOWN:
 		return (dgettext(TEXT_DOMAIN, "unknown error"));
 	default:
@ -306,6 +307,10 @@ zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt,
 		zfs_verror(hdl, EZFS_IO, fmt, ap);
 		return (-1);

+	case EFAULT:
+		zfs_verror(hdl, EZFS_FAULT, fmt, ap);
+		return (-1);
+
 	case EINTR:
 		zfs_verror(hdl, EZFS_INTR, fmt, ap);
 		return (-1);
@ -378,7 +383,7 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
 		zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
 		break;
 	default:
-		zfs_error_aux(hdl, strerror(errno));
+		zfs_error_aux(hdl, strerror(error));
 		zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
 		break;
 	}
@ -610,6 +615,7 @@ libzfs_fini(libzfs_handle_t *hdl)
 	if (hdl->libzfs_log_str)
 		(void) free(hdl->libzfs_log_str);
 	zpool_free_handles(hdl);
+	libzfs_fru_clear(hdl, B_TRUE);
 	namespace_clear(hdl);
 	libzfs_mnttab_fini(hdl);
 	free(hdl);
@ -686,7 +692,7 @@ int
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
 	if (len == 0)
-		len = 2048;
+		len = 4*1024;
 	zc->zc_nvlist_dst_size = len;
 	if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
 	    zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == 0)
@ -811,6 +817,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 	    "PROPERTY"));
 	cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN,
 	    "VALUE"));
+	cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN,
+	    "RECEIVED"));
 	cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN,
 	    "SOURCE"));

@ -824,7 +832,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 	 * inheriting from the longest name.  This is acceptable because in the
 	 * majority of cases 'SOURCE' is the last column displayed, and we don't
 	 * use the width anyway.  Note that the 'VALUE' column can be oversized,
-	 * if the name of the property is much longer the any values we find.
+	 * if the name of the property is much longer than any values we find.
 	 */
 	for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
 		/*
@ -855,6 +863,11 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 		    pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE])
 			cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width;

+		/* 'RECEIVED' column. */
+		if (pl != cbp->cb_proplist &&
+		    pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD])
+			cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width;
+
 		/*
 		 * 'NAME' and 'SOURCE' columns
 		 */
@ -870,7 +883,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 	/*
 	 * Now go through and print the headers.
 	 */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < ZFS_GET_NCOLS; i++) {
 		switch (cbp->cb_columns[i]) {
 		case GET_COL_NAME:
 			title = dgettext(TEXT_DOMAIN, "NAME");
@ -881,6 +894,9 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 		case GET_COL_VALUE:
 			title = dgettext(TEXT_DOMAIN, "VALUE");
 			break;
+		case GET_COL_RECVD:
+			title = dgettext(TEXT_DOMAIN, "RECEIVED");
+			break;
 		case GET_COL_SOURCE:
 			title = dgettext(TEXT_DOMAIN, "SOURCE");
 			break;
@ -889,7 +905,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 		}

 		if (title != NULL) {
-			if (i == 3 || cbp->cb_columns[i + 1] == 0)
+			if (i == (ZFS_GET_NCOLS - 1) ||
+			    cbp->cb_columns[i + 1] == GET_COL_NONE)
 				(void) printf("%s", title);
 			else
 				(void) printf("%-*s  ",
@ -907,7 +924,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type)
 void
 zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
    const char *propname, const char *value, zprop_source_t sourcetype,
-    const char *source)
+    const char *source, const char *recvd_value)
 {
 	int i;
 	const char *str = NULL;
@ -922,7 +939,7 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
 	if (cbp->cb_first)
 		zprop_print_headers(cbp, cbp->cb_type);

-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < ZFS_GET_NCOLS; i++) {
 		switch (cbp->cb_columns[i]) {
 		case GET_COL_NAME:
 			str = name;
@ -959,14 +976,21 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
 				    "inherited from %s", source);
 				str = buf;
 				break;
+			case ZPROP_SRC_RECEIVED:
+				str = "received";
+				break;
 			}
 			break;

+		case GET_COL_RECVD:
+			str = (recvd_value == NULL ? "-" : recvd_value);
+			break;
+
 		default:
 			continue;
 		}

-		if (cbp->cb_columns[i + 1] == 0)
+		if (cbp->cb_columns[i + 1] == GET_COL_NONE)
 			(void) printf("%s", str);
 		else if (cbp->cb_scripted)
 			(void) printf("%s\t", str);
@ -974,7 +998,6 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp,
 			(void) printf("%-*s  ",
 			    cbp->cb_colwidths[cbp->cb_columns[i]],
 			    str);
-
 	}

 	(void) printf("\n");
@ -1036,7 +1059,7 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num)
 		return (-1);
 	}

-	/* Rely on stroull() to process the numeric portion.  */
+	/* Rely on strtoull() to process the numeric portion.  */
 	errno = 0;
 	*num = strtoull(value, &end, 10);

--- a/lib/libzpool/include/sys/zfs_context.h
+++ b/lib/libzpool/include/sys/zfs_context.h
@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -75,6 +75,7 @@ extern "C" {
 #include <sys/u8_textprep.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/sysevent/dev.h>
+#include <sys/sunddi.h>

 /*
 * Stack
@ -111,21 +112,27 @@ extern void vpanic(const char *, __va_list);

 #define	fm_panic	panic

+extern int aok;
+
 /* This definition is copied from assert.h. */
 #if defined(__STDC__)
 #if __STDC_VERSION__ - 0 >= 199901L
-#define	verify(EX) (void)((EX) || \
+#define	zverify(EX) (void)((EX) || (aok) || \
 	(__assert_c99(#EX, __FILE__, __LINE__, __func__), 0))
 #else
-#define	verify(EX) (void)((EX) || (__assert(#EX, __FILE__, __LINE__), 0))
+#define	zverify(EX) (void)((EX) || (aok) || \
+	(__assert(#EX, __FILE__, __LINE__), 0))
 #endif /* __STDC_VERSION__ - 0 >= 199901L */
 #else
-#define	verify(EX) (void)((EX) || (_assert("EX", __FILE__, __LINE__), 0))
+#define	zverify(EX) (void)((EX) || (aok) || \
+	(_assert("EX", __FILE__, __LINE__), 0))
 #endif	/* __STDC__ */


-#define	VERIFY	verify
-#define	ASSERT	assert
+#define	VERIFY	zverify
+#define	ASSERT	zverify
+#undef	assert
+#define	assert	zverify

 extern void __assert(const char *, const char *, int);

@ -136,7 +143,7 @@ extern void __assert(const char *, const char *, int);
 #define	VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
 	const TYPE __left = (TYPE)(LEFT); \
 	const TYPE __right = (TYPE)(RIGHT); \
-	if (!(__left OP __right)) { \
+	if (!(__left OP __right) && (!aok)) { \
 		char *__buf = alloca(256); \
 		(void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \
 			#LEFT, #OP, #RIGHT, \
@ -202,6 +209,18 @@ typedef struct kthread kthread_t;
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(func, arg)
 #define	thread_exit() thr_exit(NULL)
+#define	thread_join(t)	panic("libzpool cannot join threads")
+
+#define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+
+/* in libzpool, p0 exists only to have its address taken */
+struct proc {
+	uintptr_t	this_is_never_used_dont_dereference_it;
+};
+
+extern struct proc p0;
+
+#define	PS_NONE		-1

 extern kthread_t *zk_thread_create(void (*func)(), void *arg);

@ -324,20 +343,27 @@ typedef void (task_func_t)(void *);
 #define	TASKQ_PREPOPULATE	0x0001
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
-#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Use dynamic thread scheduling */
+#define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
+#define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */

 #define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
 #define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
+#define	TQ_FRONT	0x08		/* Queue in front */

 extern taskq_t *system_taskq;

 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+#define	taskq_create_proc(a, b, c, d, e, p, f) \
+	    (taskq_create(a, b, c, d, e, f))
+#define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
+	    (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
 extern int	taskq_member(taskq_t *, void *);
 extern void	system_taskq_init(void);
+extern void	system_taskq_fini(void);

 #define	XVA_MAPSIZE	3
 #define	XVA_MAGIC	0x78766174
@ -351,6 +377,7 @@ typedef struct vnode {
 	char		*v_path;
 } vnode_t;

+#define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */

 typedef struct xoptattr {
 	timestruc_t	xoa_createtime;	/* Create time of file */
@ -366,6 +393,8 @@ typedef struct xoptattr {
 	uint8_t		xoa_opaque;
 	uint8_t		xoa_av_quarantined;
 	uint8_t		xoa_av_modified;
+	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+	uint8_t		xoa_reparse;
 } xoptattr_t;

 typedef struct vattr {
@ -412,9 +441,11 @@ typedef struct vsecattr {

 #define	CRCREAT		0

+extern int fop_getattr(vnode_t *vp, vattr_t *vap);
+
 #define	VOP_CLOSE(vp, f, c, o, cr, ct)	0
 #define	VOP_PUTPAGE(vp, of, sz, fl, cr, ct)	0
-#define	VOP_GETATTR(vp, vap, fl, cr, ct)  ((vap)->va_size = (vp)->v_size, 0)
+#define	VOP_GETATTR(vp, vap, fl, cr, ct)  fop_getattr((vp), (vap));

 #define	VOP_FSYNC(vp, f, cr, ct)	fsync((vp)->v_fd)

@ -439,13 +470,18 @@ extern vnode_t *rootdir;
 /*
 * Random stuff
 */
-#define	lbolt	(gethrtime() >> 23)
-#define	lbolt64	(gethrtime() >> 23)
+#define	ddi_get_lbolt()		(gethrtime() >> 23)
+#define	ddi_get_lbolt64()	(gethrtime() >> 23)
 #define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */

 extern void delay(clock_t ticks);

 #define	gethrestime_sec() time(NULL)
+#define	gethrestime(t) \
+	do {\
+		(t)->tv_sec = gethrestime_sec();\
+		(t)->tv_nsec = 0;\
+	} while (0);

 #define	max_ncpus	64

@ -496,6 +532,9 @@ typedef struct callb_cpr {
 #define	zone_dataset_visible(x, y)	(1)
 #define	INGLOBALZONE(z)			(1)

+extern char *kmem_asprintf(const char *fmt, ...);
+#define	strfree(str) kmem_free((str), strlen(str)+1)
+
 /*
 * Hostname information
 */
@ -503,6 +542,9 @@ extern char hw_serial[];	/* for userland-emulated hostid access */
 extern int ddi_strtoul(const char *str, char **nptr, int base,
    unsigned long *result);

+extern int ddi_strtoull(const char *str, char **nptr, int base,
+    u_longlong_t *result);
+
 /* ZFS Boot Related stuff. */

 struct _buf {
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -41,6 +41,7 @@
 * Emulation of kernel services in userland.
 */

+int aok;
 uint64_t physmem;
 vnode_t *rootdir = (vnode_t *)0xabcd1234;
 char hw_serial[HW_HOSTID_LEN];
@ -49,6 +50,9 @@ struct utsname utsname = {
 	"userland", "libzpool", "1", "1", "na"
 };

+/* this only exists to have its address taken */
+struct proc p0;
+
 /*
 * =========================================================================
 * threads
@ -268,7 +272,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
 	clock_t delta;

 top:
-	delta = abstime - lbolt;
+	delta = abstime - ddi_get_lbolt();
 	if (delta <= 0)
 		return (-1);

@ -451,6 +455,24 @@ vn_close(vnode_t *vp)
 	umem_free(vp, sizeof (vnode_t));
 }

+/*
+ * At a minimum we need to update the size since vdev_reopen()
+ * will no longer call vn_openat().
+ */
+int
+fop_getattr(vnode_t *vp, vattr_t *vap)
+{
+	struct stat64 st;
+
+	if (fstat64(vp->v_fd, &st) == -1) {
+		close(vp->v_fd);
+		return (errno);
+	}
+
+	vap->va_size = st.st_size;
+	return (0);
+}
+
 #ifdef ZFS_DEBUG

 /*
@ -761,6 +783,17 @@ ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
 	return (0);
 }

+int
+ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
+{
+	char *end;
+
+	*result = strtoull(str, &end, base);
+	if (*result == 0)
+		return (errno);
+	return (0);
+}
+
 /*
 * =========================================================================
 * kernel emulation setup & teardown
@ -786,7 +819,8 @@ kernel_init(int mode)
 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));

-	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
+	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
+	    (mode & FWRITE) ? gethostid() : 0);

 	VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
 	VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
@ -801,6 +835,8 @@ kernel_fini(void)
 {
 	spa_fini();

+	system_taskq_fini();
+
 	close(random_fd);
 	close(urandom_fd);

@ -866,3 +902,27 @@ ksiddomain_rele(ksiddomain_t *ksid)
 	spa_strfree(ksid->kd_name);
 	umem_free(ksid, sizeof (ksiddomain_t));
 }
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	char *buf;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx) + 1;
+	va_end(adx);
+
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	va_start(adx, fmt);
+	size = vsnprintf(buf, size, fmt, adx);
+	va_end(adx);
+
+	return (buf);
+}
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -49,6 +49,8 @@ struct taskq {
 	int		tq_nalloc;
 	int		tq_minalloc;
 	int		tq_maxalloc;
+	kcondvar_t	tq_maxalloc_cv;
+	int		tq_maxalloc_wait;
 	task_t		*tq_freelist;
 	task_t		tq_task;
 };
@ -57,26 +59,36 @@ static task_t *
 task_alloc(taskq_t *tq, int tqflags)
 {
 	task_t *t;
+	int rv;

-	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+again:	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
 		tq->tq_freelist = t->task_next;
 	} else {
-		mutex_exit(&tq->tq_lock);
 		if (tq->tq_nalloc >= tq->tq_maxalloc) {
-			if (!(tqflags & KM_SLEEP)) {
-				mutex_enter(&tq->tq_lock);
+			if (!(tqflags & KM_SLEEP))
 				return (NULL);
-			}
+
 			/*
 			 * We don't want to exceed tq_maxalloc, but we can't
 			 * wait for other tasks to complete (and thus free up
 			 * task structures) without risking deadlock with
 			 * the caller.  So, we just delay for one second
-			 * to throttle the allocation rate.
+			 * to throttle the allocation rate. If we have tasks
+			 * complete before one second timeout expires then
+			 * taskq_ent_free will signal us and we will
+			 * immediately retry the allocation.
 			 */
-			delay(hz);
+			tq->tq_maxalloc_wait++;
+			rv = cv_timedwait(&tq->tq_maxalloc_cv,
+			    &tq->tq_lock, ddi_get_lbolt() + hz);
+			tq->tq_maxalloc_wait--;
+			if (rv > 0)
+				goto again;		/* signaled */
 		}
+		mutex_exit(&tq->tq_lock);
+
 		t = kmem_alloc(sizeof (task_t), tqflags);
+
 		mutex_enter(&tq->tq_lock);
 		if (t != NULL)
 			tq->tq_nalloc++;
@ -96,6 +108,9 @@ task_free(taskq_t *tq, task_t *t)
 		kmem_free(t, sizeof (task_t));
 		mutex_enter(&tq->tq_lock);
 	}
+
+	if (tq->tq_maxalloc_wait)
+		cv_signal(&tq->tq_maxalloc_cv);
 }

 taskqid_t
@ -114,8 +129,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
 		mutex_exit(&tq->tq_lock);
 		return (0);
 	}
+	if (tqflags & TQ_FRONT) {
+		t->task_next = tq->tq_task.task_next;
+		t->task_prev = &tq->tq_task;
+	} else {
 		t->task_next = &tq->tq_task;
 		t->task_prev = tq->tq_task.task_prev;
+	}
 	t->task_next->task_prev = t;
 	t->task_prev->task_next = t;
 	t->task_func = func;
@ -191,6 +211,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
 	mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
 	tq->tq_flags = flags | TASKQ_ACTIVE;
 	tq->tq_active = nthreads;
 	tq->tq_nthreads = nthreads;
@ -247,6 +268,7 @@ taskq_destroy(taskq_t *tq)
 	mutex_destroy(&tq->tq_lock);
 	cv_destroy(&tq->tq_dispatch_cv);
 	cv_destroy(&tq->tq_wait_cv);
+	cv_destroy(&tq->tq_maxalloc_cv);

 	kmem_free(tq, sizeof (taskq_t));
 }
@ -272,3 +294,10 @@ system_taskq_init(void)
 	system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512,
 	    TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
 }
+
+void
+system_taskq_fini(void)
+{
+	taskq_destroy(system_taskq);
+	system_taskq = NULL; /* defensive */
+}
--- a/lib/libzpool/util.c
+++ b/lib/libzpool/util.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <assert.h>
@ -90,7 +89,7 @@ show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent)
 		if (is_log)
 			prefix = "log ";

-		if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+		if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t **)&vs, &c) != 0)
 			vs = &v0;

--- a/module/avl/avl.c
+++ b/module/avl/avl.c
@ -19,13 +19,10 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
-
 /*
 * AVL - generic AVL tree implementation for kernel use
 *
@ -243,7 +240,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
 *	"void *"  of the found tree node
 */
 void *
-avl_find(avl_tree_t *tree, void *value, avl_index_t *where)
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
 {
 	avl_node_t *node;
 	avl_node_t *prev = NULL;
--- a/module/avl/include/sys/avl.h
+++ b/module/avl/include/sys/avl.h
@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef	_AVL_H
 #define	_AVL_H

-
-
 /*
 * This is a private header file.  Applications should not directly include
 * this file.
@ -163,7 +161,7 @@ extern void avl_create(avl_tree_t *tree,
 * node   - node that has the value being looked for
 * where  - position for use with avl_nearest() or avl_insert(), may be NULL
 */
-extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where);
+extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where);

 /*
 * Insert a node into the tree.
--- a/module/nvpair/include/sys/nvpair.h
+++ b/module/nvpair/include/sys/nvpair.h
@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef	_SYS_NVPAIR_H
 #define	_SYS_NVPAIR_H

-
-
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/va_list.h>
@ -199,6 +197,7 @@ int nvlist_add_double(nvlist_t *, const char *, double);

 int nvlist_remove(nvlist_t *, const char *, data_type_t);
 int nvlist_remove_all(nvlist_t *, const char *);
+int nvlist_remove_nvpair(nvlist_t *, nvpair_t *);

 int nvlist_lookup_boolean(nvlist_t *, const char *);
 int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *);
@ -237,9 +236,11 @@ int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **);
 int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **,
    int *, char **);
 boolean_t nvlist_exists(nvlist_t *, const char *);
+boolean_t nvlist_empty(nvlist_t *);

 /* processing nvpair */
 nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *);
+nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *);
 char *nvpair_name(nvpair_t *);
 data_type_t nvpair_type(nvpair_t *);
 int nvpair_type_is_array(nvpair_t *);
--- a/module/nvpair/nvpair.c
+++ b/module/nvpair/nvpair.c
@ -20,12 +20,10 @@
 */

 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
 #include <sys/stropts.h>
 #include <sys/debug.h>
 #include <sys/isa_defs.h>
@ -692,6 +690,18 @@ nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
 	return (ENOENT);
 }

+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+	if (nvl == NULL || nvp == NULL)
+		return (EINVAL);
+
+	nvp_buf_unlink(nvl, nvp);
+	nvpair_free(nvp);
+	nvp_buf_free(nvl, nvp);
+	return (0);
+}
+
 /*
 * This function calculates the size of an nvpair value.
 *
@ -1162,6 +1172,42 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
 	return (curr != NULL ? &curr->nvi_nvp : NULL);
 }

+nvpair_t *
+nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+	nvpriv_t *priv;
+	i_nvp_t *curr;
+
+	if (nvl == NULL ||
+	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+		return (NULL);
+
+	curr = NVPAIR2I_NVP(nvp);
+
+	if (nvp == NULL)
+		curr = priv->nvp_last;
+	else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+		curr = curr->nvi_prev;
+	else
+		curr = NULL;
+
+	priv->nvp_curr = curr;
+
+	return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+boolean_t
+nvlist_empty(nvlist_t *nvl)
+{
+	nvpriv_t *priv;
+
+	if (nvl == NULL ||
+	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+		return (B_TRUE);
+
+	return (priv->nvp_list == NULL);
+}
+
 char *
 nvpair_name(nvpair_t *nvp)
 {
--- a/module/zcommon/include/sys/fs/zfs.h
+++ b/module/zcommon/include/sys/fs/zfs.h
@ -20,10 +20,11 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_FS_ZFS_H
 #define	_SYS_FS_ZFS_H

@ -86,12 +87,11 @@ typedef enum {
 	ZFS_PROP_READONLY,
 	ZFS_PROP_ZONED,
 	ZFS_PROP_SNAPDIR,
-	ZFS_PROP_ACLMODE,
+	ZFS_PROP_PRIVATE,		/* not exposed to user, temporary */
 	ZFS_PROP_ACLINHERIT,
 	ZFS_PROP_CREATETXG,		/* not exposed to the user */
 	ZFS_PROP_NAME,			/* not exposed to the user */
 	ZFS_PROP_CANMOUNT,
-	ZFS_PROP_SHAREISCSI,
 	ZFS_PROP_ISCSIOPTIONS,		/* not exposed to the user */
 	ZFS_PROP_XATTR,
 	ZFS_PROP_NUMCLONES,		/* not exposed to the user */
@ -116,6 +116,12 @@ typedef enum {
 	ZFS_PROP_STMF_SHAREINFO,	/* not exposed to the user */
 	ZFS_PROP_DEFER_DESTROY,
 	ZFS_PROP_USERREFS,
+	ZFS_PROP_LOGBIAS,
+	ZFS_PROP_UNIQUE,		/* not exposed to the user */
+	ZFS_PROP_OBJSETID,		/* not exposed to the user */
+	ZFS_PROP_DEDUP,
+	ZFS_PROP_MLSLABEL,
+	ZFS_PROP_SYNC,
 	ZFS_NUM_PROPS
 } zfs_prop_t;

@ -138,8 +144,6 @@ extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS];
 typedef enum {
 	ZPOOL_PROP_NAME,
 	ZPOOL_PROP_SIZE,
-	ZPOOL_PROP_USED,
-	ZPOOL_PROP_AVAILABLE,
 	ZPOOL_PROP_CAPACITY,
 	ZPOOL_PROP_ALTROOT,
 	ZPOOL_PROP_HEALTH,
@ -152,6 +156,10 @@ typedef enum {
 	ZPOOL_PROP_FAILUREMODE,
 	ZPOOL_PROP_LISTSNAPS,
 	ZPOOL_PROP_AUTOEXPAND,
+	ZPOOL_PROP_DEDUPDITTO,
+	ZPOOL_PROP_DEDUPRATIO,
+	ZPOOL_PROP_FREE,
+	ZPOOL_PROP_ALLOCATED,
 	ZPOOL_NUM_PROPS
 } zpool_prop_t;

@ -166,10 +174,27 @@ typedef enum {
 	ZPROP_SRC_DEFAULT = 0x2,
 	ZPROP_SRC_TEMPORARY = 0x4,
 	ZPROP_SRC_LOCAL = 0x8,
-	ZPROP_SRC_INHERITED = 0x10
+	ZPROP_SRC_INHERITED = 0x10,
+	ZPROP_SRC_RECEIVED = 0x20
 } zprop_source_t;

-#define	ZPROP_SRC_ALL	0x1f
+#define	ZPROP_SRC_ALL	0x3f
+
+#define	ZPROP_SOURCE_VAL_RECVD	"$recvd"
+#define	ZPROP_N_MORE_ERRORS	"N_MORE_ERRORS"
+/*
+ * Dataset flag implemented as a special entry in the props zap object
+ * indicating that the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties
+ * just as it did in earlier versions, and thereafter, local properties are
+ * preserved.
+ */
+#define	ZPROP_HAS_RECVD		"$hasrecvd"
+
+typedef enum {
+	ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */
+	ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */
+} zprop_errflags_t;

 typedef int (*zprop_func)(int, void *);

@ -191,9 +216,10 @@ boolean_t zfs_prop_setonce(zfs_prop_t);
 const char *zfs_prop_to_name(zfs_prop_t);
 zfs_prop_t zfs_name_to_prop(const char *);
 boolean_t zfs_prop_user(const char *);
-boolean_t zfs_prop_userquota(const char *name);
+boolean_t zfs_prop_userquota(const char *);
 int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **);
 int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *);
+uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed);
 boolean_t zfs_prop_valid_for_type(int, zfs_type_t);

 /*
@ -206,6 +232,7 @@ uint64_t zpool_prop_default_numeric(zpool_prop_t);
 boolean_t zpool_prop_readonly(zpool_prop_t);
 int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
 int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
+uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);

 /*
 * Definitions for the Delegation.
@ -236,6 +263,8 @@ typedef enum {
 #define	ZFS_DELEG_PERM_GID	"gid"
 #define	ZFS_DELEG_PERM_GROUPS	"groups"

+#define	ZFS_MLSLABEL_DEFAULT	"none"
+
 #define	ZFS_SMB_ACL_SRC		"src"
 #define	ZFS_SMB_ACL_TARGET	"target"

@ -245,6 +274,11 @@ typedef enum {
 	ZFS_CANMOUNT_NOAUTO = 2
 } zfs_canmount_type_t;

+typedef enum {
+	ZFS_LOGBIAS_LATENCY = 0,
+	ZFS_LOGBIAS_THROUGHPUT = 1
+} zfs_logbias_op_t;
+
 typedef enum zfs_share_op {
 	ZFS_SHARE_NFS = 0,
 	ZFS_UNSHARE_NFS = 1,
@ -265,6 +299,12 @@ typedef enum zfs_cache_type {
 	ZFS_CACHE_ALL = 2
 } zfs_cache_type_t;

+typedef enum {
+	ZFS_SYNC_STANDARD = 0,
+	ZFS_SYNC_ALWAYS = 1,
+	ZFS_SYNC_DISABLED = 2
+} zfs_sync_type_t;
+

 /*
 * On-disk version number.
@ -287,14 +327,22 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_16			16ULL
 #define	SPA_VERSION_17			17ULL
 #define	SPA_VERSION_18			18ULL
+#define	SPA_VERSION_19			19ULL
+#define	SPA_VERSION_20			20ULL
+#define	SPA_VERSION_21			21ULL
+#define	SPA_VERSION_22			22ULL
+#define	SPA_VERSION_23			23ULL
+#define	SPA_VERSION_24			24ULL
+#define	SPA_VERSION_25			25ULL
+#define	SPA_VERSION_26			26ULL
 /*
 * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
 * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
 * and do the appropriate changes.  Also bump the version number in
 * usr/src/grub/capability.
 */
-#define	SPA_VERSION			SPA_VERSION_18
-#define	SPA_VERSION_STRING		"18"
+#define	SPA_VERSION			SPA_VERSION_26
+#define	SPA_VERSION_STRING		"26"

 /*
 * Symbolic names for the changes that caused a SPA_VERSION switch.
@ -311,7 +359,7 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
 #define	SPA_VERSION_SPARES		SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ2		SPA_VERSION_3
-#define	SPA_VERSION_BPLIST_ACCOUNT	SPA_VERSION_3
+#define	SPA_VERSION_BPOBJ_ACCOUNT	SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
 #define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
 #define	SPA_VERSION_ZPOOL_HISTORY	SPA_VERSION_4
@ -334,6 +382,15 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_STMF_PROP		SPA_VERSION_16
 #define	SPA_VERSION_RAIDZ3		SPA_VERSION_17
 #define	SPA_VERSION_USERREFS		SPA_VERSION_18
+#define	SPA_VERSION_HOLES		SPA_VERSION_19
+#define	SPA_VERSION_ZLE_COMPRESSION	SPA_VERSION_20
+#define	SPA_VERSION_DEDUP		SPA_VERSION_21
+#define	SPA_VERSION_RECVD_PROPS		SPA_VERSION_22
+#define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
+#define	SPA_VERSION_SA			SPA_VERSION_24
+#define	SPA_VERSION_SCAN		SPA_VERSION_25
+#define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
+#define	SPA_VERSION_DEADLISTS		SPA_VERSION_26

 /*
 * ZPL version - rev'd whenever an incompatible on-disk format change
@ -347,8 +404,9 @@ typedef enum zfs_cache_type {
 #define	ZPL_VERSION_2			2ULL
 #define	ZPL_VERSION_3			3ULL
 #define	ZPL_VERSION_4			4ULL
-#define	ZPL_VERSION			ZPL_VERSION_4
-#define	ZPL_VERSION_STRING		"4"
+#define	ZPL_VERSION_5			5ULL
+#define	ZPL_VERSION			ZPL_VERSION_5
+#define	ZPL_VERSION_STRING		"5"

 #define	ZPL_VERSION_INITIAL		ZPL_VERSION_1
 #define	ZPL_VERSION_DIRENT_TYPE		ZPL_VERSION_2
@ -356,6 +414,23 @@ typedef enum zfs_cache_type {
 #define	ZPL_VERSION_NORMALIZATION	ZPL_VERSION_3
 #define	ZPL_VERSION_SYSATTR		ZPL_VERSION_3
 #define	ZPL_VERSION_USERSPACE		ZPL_VERSION_4
+#define	ZPL_VERSION_SA			ZPL_VERSION_5
+
+/* Rewind request information */
+#define	ZPOOL_NO_REWIND		1  /* No policy - default behavior */
+#define	ZPOOL_NEVER_REWIND	2  /* Do not search for best txg or rewind */
+#define	ZPOOL_TRY_REWIND	4  /* Search for best txg, but do not rewind */
+#define	ZPOOL_DO_REWIND		8  /* Rewind to best txg w/in deferred frees */
+#define	ZPOOL_EXTREME_REWIND	16 /* Allow extreme measures to find best txg */
+#define	ZPOOL_REWIND_MASK	28 /* All the possible rewind bits */
+#define	ZPOOL_REWIND_POLICIES	31 /* All the possible policy bits */
+
+typedef struct zpool_rewind_policy {
+	uint32_t	zrp_request;	/* rewind behavior requested */
+	uint64_t	zrp_maxmeta;	/* max acceptable meta-data errors */
+	uint64_t	zrp_maxdata;	/* max acceptable data errors */
+	uint64_t	zrp_txg;	/* specific txg to load */
+} zpool_rewind_policy_t;

 /*
 * The following are configuration names used in the nvlist describing a pool's
@ -380,7 +455,8 @@ typedef enum zfs_cache_type {
 #define	ZPOOL_CONFIG_ASHIFT		"ashift"
 #define	ZPOOL_CONFIG_ASIZE		"asize"
 #define	ZPOOL_CONFIG_DTL		"DTL"
-#define	ZPOOL_CONFIG_STATS		"stats"
+#define	ZPOOL_CONFIG_SCAN_STATS		"scan_stats"	/* not stored on disk */
+#define	ZPOOL_CONFIG_VDEV_STATS		"vdev_stats"	/* not stored on disk */
 #define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
 #define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
 #define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
@ -393,6 +469,17 @@ typedef enum zfs_cache_type {
 #define	ZPOOL_CONFIG_PHYS_PATH		"phys_path"
 #define	ZPOOL_CONFIG_IS_LOG		"is_log"
 #define	ZPOOL_CONFIG_L2CACHE		"l2cache"
+#define	ZPOOL_CONFIG_HOLE_ARRAY		"hole_array"
+#define	ZPOOL_CONFIG_VDEV_CHILDREN	"vdev_children"
+#define	ZPOOL_CONFIG_IS_HOLE		"is_hole"
+#define	ZPOOL_CONFIG_DDT_HISTOGRAM	"ddt_histogram"
+#define	ZPOOL_CONFIG_DDT_OBJ_STATS	"ddt_object_stats"
+#define	ZPOOL_CONFIG_DDT_STATS		"ddt_stats"
+#define	ZPOOL_CONFIG_SPLIT		"splitcfg"
+#define	ZPOOL_CONFIG_ORIG_GUID		"orig_guid"
+#define	ZPOOL_CONFIG_SPLIT_GUID		"split_guid"
+#define	ZPOOL_CONFIG_SPLIT_LIST		"guid_list"
+#define	ZPOOL_CONFIG_REMOVING		"removing"
 #define	ZPOOL_CONFIG_SUSPENDED		"suspended"	/* not stored on disk */
 #define	ZPOOL_CONFIG_TIMESTAMP		"timestamp"	/* not stored on disk */
 #define	ZPOOL_CONFIG_BOOTFS		"bootfs"	/* not stored on disk */
@ -406,6 +493,19 @@ typedef enum zfs_cache_type {
 #define	ZPOOL_CONFIG_DEGRADED		"degraded"
 #define	ZPOOL_CONFIG_REMOVED		"removed"
 #define	ZPOOL_CONFIG_FRU		"fru"
+#define	ZPOOL_CONFIG_AUX_STATE		"aux_state"
+
+/* Rewind policy parameters */
+#define	ZPOOL_REWIND_POLICY		"rewind-policy"
+#define	ZPOOL_REWIND_REQUEST		"rewind-request"
+#define	ZPOOL_REWIND_REQUEST_TXG	"rewind-request-txg"
+#define	ZPOOL_REWIND_META_THRESH	"rewind-meta-thresh"
+#define	ZPOOL_REWIND_DATA_THRESH	"rewind-data-thresh"
+
+/* Rewind data discovered */
+#define	ZPOOL_CONFIG_LOAD_TIME		"rewind_txg_ts"
+#define	ZPOOL_CONFIG_LOAD_DATA_ERRORS	"verify_data_errors"
+#define	ZPOOL_CONFIG_REWIND_TIME	"seconds_of_rewind"

 #define	VDEV_TYPE_ROOT			"root"
 #define	VDEV_TYPE_MIRROR		"mirror"
@ -414,6 +514,7 @@ typedef enum zfs_cache_type {
 #define	VDEV_TYPE_DISK			"disk"
 #define	VDEV_TYPE_FILE			"file"
 #define	VDEV_TYPE_MISSING		"missing"
+#define	VDEV_TYPE_HOLE			"hole"
 #define	VDEV_TYPE_SPARE			"spare"
 #define	VDEV_TYPE_LOG			"log"
 #define	VDEV_TYPE_L2CACHE		"l2cache"
@ -463,7 +564,9 @@ typedef enum vdev_aux {
 	VDEV_AUX_SPARED,	/* hot spare used in another pool	*/
 	VDEV_AUX_ERR_EXCEEDED,	/* too many errors			*/
 	VDEV_AUX_IO_FAILURE,	/* experienced I/O failure		*/
-	VDEV_AUX_BAD_LOG	/* cannot read log chain(s)		*/
+	VDEV_AUX_BAD_LOG,	/* cannot read log chain(s)		*/
+	VDEV_AUX_EXTERNAL,	/* external diagnosis			*/
+	VDEV_AUX_SPLIT_POOL	/* vdev was split off into another pool	*/
 } vdev_aux_t;

 /*
@ -484,14 +587,14 @@ typedef enum pool_state {
 } pool_state_t;

 /*
- * Scrub types.
+ * Scan Functions.
 */
-typedef enum pool_scrub_type {
-	POOL_SCRUB_NONE,
-	POOL_SCRUB_RESILVER,
-	POOL_SCRUB_EVERYTHING,
-	POOL_SCRUB_TYPES
-} pool_scrub_type_t;
+typedef enum pool_scan_func {
+	POOL_SCAN_NONE,
+	POOL_SCAN_SCRUB,
+	POOL_SCAN_RESILVER,
+	POOL_SCAN_FUNCS
+} pool_scan_func_t;

 /*
 * ZIO types.  Needed to interpret vdev statistics below.
@ -506,6 +609,36 @@ typedef enum zio_type {
 	ZIO_TYPES
 } zio_type_t;

+/*
+ * Pool statistics.  Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+	/* values stored on disk */
+	uint64_t	pss_func;	/* pool_scan_func_t */
+	uint64_t	pss_state;	/* dsl_scan_state_t */
+	uint64_t	pss_start_time;	/* scan start time */
+	uint64_t	pss_end_time;	/* scan end time */
+	uint64_t	pss_to_examine;	/* total bytes to scan */
+	uint64_t	pss_examined;	/* total examined bytes	*/
+	uint64_t	pss_to_process; /* total bytes to process */
+	uint64_t	pss_processed;	/* total processed bytes */
+	uint64_t	pss_errors;	/* scan errors	*/
+
+	/* values not stored on disk */
+	uint64_t	pss_pass_exam;	/* examined bytes per scan pass */
+	uint64_t	pss_pass_start;	/* start time of a scan pass */
+} pool_scan_stat_t;
+
+typedef enum dsl_scan_state {
+	DSS_NONE,
+	DSS_SCANNING,
+	DSS_FINISHED,
+	DSS_CANCELED,
+	DSS_NUM_STATES
+} dsl_scan_state_t;
+
+
 /*
 * Vdev statistics.  Note: all fields should be 64-bit because this
 * is passed between kernel and userland as an nvlist uint64 array.
@ -524,34 +657,49 @@ typedef struct vdev_stat {
 	uint64_t	vs_write_errors;	/* write errors		*/
 	uint64_t	vs_checksum_errors;	/* checksum errors	*/
 	uint64_t	vs_self_healed;		/* self-healed bytes	*/
-	uint64_t	vs_scrub_type;		/* pool_scrub_type_t	*/
-	uint64_t	vs_scrub_complete;	/* completed?		*/
-	uint64_t	vs_scrub_examined;	/* bytes examined; top	*/
-	uint64_t	vs_scrub_repaired;	/* bytes repaired; leaf	*/
-	uint64_t	vs_scrub_errors;	/* errors during scrub	*/
-	uint64_t	vs_scrub_start;		/* UTC scrub start time	*/
-	uint64_t	vs_scrub_end;		/* UTC scrub end time	*/
+	uint64_t	vs_scan_removing;	/* removing?	*/
+	uint64_t	vs_scan_processed;	/* scan processed bytes	*/
 } vdev_stat_t;

+/*
+ * DDT statistics.  Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct ddt_object {
+	uint64_t	ddo_count;	/* number of elments in ddt 	*/
+	uint64_t	ddo_dspace;	/* size of ddt on disk		*/
+	uint64_t	ddo_mspace;	/* size of ddt in-core		*/
+} ddt_object_t;
+
+typedef struct ddt_stat {
+	uint64_t	dds_blocks;	/* blocks			*/
+	uint64_t	dds_lsize;	/* logical size			*/
+	uint64_t	dds_psize;	/* physical size		*/
+	uint64_t	dds_dsize;	/* deflated allocated size	*/
+	uint64_t	dds_ref_blocks;	/* referenced blocks		*/
+	uint64_t	dds_ref_lsize;	/* referenced lsize * refcnt	*/
+	uint64_t	dds_ref_psize;	/* referenced psize * refcnt	*/
+	uint64_t	dds_ref_dsize;	/* referenced dsize * refcnt	*/
+} ddt_stat_t;
+
+typedef struct ddt_histogram {
+	ddt_stat_t	ddh_stat[64];	/* power-of-two histogram buckets */
+} ddt_histogram_t;
+
 #define	ZVOL_DRIVER	"zvol"
 #define	ZFS_DRIVER	"zfs"
 #define	ZFS_DEV		"/dev/zfs"

-/*
- * zvol paths.  Irritatingly, the devfsadm interfaces want all these
- * paths without the /dev prefix, but for some things, we want the
- * /dev prefix.  Below are the names without /dev.
- */
-#define	ZVOL_DEV_DIR	"zvol/dsk"
-#define	ZVOL_RDEV_DIR	"zvol/rdsk"
-
-/*
- * And here are the things we need with /dev, etc. in front of them.
- */
+/* general zvol path */
+#define	ZVOL_DIR		"/dev/zvol"
+/* expansion */
 #define	ZVOL_PSEUDO_DEV		"/devices/pseudo/zfs@0:"
-#define	ZVOL_FULL_DEV_DIR	"/dev/" ZVOL_DEV_DIR "/"
+/* for dump and swap */
+#define	ZVOL_FULL_DEV_DIR	ZVOL_DIR "/dsk/"
+#define	ZVOL_FULL_RDEV_DIR	ZVOL_DIR "/rdsk/"

 #define	ZVOL_PROP_NAME		"name"
+#define	ZVOL_DEFAULT_BLOCKSIZE	8192

 /*
 * /dev/zfs ioctl numbers.
@ -566,7 +714,7 @@ typedef enum zfs_ioc {
 	ZFS_IOC_POOL_CONFIGS,
 	ZFS_IOC_POOL_STATS,
 	ZFS_IOC_POOL_TRYIMPORT,
-	ZFS_IOC_POOL_SCRUB,
+	ZFS_IOC_POOL_SCAN,
 	ZFS_IOC_POOL_FREEZE,
 	ZFS_IOC_POOL_UPGRADE,
 	ZFS_IOC_POOL_GET_HISTORY,
@ -582,8 +730,6 @@ typedef enum zfs_ioc {
 	ZFS_IOC_DATASET_LIST_NEXT,
 	ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	ZFS_IOC_SET_PROP,
-	ZFS_IOC_CREATE_MINOR,
-	ZFS_IOC_REMOVE_MINOR,
 	ZFS_IOC_CREATE,
 	ZFS_IOC_DESTROY,
 	ZFS_IOC_ROLLBACK,
@ -604,7 +750,6 @@ typedef enum zfs_ioc {
 	ZFS_IOC_POOL_GET_PROPS,
 	ZFS_IOC_SET_FSACL,
 	ZFS_IOC_GET_FSACL,
-	ZFS_IOC_ISCSI_PERM_CHECK,
 	ZFS_IOC_SHARE,
 	ZFS_IOC_INHERIT_PROP,
 	ZFS_IOC_SMB_ACL,
@ -613,7 +758,9 @@ typedef enum zfs_ioc {
 	ZFS_IOC_USERSPACE_UPGRADE,
 	ZFS_IOC_HOLD,
 	ZFS_IOC_RELEASE,
-	ZFS_IOC_GET_HOLDS
+	ZFS_IOC_GET_HOLDS,
+	ZFS_IOC_OBJSET_RECVD_PROPS,
+	ZFS_IOC_VDEV_SPLIT
 } zfs_ioc_t;

 /*
@ -623,7 +770,9 @@ typedef enum {
 	SPA_LOAD_NONE,		/* no load in progress	*/
 	SPA_LOAD_OPEN,		/* normal open		*/
 	SPA_LOAD_IMPORT,	/* import in progress	*/
-	SPA_LOAD_TRYIMPORT	/* tryimport in progress */
+	SPA_LOAD_TRYIMPORT,	/* tryimport in progress */
+	SPA_LOAD_RECOVER,	/* recovery requested	*/
+	SPA_LOAD_ERROR		/* load failed		*/
 } spa_load_state_t;

 /*
@ -686,7 +835,7 @@ typedef enum {
 /*
 * Note: This is encoded on-disk, so new events must be added to the
 * end, and unused events can not be removed.  Be sure to edit
- * zpool_main.c: hist_event_table[].
+ * libzfs_pool.c: hist_event_table[].
 */
 typedef enum history_internal_events {
 	LOG_NO_EVENT = 0,
@ -703,7 +852,7 @@ typedef enum history_internal_events {
 	LOG_POOL_VDEV_OFFLINE,
 	LOG_POOL_UPGRADE,
 	LOG_POOL_CLEAR,
-	LOG_POOL_SCRUB,
+	LOG_POOL_SCAN,
 	LOG_POOL_PROPSET,
 	LOG_DS_CREATE,
 	LOG_DS_CLONE,
@ -726,9 +875,10 @@ typedef enum history_internal_events {
 	LOG_DS_UPGRADE,
 	LOG_DS_REFQUOTA,
 	LOG_DS_REFRESERV,
-	LOG_POOL_SCRUB_DONE,
+	LOG_POOL_SCAN_DONE,
 	LOG_DS_USER_HOLD,
 	LOG_DS_USER_RELEASE,
+	LOG_POOL_SPLIT,
 	LOG_END
 } history_internal_events_t;

--- a/module/zcommon/include/zfs_comutil.h
+++ b/module/zcommon/include/zfs_comutil.h
@ -19,15 +19,12 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_ZFS_COMUTIL_H
 #define	_ZFS_COMUTIL_H

-
-
 #include <sys/fs/zfs.h>
 #include <sys/types.h>

@ -35,7 +32,12 @@
 extern "C" {
 #endif

-extern boolean_t zfs_allocatable_devs(nvlist_t *nv);
+extern boolean_t zfs_allocatable_devs(nvlist_t *);
+extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
+
+extern int zfs_zpl_version_map(int spa_version);
+extern int zfs_spa_version_map(int zpl_version);
+extern const char *zfs_history_event_names[LOG_END];

 #ifdef	__cplusplus
 }
--- a/module/zcommon/include/zfs_fletcher.h
+++ b/module/zcommon/include/zfs_fletcher.h
@ -0,0 +1,53 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_ZFS_FLETCHER_H
+#define	_ZFS_FLETCHER_H
+
+#include <sys/types.h>
+#include <sys/spa.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * fletcher checksum functions
+ */
+
+void fletcher_2_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_incremental_native(const void *, uint64_t,
+    zio_cksum_t *);
+void fletcher_4_incremental_byteswap(const void *, uint64_t,
+    zio_cksum_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_FLETCHER_H */
--- a/module/zcommon/include/zfs_prop.h
+++ b/module/zcommon/include/zfs_prop.h
@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef	_ZFS_PROP_H
 #define	_ZFS_PROP_H

-
-
 #include <sys/fs/zfs.h>
 #include <sys/types.h>

@ -79,6 +77,7 @@ typedef struct {
 					/* "zfs get" help message */
 	const zprop_index_t *pd_table;	/* for index properties, a table */
 					/* defining the possible values */
+	size_t pd_table_size;		/* number of entries in pd_table[] */
 } zprop_desc_t;

 /*
@ -99,16 +98,16 @@ zprop_desc_t *zpool_prop_get_table(void);
 /*
 * Common routines to initialize property tables
 */
-void register_impl(int, const char *, zprop_type_t, uint64_t,
+void zprop_register_impl(int, const char *, zprop_type_t, uint64_t,
    const char *, zprop_attr_t, int, const char *, const char *,
    boolean_t, boolean_t, const zprop_index_t *);
-void register_string(int, const char *, const char *, zprop_attr_t attr,
-    int, const char *, const char *);
-void register_number(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_string(int, const char *, const char *,
+    zprop_attr_t attr, int, const char *, const char *);
+void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int,
    const char *, const char *);
-void register_index(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int,
    const char *, const char *, const zprop_index_t *);
-void register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
    int, const char *);

 /*
@ -118,6 +117,7 @@ int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t);
 int zprop_name_to_prop(const char *, zfs_type_t);
 int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t);
 int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t);
+uint64_t zprop_random_value(int, uint64_t, zfs_type_t);
 const char *zprop_values(int, zfs_type_t);
 size_t zprop_width(int, boolean_t *, zfs_type_t);
 boolean_t zprop_valid_for_type(int, zfs_type_t);
--- a/module/zcommon/zfs_comutil.c
+++ b/module/zcommon/zfs_comutil.c
@ -19,12 +19,9 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

-
-
 /*
 * This file is intended for functions that ought to be common between user
 * land (libzfs) and the kernel. When many common routines need to be shared
@ -33,11 +30,15 @@

 #if defined(_KERNEL)
 #include <sys/systm.h>
+#else
+#include <string.h>
 #endif

 #include <sys/types.h>
 #include <sys/fs/zfs.h>
+#include <sys/int_limits.h>
 #include <sys/nvpair.h>
+#include "zfs_comutil.h"

 /*
 * Are there allocatable vdevs?
@ -64,6 +65,142 @@ zfs_allocatable_devs(nvlist_t *nv)
 	return (B_FALSE);
 }

+void
+zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp)
+{
+	nvlist_t *policy;
+	nvpair_t *elem;
+	char *nm;
+
+	/* Defaults */
+	zrpp->zrp_request = ZPOOL_NO_REWIND;
+	zrpp->zrp_maxmeta = 0;
+	zrpp->zrp_maxdata = UINT64_MAX;
+	zrpp->zrp_txg = UINT64_MAX;
+
+	if (nvl == NULL)
+		return;
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+		nm = nvpair_name(elem);
+		if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) {
+			if (nvpair_value_nvlist(elem, &policy) == 0)
+				zpool_get_rewind_policy(policy, zrpp);
+			return;
+		} else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) {
+			if (nvpair_value_uint32(elem, &zrpp->zrp_request) == 0)
+				if (zrpp->zrp_request & ~ZPOOL_REWIND_POLICIES)
+					zrpp->zrp_request = ZPOOL_NO_REWIND;
+		} else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) {
+			(void) nvpair_value_uint64(elem, &zrpp->zrp_txg);
+		} else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) {
+			(void) nvpair_value_uint64(elem, &zrpp->zrp_maxmeta);
+		} else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) {
+			(void) nvpair_value_uint64(elem, &zrpp->zrp_maxdata);
+		}
+	}
+	if (zrpp->zrp_request == 0)
+		zrpp->zrp_request = ZPOOL_NO_REWIND;
+}
+
+typedef struct zfs_version_spa_map {
+	int	version_zpl;
+	int	version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+	{ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+	{ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+	{ZPL_VERSION_FUID, SPA_VERSION_FUID},
+	{ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+	{ZPL_VERSION_SA, SPA_VERSION_SA},
+	{0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+	int i;
+	int version = -1;
+
+	for (i = 0; zfs_version_table[i].version_spa; i++) {
+		if (spa_version >= zfs_version_table[i].version_spa)
+			version = zfs_version_table[i].version_zpl;
+	}
+
+	return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+	int i;
+	int version = -1;
+
+	for (i = 0; zfs_version_table[i].version_zpl; i++) {
+		if (zfs_version_table[i].version_zpl >= zpl_version)
+			return (zfs_version_table[i].version_spa);
+	}
+
+	return (version);
+}
+
+const char *zfs_history_event_names[LOG_END] = {
+	"invalid event",
+	"pool create",
+	"vdev add",
+	"pool remove",
+	"pool destroy",
+	"pool export",
+	"pool import",
+	"vdev attach",
+	"vdev replace",
+	"vdev detach",
+	"vdev online",
+	"vdev offline",
+	"vdev upgrade",
+	"pool clear",
+	"pool scrub",
+	"pool property set",
+	"create",
+	"clone",
+	"destroy",
+	"destroy_begin_sync",
+	"inherit",
+	"property set",
+	"quota set",
+	"permission update",
+	"permission remove",
+	"permission who remove",
+	"promote",
+	"receive",
+	"rename",
+	"reservation set",
+	"replay_inc_sync",
+	"replay_full_sync",
+	"rollback",
+	"snapshot",
+	"filesystem version upgrade",
+	"refquota set",
+	"refreservation set",
+	"pool scrub done",
+	"user hold",
+	"user release",
+	"pool split",
+};
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(zfs_allocatable_devs);
 #endif
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@ -128,6 +128,7 @@
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/byteorder.h>
+#include <sys/zio.h>
 #include <sys/spa.h>

 void
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@ -19,10 +19,11 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/u8_textprep.h>
@ -69,6 +70,16 @@ zfs_prop_init(void)
 		{ NULL }
 	};

+	static zprop_index_t dedup_table[] = {
+		{ "on",		ZIO_CHECKSUM_ON },
+		{ "off",	ZIO_CHECKSUM_OFF },
+		{ "verify",	ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+		{ "sha256",	ZIO_CHECKSUM_SHA256 },
+		{ "sha256,verify",
+				ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+		{ NULL }
+	};
+
 	static zprop_index_t compress_table[] = {
 		{ "on",		ZIO_COMPRESS_ON },
 		{ "off",	ZIO_COMPRESS_OFF },
@ -83,6 +94,7 @@ zfs_prop_init(void)
 		{ "gzip-7",	ZIO_COMPRESS_GZIP_7 },
 		{ "gzip-8",	ZIO_COMPRESS_GZIP_8 },
 		{ "gzip-9",	ZIO_COMPRESS_GZIP_9 },
+		{ "zle",	ZIO_COMPRESS_ZLE },
 		{ NULL }
 	};

@ -92,13 +104,6 @@ zfs_prop_init(void)
 		{ NULL }
 	};

-	static zprop_index_t acl_mode_table[] = {
-		{ "discard",	ZFS_ACL_DISCARD },
-		{ "groupmask",	ZFS_ACL_GROUPMASK },
-		{ "passthrough", ZFS_ACL_PASSTHROUGH },
-		{ NULL }
-	};
-
 	static zprop_index_t acl_inherit_table[] = {
 		{ "discard",	ZFS_ACL_DISCARD },
 		{ "noallow",	ZFS_ACL_NOALLOW },
@ -142,6 +147,7 @@ zfs_prop_init(void)
 		{ "2",		2 },
 		{ "3",		3 },
 		{ "4",		4 },
+		{ "5",		5 },
 		{ "current",	ZPL_VERSION },
 		{ NULL }
 	};
@ -152,6 +158,12 @@ zfs_prop_init(void)
 		{ NULL }
 	};

+	static zprop_index_t logbias_table[] = {
+		{ "latency",	ZFS_LOGBIAS_LATENCY },
+		{ "throughput",	ZFS_LOGBIAS_THROUGHPUT },
+		{ NULL }
+	};
+
 	static zprop_index_t canmount_table[] = {
 		{ "off",	ZFS_CANMOUNT_OFF },
 		{ "on",		ZFS_CANMOUNT_ON },
@ -166,170 +178,208 @@ zfs_prop_init(void)
 		{ NULL }
 	};

+	static zprop_index_t sync_table[] = {
+		{ "standard",	ZFS_SYNC_STANDARD },
+		{ "always",	ZFS_SYNC_ALWAYS },
+		{ "disabled",	ZFS_SYNC_DISABLED },
+		{ NULL }
+	};
+
 	/* inherit index properties */
-	register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT,
+	zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "standard | always | disabled", "SYNC",
+	    sync_table);
+	zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+	    ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME,
 	    "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
 	    checksum_table);
-	register_index(ZFS_PROP_COMPRESSION, "compression",
+	zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "on | off | verify | sha256[,verify]", "DEDUP",
+	    dedup_table);
+	zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
 	    ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table);
-	register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+	    "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS",
+	    compress_table);
+	zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "hidden | visible", "SNAPDIR", snapdir_table);
-	register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK,
-	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
-	    "discard | groupmask | passthrough", "ACLMODE", acl_mode_table);
-	register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED,
-	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+	zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+	    ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "discard | noallow | restricted | passthrough | passthrough-x",
 	    "ACLINHERIT", acl_inherit_table);
-	register_index(ZFS_PROP_COPIES, "copies", 1,
-	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "1 | 2 | 3", "COPIES", copies_table);
-	register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+	zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
 	    ZFS_CACHE_ALL, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "all | none | metadata", "PRIMARYCACHE", cache_table);
-	register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+	zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
 	    ZFS_CACHE_ALL, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "all | none | metadata", "SECONDARYCACHE", cache_table);
+	zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "latency | throughput", "LOGBIAS", logbias_table);

 	/* inherit index (boolean) properties */
-	register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
-	register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
 	    boolean_table);
-	register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
 	    boolean_table);
-	register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
 	    boolean_table);
-	register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
 	    boolean_table);
-	register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
-	register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
 	    boolean_table);
-	register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
 	    boolean_table);
-	register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
 	    boolean_table);

 	/* default index properties */
-	register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+	zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "1 | 2 | 3 | 4 | current", "VERSION", version_table);
-	register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+	zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
 	    "CANMOUNT", canmount_table);

 	/* readonly index (boolean) properties */
-	register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+	zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
-	register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+	zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
 	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
 	    boolean_table);

 	/* set once index properties */
-	register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+	zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
 	    PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "none | formC | formD | formKC | formKD", "NORMALIZATION",
 	    normalize_table);
-	register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE,
-	    PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+	zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+	    ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_SNAPSHOT,
 	    "sensitive | insensitive | mixed", "CASE", case_table);

 	/* set once index (boolean) properties */
-	register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+	zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "on | off", "UTF8ONLY", boolean_table);

 	/* string properties */
-	register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+	zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
-	register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT");
-	register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS");
-	register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT,
-	    ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI");
-	register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+	zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+	    "MOUNTPOINT");
+	zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
+	    "SHARENFS");
+	zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE");
-	register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB");
+	zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+	    "on | off | sharemgr(1M) options", "SHARESMB");
+	zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+	    ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+	    "<sensitivity label>", "MLSLABEL");

 	/* readonly number properties */
-	register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+	zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "<size>", "USED");
-	register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+	zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
-	register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY,
-	    ZFS_TYPE_DATASET, "<size>", "REFER");
-	register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+	zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
+	zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET,
 	    "<1.00x or higher if compressed>", "RATIO");
-	register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192,
-	    PROP_ONETIME,
+	zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+	    ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
 	    ZFS_TYPE_VOLUME, "512 to 128k, power of 2",	"VOLBLOCK");
-	register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP");
-	register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS");
-	register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD");
-	register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+	zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+	    "USEDSNAP");
+	zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+	    "USEDDS");
+	zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+	    "USEDCHILD");
+	zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
 	    PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
-	register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+	zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
 	    ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");

 	/* default number properties */
-	register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+	zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
-	register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
-	register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+	zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<size> | none", "RESERV");
+	zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
 	    ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
-	register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+	zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
-	register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+	zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<size> | none", "REFRESERV");

 	/* inherit number properties */
-	register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
-	    PROP_INHERIT,
+	zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+	    SPA_MAXBLOCKSIZE, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");

 	/* hidden properties */
-	register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_DATASET, NULL);
-	register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL);
-	register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+	zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG");
+	zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
+	zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
-	register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
-	    PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
-	register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+	zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+	    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+	zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
 	    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
 	    "STMF_SBD_LU");
-	register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
-	    ZFS_TYPE_DATASET, "GUID");
-	register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
-	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, NULL);
+	zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "GUID");
+	zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+	    "USERACCOUNTING");
+	zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
+	zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
+
+	/*
+	 * Property to be removed once libbe is integrated
+	 */
+	zprop_register_hidden(ZFS_PROP_PRIVATE, "priv_prop",
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_FILESYSTEM,
+	    "PRIV_PROP");

 	/* oddball properties */
-	register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
-	    PROP_READONLY, ZFS_TYPE_DATASET,
+	zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+	    NULL, PROP_READONLY, ZFS_TYPE_DATASET,
 	    "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
 }

@ -337,6 +387,11 @@ boolean_t
 zfs_prop_delegatable(zfs_prop_t prop)
 {
 	zprop_desc_t *pd = &zfs_prop_table[prop];
+
+	/* The mlslabel property is never delegatable. */
+	if (prop == ZFS_PROP_MLSLABEL)
+		return (B_FALSE);
+
 	return (pd->pd_attr != PROP_READONLY);
 }

@ -421,6 +476,12 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
 	return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
 }

+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+	return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
+
 /*
 * Returns TRUE if the property applies to any of the given dataset types.
 */
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -64,48 +64,55 @@ zpool_prop_init(void)
 	};

 	/* string properties */
-	register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+	zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
 	    ZFS_TYPE_POOL, "<path>", "ALTROOT");
-	register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+	zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
 	    ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
-	register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+	zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");

 	/* readonly number properties */
-	register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "SIZE");
-	register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY,
-	    ZFS_TYPE_POOL, "<size>", "USED");
-	register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY,
-	    ZFS_TYPE_POOL, "<size>", "AVAIL");
-	register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+	    ZFS_TYPE_POOL, "<size>", "FREE");
+	zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+	    PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+	zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "CAP");
-	register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<guid>", "GUID");
-	register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<state>", "HEALTH");
+	zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+	    PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+	    "DEDUP");

 	/* default number properties */
-	register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+	zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+	zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");

 	/* default index (boolean) properties */
-	register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table);
-	register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
-	register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
-	register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+	zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+	    boolean_table);
+	zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+	zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+	    boolean_table);
+	zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);

 	/* default index properties */
-	register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+	zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
 	    ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
 	    "wait | continue | panic", "FAILMODE", failuremode_table);

 	/* hidden properties */
-	register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+	zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
 }

@ -166,6 +173,12 @@ zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
 	return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
 }

+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+	return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
 #ifndef _KERNEL

 const char *
--- a/module/zcommon/zprop_common.c
+++ b/module/zcommon/zprop_common.c
@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -65,7 +65,7 @@ zprop_get_numprops(zfs_type_t type)
 }

 void
-register_impl(int prop, const char *name, zprop_type_t type,
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
    uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
    int objset_types, const char *values, const char *colname,
    boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
@ -76,6 +76,8 @@ register_impl(int prop, const char *name, zprop_type_t type,
 	pd = &prop_tbl[prop];

 	ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+	ASSERT(name != NULL);
+	ASSERT(colname != NULL);

 	pd->pd_name = name;
 	pd->pd_propnum = prop;
@ -89,40 +91,44 @@ register_impl(int prop, const char *name, zprop_type_t type,
 	pd->pd_rightalign = rightalign;
 	pd->pd_visible = visible;
 	pd->pd_table = idx_tbl;
+	pd->pd_table_size = 0;
+	while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+		pd->pd_table_size++;
 }

 void
-register_string(int prop, const char *name, const char *def,
+zprop_register_string(int prop, const char *name, const char *def,
    zprop_attr_t attr, int objset_types, const char *values,
    const char *colname)
 {
-	register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+	zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
 	    objset_types, values, colname, B_FALSE, B_TRUE, NULL);

 }

 void
-register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr,
-    int objset_types, const char *values, const char *colname)
+zprop_register_number(int prop, const char *name, uint64_t def,
+    zprop_attr_t attr, int objset_types, const char *values,
+    const char *colname)
 {
-	register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+	zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
 	    objset_types, values, colname, B_TRUE, B_TRUE, NULL);
 }

 void
-register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr,
-    int objset_types, const char *values, const char *colname,
-    const zprop_index_t *idx_tbl)
+zprop_register_index(int prop, const char *name, uint64_t def,
+    zprop_attr_t attr, int objset_types, const char *values,
+    const char *colname, const zprop_index_t *idx_tbl)
 {
-	register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+	zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
 	    objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
 }

 void
-register_hidden(int prop, const char *name, zprop_type_t type,
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
    zprop_attr_t attr, int objset_types, const char *colname)
 {
-	register_impl(prop, name, type, 0, NULL, attr,
+	zprop_register_impl(prop, name, type, 0, NULL, attr,
 	    objset_types, NULL, colname, B_FALSE, B_FALSE, NULL);
 }

@ -307,6 +313,25 @@ zprop_index_to_string(int prop, uint64_t index, const char **string,
 	return (-1);
 }

+/*
+ * Return a random valid property value.  Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+	zprop_desc_t *prop_tbl;
+	const zprop_index_t *idx_tbl;
+
+	ASSERT((uint_t)prop < zprop_get_numprops(type));
+	prop_tbl = zprop_get_proptable(type);
+	idx_tbl = prop_tbl[prop].pd_table;
+
+	if (idx_tbl == NULL)
+		return (seed);
+
+	return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
 const char *
 zprop_values(int prop, zfs_type_t type)
 {
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
--- a/module/zfs/bplist.c
+++ b/module/zfs/bplist.c
@ -19,331 +19,51 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/bplist.h>
 #include <sys/zfs_context.h>

-static int
-bplist_hold(bplist_t *bpl)
+
+void
+bplist_create(bplist_t *bpl)
 {
-	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
-	if (bpl->bpl_dbuf == NULL) {
-		int err = dmu_bonus_hold(bpl->bpl_mos,
-		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
-		if (err)
-			return (err);
-		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
-	}
-	return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
-	int size;
-
-	size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
-	    BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
-	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
-	    DMU_OT_BPLIST_HDR, size, tx));
+	mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+	    offsetof(bplist_entry_t, bpe_node));
 }

 void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+bplist_destroy(bplist_t *bpl)
 {
-	VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
-	dmu_object_info_t doi;
-	int err;
-
-	err = dmu_object_info(mos, object, &doi);
-	if (err)
-		return (err);
-
-	mutex_enter(&bpl->bpl_lock);
-
-	ASSERT(bpl->bpl_dbuf == NULL);
-	ASSERT(bpl->bpl_phys == NULL);
-	ASSERT(bpl->bpl_cached_dbuf == NULL);
-	ASSERT(bpl->bpl_queue == NULL);
-	ASSERT(object != 0);
-	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
-	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
-	bpl->bpl_mos = mos;
-	bpl->bpl_object = object;
-	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
-	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
-	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
-	mutex_exit(&bpl->bpl_lock);
-	return (0);
+	list_destroy(&bpl->bpl_list);
+	mutex_destroy(&bpl->bpl_lock);
 }

 void
-bplist_close(bplist_t *bpl)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
 {
-	mutex_enter(&bpl->bpl_lock);
-
-	ASSERT(bpl->bpl_queue == NULL);
-
-	if (bpl->bpl_cached_dbuf) {
-		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-		bpl->bpl_cached_dbuf = NULL;
-	}
-	if (bpl->bpl_dbuf) {
-		dmu_buf_rele(bpl->bpl_dbuf, bpl);
-		bpl->bpl_dbuf = NULL;
-		bpl->bpl_phys = NULL;
-	}
-
-	mutex_exit(&bpl->bpl_lock);
-}
-
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
-	boolean_t rv;
-
-	if (bpl->bpl_object == 0)
-		return (B_TRUE);
+	bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);

 	mutex_enter(&bpl->bpl_lock);
-	VERIFY(0 == bplist_hold(bpl)); /* XXX */
-	rv = (bpl->bpl_phys->bpl_entries == 0);
-	mutex_exit(&bpl->bpl_lock);
-
-	return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
-	int err = 0;
-
-	if (bpl->bpl_cached_dbuf == NULL ||
-	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
-		if (bpl->bpl_cached_dbuf != NULL)
-			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-		err = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
-		    bpl, &bpl->bpl_cached_dbuf);
-		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
-		    1ULL << bpl->bpl_blockshift);
-	}
-	return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
-	uint64_t blk, off;
-	blkptr_t *bparray;
-	int err;
-
-	mutex_enter(&bpl->bpl_lock);
-
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	if (*itorp >= bpl->bpl_phys->bpl_entries) {
-		mutex_exit(&bpl->bpl_lock);
-		return (ENOENT);
-	}
-
-	blk = *itorp >> bpl->bpl_bpshift;
-	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
-	err = bplist_cache(bpl, blk);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	bparray = bpl->bpl_cached_dbuf->db_data;
-	*bp = bparray[off];
-	(*itorp)++;
-	mutex_exit(&bpl->bpl_lock);
-	return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
-	uint64_t blk, off;
-	blkptr_t *bparray;
-	int err;
-
-	ASSERT(!BP_IS_HOLE(bp));
-	mutex_enter(&bpl->bpl_lock);
-	err = bplist_hold(bpl);
-	if (err)
-		return (err);
-
-	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
-	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
-	err = bplist_cache(bpl, blk);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
-	bparray = bpl->bpl_cached_dbuf->db_data;
-	bparray[off] = *bp;
-
-	/* We never need the fill count. */
-	bparray[off].blk_fill = 0;
-
-	/* The bplist will compress better if we can leave off the checksum */
-	bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
-	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	bpl->bpl_phys->bpl_entries++;
-	bpl->bpl_phys->bpl_bytes +=
-	    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp);
-	if (bpl->bpl_havecomp) {
-		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
-		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
-	}
-	mutex_exit(&bpl->bpl_lock);
-
-	return (0);
-}
-
-/*
- * Deferred entry; will be written later by bplist_sync().
- */
-void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
-{
-	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
-
-	ASSERT(!BP_IS_HOLE(bp));
-	mutex_enter(&bpl->bpl_lock);
-	bpq->bpq_blk = *bp;
-	bpq->bpq_next = bpl->bpl_queue;
-	bpl->bpl_queue = bpq;
+	bpe->bpe_blk = *bp;
+	list_insert_tail(&bpl->bpl_list, bpe);
 	mutex_exit(&bpl->bpl_lock);
 }

 void
-bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 {
-	bplist_q_t *bpq;
+	bplist_entry_t *bpe;

 	mutex_enter(&bpl->bpl_lock);
-	while ((bpq = bpl->bpl_queue) != NULL) {
-		bpl->bpl_queue = bpq->bpq_next;
+	while (bpe = list_head(&bpl->bpl_list)) {
+		list_remove(&bpl->bpl_list, bpe);
 		mutex_exit(&bpl->bpl_lock);
-		VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
-		kmem_free(bpq, sizeof (*bpq));
+		func(arg, &bpe->bpe_blk, tx);
+		kmem_free(bpe, sizeof (*bpe));
 		mutex_enter(&bpl->bpl_lock);
 	}
 	mutex_exit(&bpl->bpl_lock);
 }
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
-	mutex_enter(&bpl->bpl_lock);
-	ASSERT3P(bpl->bpl_queue, ==, NULL);
-	VERIFY(0 == bplist_hold(bpl));
-	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
-	    bpl->bpl_object, 0, -1ULL, tx));
-	bpl->bpl_phys->bpl_entries = 0;
-	bpl->bpl_phys->bpl_bytes = 0;
-	if (bpl->bpl_havecomp) {
-		bpl->bpl_phys->bpl_comp = 0;
-		bpl->bpl_phys->bpl_uncomp = 0;
-	}
-	mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
-	int err;
-
-	mutex_enter(&bpl->bpl_lock);
-
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	*usedp = bpl->bpl_phys->bpl_bytes;
-	if (bpl->bpl_havecomp) {
-		*compp = bpl->bpl_phys->bpl_comp;
-		*uncompp = bpl->bpl_phys->bpl_uncomp;
-	}
-	mutex_exit(&bpl->bpl_lock);
-
-	if (!bpl->bpl_havecomp) {
-		uint64_t itor = 0, comp = 0, uncomp = 0;
-		blkptr_t bp;
-
-		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-			comp += BP_GET_PSIZE(&bp);
-			uncomp += BP_GET_UCSIZE(&bp);
-		}
-		if (err == ENOENT)
-			err = 0;
-		*compp = comp;
-		*uncompp = uncomp;
-	}
-
-	return (err);
-}
-
-/*
- * Return (in *dasizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t *dasizep)
-{
-	uint64_t size = 0;
-	uint64_t itor = 0;
-	blkptr_t bp;
-	int err;
-
-	/*
-	 * As an optimization, if they want the whole txg range, just
-	 * get bpl_bytes rather than iterating over the bps.
-	 */
-	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
-		mutex_enter(&bpl->bpl_lock);
-		err = bplist_hold(bpl);
-		if (err == 0)
-			*dasizep = bpl->bpl_phys->bpl_bytes;
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
-			size +=
-			    bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp);
-		}
-	}
-	if (err == ENOENT)
-		err = 0;
-	*dasizep = size;
-	return (err);
-}
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@ -0,0 +1,462 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	int size;
+
+	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+		size = BPOBJ_SIZE_V0;
+	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+		size = BPOBJ_SIZE_V1;
+	else
+		size = sizeof (bpobj_phys_t);
+
+	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+	    DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	int64_t i;
+	bpobj_t bpo;
+	dmu_object_info_t doi;
+	int epb;
+	dmu_buf_t *dbuf = NULL;
+
+	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+	mutex_enter(&bpo.bpo_lock);
+
+	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			VERIFY3U(0, ==, dmu_buf_hold(os,
+			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		bpobj_free(os, objarray[blkoff], tx);
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+	mutex_exit(&bpo.bpo_lock);
+	bpobj_close(&bpo);
+
+	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+	int err;
+
+	err = dmu_object_info(os, object, &doi);
+	if (err)
+		return (err);
+
+	bzero(bpo, sizeof (*bpo));
+	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	ASSERT(bpo->bpo_dbuf == NULL);
+	ASSERT(bpo->bpo_phys == NULL);
+	ASSERT(object != 0);
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+	bpo->bpo_os = os;
+	bpo->bpo_object = object;
+	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+
+	err = dmu_bonus_hold(bpo->bpo_os,
+	    bpo->bpo_object, bpo, &bpo->bpo_dbuf);
+	if (err)
+		return (err);
+	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+	return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+	/* Lame workaround for closing a bpobj that was never opened. */
+	if (bpo->bpo_object == 0)
+		return;
+
+	dmu_buf_rele(bpo->bpo_dbuf, bpo);
+	if (bpo->bpo_cached_dbuf != NULL)
+		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+	bpo->bpo_dbuf = NULL;
+	bpo->bpo_phys = NULL;
+	bpo->bpo_cached_dbuf = NULL;
+
+	mutex_destroy(&bpo->bpo_lock);
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+    boolean_t free)
+{
+	dmu_object_info_t doi;
+	int epb;
+	int64_t i;
+	int err = 0;
+	dmu_buf_t *dbuf = NULL;
+
+	mutex_enter(&bpo->bpo_lock);
+
+	if (free)
+		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+		blkptr_t *bparray;
+		blkptr_t *bp;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (blkptr_t);
+		blkoff = P2PHASE(i, bpo->bpo_epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+			    FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		bparray = dbuf->db_data;
+		bp = &bparray[blkoff];
+		err = func(arg, bp, tx);
+		if (err)
+			break;
+		if (free) {
+			bpo->bpo_phys->bpo_bytes -=
+			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			if (bpo->bpo_havecomp) {
+				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+			}
+			bpo->bpo_phys->bpo_num_blkptrs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		i++;
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+		    i * sizeof (blkptr_t), -1ULL, tx));
+	}
+	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	ASSERT(bpo->bpo_havecomp);
+	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+	if (err)
+		return (err);
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+		bpobj_t sublist;
+		uint64_t used_before, comp_before, uncomp_before;
+		uint64_t used_after, comp_after, uncomp_after;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+		if (err)
+			break;
+		if (free) {
+			err = bpobj_space(&sublist,
+			    &used_before, &comp_before, &uncomp_before);
+			if (err)
+				break;
+		}
+		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+		if (free) {
+			VERIFY3U(0, ==, bpobj_space(&sublist,
+			    &used_after, &comp_after, &uncomp_after));
+			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			bpo->bpo_phys->bpo_comp -= comp_before - used_after;
+			bpo->bpo_phys->bpo_uncomp -=
+			    uncomp_before - uncomp_after;
+		}
+
+		bpobj_close(&sublist);
+		if (err)
+			break;
+		if (free) {
+			err = dmu_object_free(bpo->bpo_os,
+			    objarray[blkoff], tx);
+			if (err)
+				break;
+			bpo->bpo_phys->bpo_num_subobjs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+		    bpo->bpo_phys->bpo_subobjs,
+		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
+	}
+
+out:
+	/* If there are no entries, there should be no bytes. */
+	ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+	    bpo->bpo_phys->bpo_bytes == 0);
+
+	mutex_exit(&bpo->bpo_lock);
+	return (err);
+}
+
+/*
+ * Iterate and remove the entries.  If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries.  If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+	bpobj_t subbpo;
+	uint64_t used, comp, uncomp;
+
+	ASSERT(bpo->bpo_havesubobj);
+	ASSERT(bpo->bpo_havecomp);
+
+	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+	bpobj_close(&subbpo);
+
+	if (used == 0) {
+		/* No point in having an empty subobj. */
+		bpobj_free(bpo->bpo_os, subobj, tx);
+		return;
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	if (bpo->bpo_phys->bpo_subobjs == 0) {
+		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+	}
+
+	mutex_enter(&bpo->bpo_lock);
+	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+	    sizeof (subobj), &subobj, tx);
+	bpo->bpo_phys->bpo_num_subobjs++;
+	bpo->bpo_phys->bpo_bytes += used;
+	bpo->bpo_phys->bpo_comp += comp;
+	bpo->bpo_phys->bpo_uncomp += uncomp;
+	mutex_exit(&bpo->bpo_lock);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	blkptr_t stored_bp = *bp;
+	uint64_t offset;
+	int blkoff;
+	blkptr_t *bparray;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	/* We never need the fill count. */
+	stored_bp.blk_fill = 0;
+
+	/* The bpobj will compress better if we can leave off the checksum */
+	if (!BP_GET_DEDUP(bp))
+		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+
+	mutex_enter(&bpo->bpo_lock);
+
+	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+	if (bpo->bpo_cached_dbuf == NULL ||
+	    offset < bpo->bpo_cached_dbuf->db_offset ||
+	    offset >= bpo->bpo_cached_dbuf->db_offset +
+	    bpo->bpo_cached_dbuf->db_size) {
+		if (bpo->bpo_cached_dbuf)
+			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+	bparray = bpo->bpo_cached_dbuf->db_data;
+	bparray[blkoff] = stored_bp;
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	bpo->bpo_phys->bpo_num_blkptrs++;
+	bpo->bpo_phys->bpo_bytes +=
+	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+	if (bpo->bpo_havecomp) {
+		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+	}
+	mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+	spa_t *spa;
+	uint64_t mintxg;
+	uint64_t maxtxg;
+	uint64_t used;
+	uint64_t comp;
+	uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct space_range_arg *sra = arg;
+
+	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+		sra->used += bp_get_dsize_sync(sra->spa, bp);
+		sra->comp += BP_GET_PSIZE(bp);
+		sra->uncomp += BP_GET_UCSIZE(bp);
+	}
+	return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	mutex_enter(&bpo->bpo_lock);
+
+	*usedp = bpo->bpo_phys->bpo_bytes;
+	if (bpo->bpo_havecomp) {
+		*compp = bpo->bpo_phys->bpo_comp;
+		*uncompp = bpo->bpo_phys->bpo_uncomp;
+		mutex_exit(&bpo->bpo_lock);
+		return (0);
+	} else {
+		mutex_exit(&bpo->bpo_lock);
+		return (bpobj_space_range(bpo, 0, UINT64_MAX,
+		    usedp, compp, uncompp));
+	}
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	struct space_range_arg sra = { 0 };
+	int err;
+
+	/*
+	 * As an optimization, if they want the whole txg range, just
+	 * get bpo_bytes rather than iterating over the bps.
+	 */
+	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+		return (bpobj_space(bpo, usedp, compp, uncompp));
+
+	sra.spa = dmu_objset_spa(bpo->bpo_os);
+	sra.mintxg = mintxg;
+	sra.maxtxg = maxtxg;
+
+	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+	*usedp = sra.used;
+	*compp = sra.comp;
+	*uncompp = sra.uncomp;
+	return (err);
+}
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@ -0,0 +1,157 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <util/sscanf.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+	zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+	if (prehash)
+		flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+	*objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+	    ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+	    DMU_OT_NONE, 0, tx);
+
+	return (*objectp == 0 ? ENOTSUP : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+	uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+	uint64_t one, csize;
+	int error;
+
+	error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, &one, &csize);
+	if (error)
+		return (error);
+
+	ASSERT(one == 1);
+	ASSERT(csize <= sizeof (cbuf));
+
+	error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, 1, csize, cbuf);
+	if (error)
+		return (error);
+
+	ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+
+	return (0);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+	(void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+	uint64_t csize;
+
+	csize = ddt_compress(dde->dde_phys, cbuf,
+	    sizeof (dde->dde_phys), sizeof (cbuf));
+
+	return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int error;
+
+	zap_cursor_init_serialized(&zc, os, object, *walk);
+	if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+		uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+		uint64_t csize = za.za_num_integers;
+		ASSERT(za.za_integer_length == 1);
+		error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+		    DDT_KEY_WORDS, 1, csize, cbuf);
+		ASSERT(error == 0);
+		if (error == 0) {
+			ddt_decompress(cbuf, dde->dde_phys, csize,
+			    sizeof (dde->dde_phys));
+			dde->dde_key = *(ddt_key_t *)za.za_name;
+		}
+		zap_cursor_advance(&zc);
+		*walk = zap_cursor_serialize(&zc);
+	}
+	zap_cursor_fini(&zc);
+	return (error);
+}
+
+static uint64_t
+ddt_zap_count(objset_t *os, uint64_t object)
+{
+	uint64_t count = 0;
+
+	VERIFY(zap_count(os, object, &count) == 0);
+
+	return (count);
+}
+
+const ddt_ops_t ddt_zap_ops = {
+	"zap",
+	ddt_zap_create,
+	ddt_zap_destroy,
+	ddt_zap_lookup,
+	ddt_zap_prefetch,
+	ddt_zap_update,
+	ddt_zap_remove,
+	ddt_zap_walk,
+	ddt_zap_count,
+};
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/dmu.h>
@ -32,16 +31,15 @@ uint64_t
 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
-	objset_impl_t *osi = os->os;
 	uint64_t object;
 	uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
-	    (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+	    (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
 	dnode_t *dn = NULL;
 	int restarted = B_FALSE;

-	mutex_enter(&osi->os_obj_lock);
+	mutex_enter(&os->os_obj_lock);
 	for (;;) {
-		object = osi->os_obj_next;
+		object = os->os_obj_next;
 		/*
 		 * Each time we polish off an L2 bp worth of dnodes
 		 * (2^13 objects), move to another L2 bp that's still
@ -51,14 +49,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
 		 */
 		if (P2PHASE(object, L2_dnode_count) == 0) {
 			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
-			int error = dnode_next_offset(osi->os_meta_dnode,
+			int error = dnode_next_offset(os->os_meta_dnode,
 			    DNODE_FIND_HOLE,
 			    &offset, 2, DNODES_PER_BLOCK >> 2, 0);
 			restarted = B_TRUE;
 			if (error == 0)
 				object = offset >> DNODE_SHIFT;
 		}
-		osi->os_obj_next = ++object;
+		os->os_obj_next = ++object;

 		/*
 		 * XXX We should check for an i/o error here and return
@ -66,19 +64,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
 		 * dmu_tx_assign(), but there is currently no mechanism
 		 * to do so.
 		 */
-		(void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+		(void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
 		    FTAG, &dn);
 		if (dn)
 			break;

 		if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
-			osi->os_obj_next = object - 1;
+			os->os_obj_next = object - 1;
 	}

 	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
 	dnode_rele(dn, FTAG);

-	mutex_exit(&osi->os_obj_lock);
+	mutex_exit(&os->os_obj_lock);

 	dmu_tx_add_new_object(tx, os, object);
 	return (object);
@ -94,7 +92,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
 	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
 		return (EBADF);

-	err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
 	if (err)
 		return (err);
 	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
@ -116,7 +114,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
 	if (object == DMU_META_DNODE_OBJECT)
 		return (EBADF);

-	err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
 	    FTAG, &dn);
 	if (err)
 		return (err);
@ -128,7 +126,11 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
 		return (0);
 	}

+	if (bonustype == DMU_OT_SA) {
+		nblkptr = 1;
+	} else {
 		nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	}

 	/*
 	 * If we are losing blkptrs or changing the block size this must
@ -166,7 +168,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)

 	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

-	err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
 	    FTAG, &dn);
 	if (err)
 		return (err);
@ -185,7 +187,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 	uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
 	int error;

-	error = dnode_next_offset(os->os->os_meta_dnode,
+	error = dnode_next_offset(os->os_meta_dnode,
 	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

 	*objectp = offset >> DNODE_SHIFT;
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -33,16 +32,10 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/callb.h>

-#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
-{                                                       \
-	(zb)->zb_objset = objset;                       \
-	(zb)->zb_object = object;                       \
-	(zb)->zb_level = level;                         \
-	(zb)->zb_blkid = blkid;                         \
-}
-
 struct prefetch_data {
 	kmutex_t pd_mtx;
 	kcondvar_t pd_cv;
@ -68,27 +61,28 @@ static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object);

 /* ARGSUSED */
-static void
+static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	struct traverse_data *td = arg;
 	zbookmark_t zb;

 	if (bp->blk_birth == 0)
-		return;
+		return (0);

 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
-		return;
+		return (0);

-	zb.zb_objset = td->td_objset;
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
-	VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
+
+	return (0);
 }

 /* ARGSUSED */
-static void
+static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	struct traverse_data *td = arg;
@ -99,17 +93,18 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 		zbookmark_t zb;

 		if (bp->blk_birth == 0)
-			return;
+			return (0);

 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
-			return;
+			return (0);

-		zb.zb_objset = td->td_objset;
-		zb.zb_object = lr->lr_foid;
-		zb.zb_level = BP_GET_LEVEL(bp);
-		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
-		VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
+		    td->td_arg);
 	}
+	return (0);
 }

 static void
@ -120,7 +115,7 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh)

 	/*
 	 * We only want to visit blocks that have been claimed but not yet
-	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 * replayed; plus, in read-only mode, blocks that are already stable.
 	 */
 	if (claim_txg == 0 && spa_writeable(td->td_spa))
 		return;
@ -138,12 +133,14 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
 {
 	zbookmark_t czb;
-	int err = 0;
+	int err = 0, lasterr = 0;
 	arc_buf_t *buf = NULL;
 	struct prefetch_data *pd = td->td_pfd;
+	boolean_t hard = td->td_flags & TRAVERSE_HARD;

 	if (bp->blk_birth == 0) {
-		err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
+		    td->td_arg);
 		return (err);
 	}

@ -163,7 +160,8 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 	}

 	if (td->td_flags & TRAVERSE_PRE) {
-		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+		    td->td_arg);
 		if (err)
 			return (err);
 	}
@ -174,7 +172,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

-		err = arc_read(NULL, td->td_spa, bp, pbuf,
+		err = dsl_read(NULL, td->td_spa, bp, pbuf,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
@ -187,15 +185,18 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
-			if (err)
+			if (err) {
+				if (!hard)
 					break;
+				lasterr = err;
+			}
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		uint32_t flags = ARC_WAIT;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

-		err = arc_read(NULL, td->td_spa, bp, pbuf,
+		err = dsl_read(NULL, td->td_spa, bp, pbuf,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
@ -203,18 +204,21 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,

 		/* recursively visitbp() blocks below this */
 		dnp = buf->b_data;
-		for (i = 0; i < epb && err == 0; i++, dnp++) {
+		for (i = 0; i < epb; i++, dnp++) {
 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
 			    zb->zb_blkid * epb + i);
-			if (err)
+			if (err) {
+				if (!hard)
 					break;
+				lasterr = err;
+			}
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		uint32_t flags = ARC_WAIT;
 		objset_phys_t *osp;
 		dnode_phys_t *dnp;

-		err = arc_read_nolock(NULL, td->td_spa, bp,
+		err = dsl_read_nolock(NULL, td->td_spa, bp,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
@ -224,12 +228,21 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 		traverse_zil(td, &osp->os_zil_header);

 		dnp = &osp->os_meta_dnode;
-		err = traverse_dnode(td, dnp, buf, zb->zb_objset, 0);
+		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+		    DMU_META_DNODE_OBJECT);
+		if (err && hard) {
+			lasterr = err;
+			err = 0;
+		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			dnp = &osp->os_userused_dnode;
 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
 			    DMU_USERUSED_OBJECT);
 		}
+		if (err && hard) {
+			lasterr = err;
+			err = 0;
+		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			dnp = &osp->os_groupused_dnode;
 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
@ -240,33 +253,52 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);

-	if (err == 0 && (td->td_flags & TRAVERSE_POST))
-		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
+		    td->td_arg);
+	}

-	return (err);
+	return (err != 0 ? err : lasterr);
 }

 static int
 traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object)
 {
-	int j, err = 0;
+	int j, err = 0, lasterr = 0;
 	zbookmark_t czb;
+	boolean_t hard = (td->td_flags & TRAVERSE_HARD);

 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		err = traverse_visitbp(td, dnp, buf,
 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
-		if (err)
+		if (err) {
+			if (!hard)
 				break;
+			lasterr = err;
 		}
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		SET_BOOKMARK(&czb, objset,
+		    object, 0, DMU_SPILL_BLKID);
+		err = traverse_visitbp(td, dnp, buf,
+		    (blkptr_t *)&dnp->dn_spill, &czb);
+		if (err) {
+			if (!hard)
 				return (err);
+			lasterr = err;
+		}
+	}
+	return (err != 0 ? err : lasterr);
 }

 /* ARGSUSED */
 static int
-traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
-    const dnode_phys_t *dnp, void *arg)
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
+    void *arg)
 {
 	struct prefetch_data *pfd = arg;
 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
@ -276,7 +308,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 		return (EINTR);

 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
+	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 		return (0);

 	mutex_enter(&pfd->pd_mtx);
@ -286,7 +319,7 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);

-	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 	    &aflags, zb);
@ -305,7 +338,8 @@ traverse_prefetch_thread(void *arg)
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;

-	SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+	SET_BOOKMARK(&czb, td.td_objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);

 	mutex_enter(&td_main->td_pfd->pd_mtx);
@ -349,7 +383,8 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
 	    &td, TQ_NOQUEUE))
 		pd.pd_exited = B_TRUE;

-	SET_BOOKMARK(&czb, objset, 0, -1, 0);
+	SET_BOOKMARK(&czb, objset,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);

 	mutex_enter(&pd.pd_mtx);
@ -381,45 +416,61 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 */
 int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
 {
-	int err;
+	int err, lasterr = 0;
 	uint64_t obj;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
+	boolean_t hard = (flags & TRAVERSE_HARD);

 	/* visit the MOS */
 	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
-	    0, TRAVERSE_PRE, func, arg);
+	    txg_start, flags, func, arg);
 	if (err)
 		return (err);

 	/* visit each dataset */
-	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+	for (obj = 1; err == 0 || (err != ESRCH && hard);
+	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
 		dmu_object_info_t doi;

 		err = dmu_object_info(mos, obj, &doi);
-		if (err)
+		if (err) {
+			if (!hard)
 				return (err);
+			lasterr = err;
+			continue;
+		}

 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
+			uint64_t txg = txg_start;
+
 			rw_enter(&dp->dp_config_rwlock, RW_READER);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			rw_exit(&dp->dp_config_rwlock);
-			if (err)
+			if (err) {
+				if (!hard)
 					return (err);
-			err = traverse_dataset(ds,
-			    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
-			    func, arg);
+				lasterr = err;
+				continue;
+			}
+			if (ds->ds_phys->ds_prev_snap_txg > txg)
+				txg = ds->ds_phys->ds_prev_snap_txg;
+			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
-			if (err)
+			if (err) {
+				if (!hard)
 					return (err);
+				lasterr = err;
+			}
 		}
 	}
 	if (err == ESRCH)
 		err = 0;
-	return (err);
+	return (err != 0 ? err : lasterr);
 }

 #if defined(_KERNEL) && defined(HAVE_SPL)
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/dmu.h>
@ -33,7 +32,10 @@
 #include <sys/dsl_pool.h>
 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
 #include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/zfs_context.h>
+#include <sys/varargs.h>

 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);
@ -48,6 +50,8 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 		tx->tx_pool = dd->dd_pool;
 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
 	    offsetof(dmu_tx_hold_t, txh_node));
+	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+	    offsetof(dmu_tx_callback_t, dcb_node));
 #ifdef ZFS_DEBUG
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
@ -58,9 +62,9 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 dmu_tx_t *
 dmu_tx_create(objset_t *os)
 {
-	dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir);
+	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
 	tx->tx_objset = os;
-	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
+	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
 	return (tx);
 }

@ -98,7 +102,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 	int err;

 	if (object != DMU_NEW_OBJECT) {
-		err = dnode_hold(os->os, object, tx, &dn);
+		err = dnode_hold(os, object, tx, &dn);
 		if (err) {
 			tx->tx_err = err;
 			return (NULL);
@ -161,38 +165,47 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 }

 static void
-dmu_tx_count_indirects(dmu_tx_hold_t *txh, dmu_buf_impl_t *db,
-    boolean_t freeable, dmu_buf_impl_t **history)
+dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
+    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 {
-	int i = db->db_level + 1;
-	dnode_t *dn = db->db_dnode;
+	objset_t *os = dn->dn_objset;
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	dmu_buf_impl_t *parent = NULL;
+	blkptr_t *bp = NULL;
+	uint64_t space;

-	if (i >= dn->dn_nlevels)
+	if (level >= dn->dn_nlevels || history[level] == blkid)
 		return;

-	db = db->db_parent;
-	if (db == NULL) {
-		uint64_t lvls = dn->dn_nlevels - i;
+	history[level] = blkid;

-		txh->txh_space_towrite += lvls << dn->dn_indblkshift;
-		return;
+	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
+
+	if (db == NULL || db == dn->dn_dbuf) {
+		ASSERT(level != 0);
+		db = NULL;
+	} else {
+		ASSERT(db->db_dnode == dn);
+		ASSERT(db->db_level == level);
+		ASSERT(db->db.db_size == space);
+		ASSERT(db->db_blkid == blkid);
+		bp = db->db_blkptr;
+		parent = db->db_parent;
 	}

-	if (db != history[i]) {
-		dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-		uint64_t space = 1ULL << dn->dn_indblkshift;
+	freeable = (bp && (freeable ||
+	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));

-		freeable = (db->db_blkptr && (freeable ||
-		    dsl_dataset_block_freeable(ds, db->db_blkptr->blk_birth)));
 	if (freeable)
 		txh->txh_space_tooverwrite += space;
 	else
 		txh->txh_space_towrite += space;
-		if (db->db_blkptr)
-			txh->txh_space_tounref += space;
-		history[i] = db;
-		dmu_tx_count_indirects(txh, db, freeable, history);
-	}
+	if (bp)
+		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+
+	dmu_tx_count_twig(txh, dn, parent, level + 1,
+	    blkid >> epbs, freeable, history);
 }

 /* ARGSUSED */
@ -203,6 +216,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	uint64_t start, end, i;
 	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 	int err = 0;
+	int l;

 	if (len == 0)
 		return;
@ -213,7 +227,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	max_ibs = DN_MAX_INDBLKSHIFT;

 	if (dn) {
-		dmu_buf_impl_t *last[DN_MAX_LEVELS];
+		uint64_t history[DN_MAX_LEVELS];
 		int nlvls = dn->dn_nlevels;
 		int delta;

@ -289,29 +303,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 		 * If this write is not off the end of the file
 		 * we need to account for overwrites/unref.
 		 */
-		if (start <= dn->dn_maxblkid)
-			bzero(last, sizeof (dmu_buf_impl_t *) * DN_MAX_LEVELS);
+		if (start <= dn->dn_maxblkid) {
+			for (l = 0; l < DN_MAX_LEVELS; l++)
+				history[l] = -1ULL;
+		}
 		while (start <= dn->dn_maxblkid) {
-			spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
-			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 			dmu_buf_impl_t *db;

 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			db = dbuf_hold_level(dn, 0, start, FTAG);
+			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
-			if (db->db_blkptr && dsl_dataset_block_freeable(ds,
-			    db->db_blkptr->blk_birth)) {
-				dprintf_bp(db->db_blkptr, "can free old%s", "");
-				txh->txh_space_tooverwrite += dn->dn_datablksz;
-				txh->txh_space_tounref += dn->dn_datablksz;
-				dmu_tx_count_indirects(txh, db, TRUE, last);
-			} else {
-				txh->txh_space_towrite += dn->dn_datablksz;
-				if (db->db_blkptr)
-					txh->txh_space_tounref +=
-					    bp_get_dasize(spa, db->db_blkptr);
-				dmu_tx_count_indirects(txh, db, FALSE, last);
+
+			if (err) {
+				txh->txh_tx->tx_err = err;
+				return;
 			}
+
+			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
+			    history);
 			dbuf_rele(db, FTAG);
 			if (++start > end) {
 				/*
@ -376,13 +385,13 @@ static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
 	dnode_t *dn = txh->txh_dnode;
-	dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode;
+	dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
 	uint64_t space = mdn->dn_datablksz +
 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);

 	if (dn && dn->dn_dbuf->db_blkptr &&
 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-	    dn->dn_dbuf->db_blkptr->blk_birth)) {
+	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 		txh->txh_space_tooverwrite += space;
 		txh->txh_space_tounref += space;
 	} else {
@ -427,7 +436,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 	 * The struct_rwlock protects us against dn_nlevels
 	 * changing, in case (against all odds) we manage to dirty &
 	 * sync out the changes after we check for being dirty.
-	 * Also, dbuf_hold_level() wants us to have the struct_rwlock.
+	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@ -457,9 +466,9 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
 			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 			bp += blkid + i;
-			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 				dprintf_bp(bp, "can free old%s", "");
-				space += bp_get_dasize(spa, bp);
+				space += bp_get_dsize(spa, bp);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
@ -515,14 +524,22 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);

-		dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
-
-		txh->txh_memory_tohold += dbuf->db.db_size;
-		if (txh->txh_memory_tohold > DMU_MAX_ACCESS) {
-			txh->txh_tx->tx_err = E2BIG;
-			dbuf_rele(dbuf, FTAG);
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+		if (err) {
+			txh->txh_tx->tx_err = err;
 			break;
 		}
+
+		txh->txh_memory_tohold += dbuf->db.db_size;
+
+		/*
+		 * We don't check memory_tohold against DMU_MAX_ACCESS because
+		 * memory_tohold is an over-estimation (especially the >L1
+		 * indirect blocks), so it could fail.  Callers should have
+		 * already verified that they will not be holding too much
+		 * memory.
+		 */
+
 		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 		if (err != 0) {
 			txh->txh_tx->tx_err = err;
@ -534,9 +551,10 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 		bp += blkoff;

 		for (i = 0; i < tochk; i++) {
-			if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+			if (dsl_dataset_block_freeable(ds, &bp[i],
+			    bp[i].blk_birth)) {
 				dprintf_bp(&bp[i], "can free old%s", "");
-				space += bp_get_dasize(spa, &bp[i]);
+				space += bp_get_dsize(spa, &bp[i]);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
@ -581,6 +599,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 	if (len != DMU_OBJECT_END)
 		dmu_tx_count_write(txh, off+len, 1);

+	dmu_tx_count_dnode(txh);
+
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
@ -623,7 +643,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 		}
 	}

-	dmu_tx_count_dnode(txh);
 	dmu_tx_count_free(txh, off, len);
 }

@ -673,6 +692,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 		 * the size will change between now and the dbuf dirty call.
 		 */
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    &dn->dn_phys->dn_blkptr[0],
 		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
 			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 		} else {
@ -688,7 +708,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 		 * access the name in this fat-zap so that we'll check
 		 * for i/o errors to the leaf blocks, etc.
 		 */
-		err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 		    8, 0, NULL);
 		if (err == EIO) {
 			tx->tx_err = err;
@ -696,7 +716,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 		}
 	}

-	err = zap_count_write(&dn->dn_objset->os, dn->dn_object, name, add,
+	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);

 	/*
@ -771,7 +791,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 	dnode_t *dn = db->db_dnode;

 	ASSERT(tx->tx_txg != 0);
-	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os);
+	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 	ASSERT3U(dn->dn_object, ==, db->db.db_object);

 	if (tx->tx_anyobj)
@ -808,10 +828,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 					match_offset = TRUE;
 				/*
 				 * We will let this hold work for the bonus
-				 * buffer so that we don't need to hold it
-				 * when creating a new object.
+				 * or spill buffer so that we don't need to
+				 * hold it when creating a new object.
 				 */
-				if (blkid == DB_BONUS_BLKID)
+				if (blkid == DMU_BONUS_BLKID ||
+				    blkid == DMU_SPILL_BLKID)
 					match_offset = TRUE;
 				/*
 				 * They might have to increase nlevels,
@ -832,8 +853,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 				    txh->txh_arg2 == DMU_OBJECT_END))
 					match_offset = TRUE;
 				break;
+			case THT_SPILL:
+				if (blkid == DMU_SPILL_BLKID)
+					match_offset = TRUE;
+				break;
 			case THT_BONUS:
-				if (blkid == DB_BONUS_BLKID)
+				if (blkid == DMU_BONUS_BLKID)
 					match_offset = TRUE;
 				break;
 			case THT_ZAP:
@ -931,7 +956,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 	 * assume that we won't be able to free or overwrite anything.
 	 */
 	if (tx->tx_objset &&
-	    dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 	    tx->tx_lastsnap_txg) {
 		towrite += tooverwrite;
 		tooverwrite = tofree = 0;
@ -1112,8 +1137,13 @@ dmu_tx_commit(dmu_tx_t *tx)
 	if (tx->tx_tempreserve_cookie)
 		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

+	if (!list_is_empty(&tx->tx_callbacks))
+		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
 	if (tx->tx_anyobj == FALSE)
 		txg_rele_to_sync(&tx->tx_txgh);
+
+	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
@ -1142,6 +1172,14 @@ dmu_tx_abort(dmu_tx_t *tx)
 		if (dn != NULL)
 			dnode_rele(dn, tx);
 	}
+
+	/*
+	 * Call any registered callbacks with an error code.
+	 */
+	if (!list_is_empty(&tx->tx_callbacks))
+		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
+
+	list_destroy(&tx->tx_callbacks);
 	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	refcount_destroy_many(&tx->tx_space_written,
@ -1159,6 +1197,172 @@ dmu_tx_get_txg(dmu_tx_t *tx)
 	return (tx->tx_txg);
 }

+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+	dmu_tx_callback_t *dcb;
+
+	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+	dcb->dcb_func = func;
+	dcb->dcb_data = data;
+
+	list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+	dmu_tx_callback_t *dcb;
+
+	while (dcb = list_head(cb_list)) {
+		list_remove(cb_list, dcb);
+		dcb->dcb_func(dcb->dcb_data, error);
+		kmem_free(dcb, sizeof (dmu_tx_callback_t));
+	}
+}
+
+/*
+ * Interface to hold a bunch of attributes.
+ * used for creating new files.
+ * attrsize is the total size of all attributes
+ * to be added during object creation
+ *
+ * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * hold necessary attribute name for attribute registration.
+ * should be a very rare case where this is needed.  If it does
+ * happen it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+	int i;
+
+	if (!sa->sa_need_attr_registration)
+		return;
+
+	for (i = 0; i != sa->sa_num_attrs; i++) {
+		if (!sa->sa_attr_table[i].sa_registered) {
+			if (sa->sa_reg_attr_obj)
+				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+			else
+				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+		}
+	}
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+	dnode_t *dn;
+	dmu_tx_hold_t *txh;
+	blkptr_t *bp;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+	    THT_SPILL, 0, 0);
+
+	dn = txh->txh_dnode;
+
+	if (dn == NULL)
+		return;
+
+	/* If blkptr doesn't exist then add space to towrite */
+	bp = &dn->dn_phys->dn_spill;
+	if (BP_IS_HOLE(bp)) {
+		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+		txh->txh_space_tounref = 0;
+	} else {
+		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    bp, bp->blk_birth))
+			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+		else
+			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+		if (bp->blk_birth)
+			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+	}
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+	else {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+		return;
+
+	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+	    THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attribute
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
+ *
+ * variable_size is the total size of all variable sized attributes
+ * passed to this function.  It is not the total size of all
+ * variable size attributes that *may* exist on this object.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+	uint64_t object;
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	ASSERT(hdl != NULL);
+
+	object = sa_handle_object(hdl);
+
+	dmu_tx_hold_bonus(tx, object);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+	if (sa->sa_force_spill || may_grow || hdl->sa_spill ||
+	    ((dmu_buf_impl_t *)hdl->sa_bonus)->db_dnode->dn_have_spill) {
+		ASSERT(tx->tx_txg == 0);
+		dmu_tx_hold_spill(tx, object);
+	}
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dmu_tx_create);
 EXPORT_SYMBOL(dmu_tx_hold_write);
@ -1170,4 +1374,9 @@ EXPORT_SYMBOL(dmu_tx_assign);
 EXPORT_SYMBOL(dmu_tx_wait);
 EXPORT_SYMBOL(dmu_tx_commit);
 EXPORT_SYMBOL(dmu_tx_get_txg);
+EXPORT_SYMBOL(dmu_tx_callback_register);
+EXPORT_SYMBOL(dmu_tx_do_callbacks);
+EXPORT_SYMBOL(dmu_tx_hold_spill);
+EXPORT_SYMBOL(dmu_tx_hold_sa_create);
+EXPORT_SYMBOL(dmu_tx_hold_sa);
 #endif
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@ -19,18 +19,17 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

-
-
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/dmu.h>
 #include <sys/dbuf.h>
+#include <sys/kstat.h>

 /*
 * I'm against tune-ables, but these should probably exist as tweakable globals
@ -59,6 +58,41 @@ static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
 static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
 static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);

+typedef struct zfetch_stats {
+	kstat_named_t zfetchstat_hits;
+	kstat_named_t zfetchstat_misses;
+	kstat_named_t zfetchstat_colinear_hits;
+	kstat_named_t zfetchstat_colinear_misses;
+	kstat_named_t zfetchstat_stride_hits;
+	kstat_named_t zfetchstat_stride_misses;
+	kstat_named_t zfetchstat_reclaim_successes;
+	kstat_named_t zfetchstat_reclaim_failures;
+	kstat_named_t zfetchstat_stream_resets;
+	kstat_named_t zfetchstat_stream_noresets;
+	kstat_named_t zfetchstat_bogus_streams;
+} zfetch_stats_t;
+
+static zfetch_stats_t zfetch_stats = {
+	{ "hits",			KSTAT_DATA_UINT64 },
+	{ "misses",			KSTAT_DATA_UINT64 },
+	{ "colinear_hits",		KSTAT_DATA_UINT64 },
+	{ "colinear_misses",		KSTAT_DATA_UINT64 },
+	{ "stride_hits",		KSTAT_DATA_UINT64 },
+	{ "stride_misses",		KSTAT_DATA_UINT64 },
+	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
+	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
+	{ "streams_resets",		KSTAT_DATA_UINT64 },
+	{ "streams_noresets",		KSTAT_DATA_UINT64 },
+	{ "bogus_streams",		KSTAT_DATA_UINT64 },
+};
+
+#define	ZFETCHSTAT_INCR(stat, val) \
+	atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
+
+#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1);
+
+kstat_t		*zfetch_ksp;
+
 /*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
@ -192,7 +226,30 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
 			break;
 	}
 	zs->zst_ph_offset = prefetch_tail;
-	zs->zst_last = lbolt;
+	zs->zst_last = ddi_get_lbolt();
+}
+
+void
+zfetch_init(void)
+{
+
+	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (zfetch_ksp != NULL) {
+		zfetch_ksp->ks_data = &zfetch_stats;
+		kstat_install(zfetch_ksp);
+	}
+}
+
+void
+zfetch_fini(void)
+{
+	if (zfetch_ksp != NULL) {
+		kstat_delete(zfetch_ksp);
+		zfetch_ksp = NULL;
+	}
 }

 /*
@ -298,7 +355,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
 }

 /*
- * given a zfetch and a zsearch structure, see if there is an associated zstream
+ * given a zfetch and a zstream structure, see if there is an associated zstream
 * for this block read.  If so, it starts a prefetch for the stream it
 * located and returns true, otherwise it returns false
 */
@ -330,6 +387,7 @@ top:
 		 */
 		if (zs->zst_len == 0) {
 			/* bogus stream */
+			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
 			continue;
 		}

@ -339,9 +397,14 @@ top:
 		 */
 		if (zh->zst_offset >= zs->zst_offset &&
 		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
+			if (prefetched) {
 				/* already fetched */
+				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
 				rc = 1;
 				goto out;
+			} else {
+				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
+			}
 		}

 		/*
@ -446,6 +509,7 @@ top:
 		if (reset) {
 			zstream_t *remove = zs;

+			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
 			rc = 0;
 			mutex_exit(&zs->zst_lock);
 			rw_exit(&zf->zf_rwlock);
@ -464,6 +528,7 @@ top:
 				}
 			}
 		} else {
+			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
 			rc = 1;
 			dmu_zfetch_dofetch(zf, zs);
 			mutex_exit(&zs->zst_lock);
@ -500,7 +565,6 @@ dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)

 	list_insert_head(&zf->zf_stream, zs);
 	zf->zf_stream_cnt++;
-
 	return (1);
 }

@ -520,7 +584,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf)
 	for (zs = list_head(&zf->zf_stream); zs;
 	    zs = list_next(&zf->zf_stream, zs)) {

-		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
+		if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
 			break;
 	}

@ -604,8 +668,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 	    P2ALIGN(offset, blksz)) >> blkshft;

 	fetched = dmu_zfetch_find(zf, &zst, prefetched);
-	if (!fetched) {
-		fetched = dmu_zfetch_colinear(zf, &zst);
+	if (fetched) {
+		ZFETCHSTAT_BUMP(zfetchstat_hits);
+	} else {
+		ZFETCHSTAT_BUMP(zfetchstat_misses);
+		if (fetched = dmu_zfetch_colinear(zf, &zst)) {
+			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
+		} else {
+			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
+		}
 	}

 	if (!fetched) {
@ -615,11 +686,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 		 * we still couldn't find a stream, drop the lock, and allocate
 		 * one if possible.  Otherwise, give up and go home.
 		 */
-		if (newstream == NULL) {
+		if (newstream) {
+			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
+		} else {
 			uint64_t	maxblocks;
 			uint32_t	max_streams;
 			uint32_t	cur_streams;

+			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
 			cur_streams = zf->zf_stream_cnt;
 			maxblocks = zf->zf_dnode->dn_maxblkid;

@ -632,7 +706,6 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 			if (cur_streams >= max_streams) {
 				return;
 			}
-
 			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
 		}

@ -642,7 +715,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
 		newstream->zst_cap = zst.zst_len;
 		newstream->zst_direction = ZFETCH_FORWARD;
-		newstream->zst_last = lbolt;
+		newstream->zst_last = ddi_get_lbolt();

 		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -232,6 +231,11 @@ dnode_byteswap(dnode_phys_t *dnp)
 		ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
 		dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
 	}
+
+	/* Swap SPILL block if we have one */
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+		byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
 }

 void
@ -280,6 +284,27 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 	rw_exit(&dn->dn_struct_rwlock);
 }

+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dn->dn_bonustype = newtype;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	dnode_setdirty(dn, tx);
+	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+	dn->dn_have_spill = B_FALSE;
+}
+
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
@ -294,7 +319,7 @@ dnode_setdblksz(dnode_t *dn, int size)
 }

 static dnode_t *
-dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
    uint64_t object)
 {
 	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
@ -340,6 +365,9 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 	dn->dn_dirtyctx_firstset = NULL;
 	dn->dn_bonus = NULL;
 	dn->dn_zio = NULL;
+	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+	dn->dn_id_flags = 0;
+
 	dmu_zfetch_init(&dn->dn_zfetch, dn);

 	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
@ -354,7 +382,7 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 static void
 dnode_destroy(dnode_t *dn)
 {
-	objset_impl_t *os = dn->dn_objset;
+	objset_t *os = dn->dn_objset;

 #ifdef ZFS_DEBUG
 	int i;
@ -366,7 +394,7 @@ dnode_destroy(dnode_t *dn)
 	}
 	ASSERT(NULL == list_head(&dn->dn_dbufs));
 #endif
-	ASSERT(dn->dn_oldphys == NULL);
+	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);

 	mutex_enter(&os->os_lock);
 	list_remove(&os->os_dnodes, dn);
@ -414,6 +442,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 	ASSERT(ot != DMU_OT_NONE);
 	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
 	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
@ -429,6 +458,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 		ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
+		ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
+		ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
 		ASSERT3U(dn->dn_next_blksz[i], ==, 0);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@ -439,7 +470,11 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 	dnode_setdblksz(dn, blocksize);
 	dn->dn_indblkshift = ibs;
 	dn->dn_nlevels = 1;
-	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+		dn->dn_nblkptr = 1;
+	else
+		dn->dn_nblkptr = 1 +
+		    ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@ -453,10 +488,12 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 	}

 	dn->dn_allocated_txg = tx->tx_txg;
+	dn->dn_id_flags = 0;

 	dnode_setdirty(dn, tx);
 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 }

@ -472,13 +509,16 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 	ASSERT(tx->tx_txg != 0);
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
-	    (bonustype != DMU_OT_NONE && bonuslen != 0));
+	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+	    (bonustype == DMU_OT_SA && bonuslen == 0));
 	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);

 	/* clean up any unreferenced dbufs */
 	dnode_evict_dbufs(dn);

+	dn->dn_id_flags = 0;
+
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_setdirty(dn, tx);
 	if (dn->dn_datablksz != blocksize) {
@ -491,9 +531,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 	}
 	if (dn->dn_bonuslen != bonuslen)
 		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+
+	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+		nblkptr = 1;
+	else
 		nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	if (dn->dn_bonustype != bonustype)
+		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
 	if (dn->dn_nblkptr != nblkptr)
 		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		dbuf_rm_spill(dn, tx);
+		dnode_rm_spill(dn, tx);
+	}
 	rw_exit(&dn->dn_struct_rwlock);

 	/* change type */
@ -534,7 +584,7 @@ dnode_special_close(dnode_t *dn)
 }

 dnode_t *
-dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object)
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
 {
 	dnode_t *dn = dnode_create(os, dnp, NULL, object);
 	DNODE_VERIFY(dn);
@ -583,7 +633,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
 * succeeds even for free dnodes.
 */
 int
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag,
    void *tag, dnode_t **dnp)
 {
 	int epb, idx, err;
@ -596,9 +646,14 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,

 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
-	 * be asking the DMU to do *anything*.
+	 * be asking the DMU to do *anything* unless it's the root pool
+	 * which may require us to read from the root filesystem while
+	 * holding some (not all) of the locks as writer.
 	 */
-	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+	    (spa_is_root(os->os_spa) &&
+	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER) &&
+	    !spa_config_held(os->os_spa, SCL_ZIO, RW_WRITER)));

 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
 		dn = (object == DMU_USERUSED_OBJECT) ?
@ -675,7 +730,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
 	if (dn->dn_free_txg ||
 	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
 	    ((flag & DNODE_MUST_BE_FREE) &&
-	    (type != DMU_OT_NONE || dn->dn_oldphys))) {
+	    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
 		mutex_exit(&dn->dn_mtx);
 		dbuf_rele(db, FTAG);
 		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
@ -698,7 +753,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
 * Return held dnode if the object is allocated, NULL if not.
 */
 int
-dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
 }
@ -737,7 +792,7 @@ dnode_rele(dnode_t *dn, void *tag)
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
-	objset_impl_t *os = dn->dn_objset;
+	objset_t *os = dn->dn_objset;
 	uint64_t txg = tx->tx_txg;

 	if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
@ -754,6 +809,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 	mutex_exit(&dn->dn_mtx);
 #endif

+	/*
+	 * Determine old uid/gid when necessary
+	 */
+	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
 	mutex_enter(&os->os_lock);

 	/*
@ -768,6 +828,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 	ASSERT(dn->dn_datablksz != 0);
 	ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
 	ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+	ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);

 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
 	    dn->dn_object, txg);
@ -862,7 +923,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);

-		if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
+		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+		    db->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
 			goto fail;
 		}
@ -906,7 +968,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
 	int epbs, new_nlevels;
 	uint64_t sz;

-	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(blkid != DMU_BONUS_BLKID);

 	ASSERT(have_read ?
 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
@ -953,6 +1015,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)

 		/* dirty the left indirects */
 		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+		ASSERT(db != NULL);
 		new = dbuf_dirty(db, tx);
 		dbuf_rele(db, FTAG);

@ -963,7 +1026,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
 		for (dr = list_head(list); dr; dr = dr_next) {
 			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
 			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
-			    dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+			    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
 				list_remove(&dn->dn_dirty_records[txgoff], dr);
 				list_insert_tail(&new->dt.di.dr_children, dr);
@ -1218,6 +1282,20 @@ out:
 	rw_exit(&dn->dn_struct_rwlock);
 }

+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+	int i;
+
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+			break;
+	}
+	mutex_exit(&dn->dn_mtx);
+	return (i < TXG_SIZE);
+}
+
 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
@ -1226,7 +1304,7 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
 	int i;

-	if (blkid == DB_BONUS_BLKID)
+	if (blkid == DMU_BONUS_BLKID)
 		return (FALSE);

 	/*
@ -1239,6 +1317,9 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
 	if (dn->dn_free_txg)
 		return (TRUE);

+	if (blkid == DMU_SPILL_BLKID)
+		return (dnode_spill_freed(dn));
+
 	range_tofind.fr_blkid = blkid;
 	mutex_enter(&dn->dn_mtx);
 	for (i = 0; i < TXG_SIZE; i++) {
@ -1296,7 +1377,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
-	objset_impl_t *os = dn->dn_objset;
+	objset_t *os = dn->dn_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;

 	if (space > 0)
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/zfs_context.h>
@ -120,7 +119,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
 		if (BP_IS_HOLE(bp))
 			continue;

-		bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx);
+		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
 		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
 		bzero(bp, sizeof (blkptr_t));
 		blocks_freed += 1;
@ -228,7 +227,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);

-	arc_release(db->db_buf, db);
+	dbuf_release_bp(db);
 	bp = (blkptr_t *)db->db.db_data;

 	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
@ -428,6 +427,9 @@ dnode_undirty_dbufs(list_t *list)
 		dmu_buf_impl_t *db = dr->dr_dbuf;
 		uint64_t txg = dr->dr_txg;

+		if (db->db_level != 0)
+			dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
 		mutex_enter(&db->db_mtx);
 		/* XXX - use dbuf_undirty()? */
 		list_remove(list, dr);
@ -435,18 +437,12 @@ dnode_undirty_dbufs(list_t *list)
 		db->db_last_dirty = NULL;
 		db->db_dirtycnt -= 1;
 		if (db->db_level == 0) {
-			ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 			    dr->dt.dl.dr_data == db->db_buf);
 			dbuf_unoverride(dr);
-			mutex_exit(&db->db_mtx);
-		} else {
-			mutex_exit(&db->db_mtx);
-			dnode_undirty_dbufs(&dr->dt.di.dr_children);
-			mutex_destroy(&dr->dt.di.dr_mtx);
-			list_destroy(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
-		dbuf_rele(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 	}
 }

@ -497,6 +493,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 	dn->dn_maxblkid = 0;
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
+	dn->dn_have_spill = B_FALSE;
 	mutex_exit(&dn->dn_mtx);

 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@ -518,6 +515,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 	dnode_phys_t *dnp = dn->dn_phys;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	list_t *list = &dn->dn_dirty_records[txgoff];
+	boolean_t kill_spill = B_FALSE;
 	ASSERTV(static const dnode_phys_t zerodn = { 0 });

 	ASSERT(dmu_tx_is_syncing(tx));
@ -530,10 +528,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)

 	if (dmu_objset_userused_enabled(dn->dn_objset) &&
 	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
-		ASSERT(dn->dn_oldphys == NULL);
-		dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
-		*dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+		mutex_enter(&dn->dn_mtx);
+		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+		dn->dn_oldflags = dn->dn_phys->dn_flags;
 		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+		mutex_exit(&dn->dn_mtx);
+		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
 	} else {
 		/* Once we account for it, we should always account for it. */
 		ASSERT(!(dn->dn_phys->dn_flags &
@ -564,6 +564,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		    SPA_MINBLOCKSIZE) == 0);
 		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+		    avl_last(&dn->dn_ranges[txgoff]) ||
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
 		    dnp->dn_datablkszsec);
 		dnp->dn_datablkszsec =
@ -580,6 +581,24 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		dn->dn_next_bonuslen[txgoff] = 0;
 	}

+	if (dn->dn_next_bonustype[txgoff]) {
+		ASSERT(dn->dn_next_bonustype[txgoff] < DMU_OT_NUMTYPES);
+		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+		dn->dn_next_bonustype[txgoff] = 0;
+	}
+
+	/*
+	 * We will either remove a spill block when a file is being removed
+	 * or we have been asked to remove it.
+	 */
+	if (dn->dn_rm_spillblk[txgoff] ||
+	    ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+	    dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
+		if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+			kill_spill = B_TRUE;
+		dn->dn_rm_spillblk[txgoff] = 0;
+	}
+
 	if (dn->dn_next_indblkshift[txgoff]) {
 		ASSERT(dnp->dn_nlevels == 1);
 		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@ -596,6 +615,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)

 	mutex_exit(&dn->dn_mtx);

+	if (kill_spill) {
+		(void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+		mutex_enter(&dn->dn_mtx);
+		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+		mutex_exit(&dn->dn_mtx);
+	}
+
 	/* process all the "freed" ranges in the file */
 	while ((rp = avl_last(&dn->dn_ranges[txgoff]))) {
 		dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@ -0,0 +1,474 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+	const dsl_deadlist_entry_t *dle1 = arg1;
+	const dsl_deadlist_entry_t *dle2 = arg2;
+
+	if (dle1->dle_mintxg < dle2->dle_mintxg)
+		return (-1);
+	else if (dle1->dle_mintxg > dle2->dle_mintxg)
+		return (+1);
+	else
+		return (0);
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	ASSERT(!dl->dl_oldfmt);
+	if (dl->dl_havetree)
+		return;
+
+	avl_create(&dl->dl_tree, dsl_deadlist_compare,
+	    sizeof (dsl_deadlist_entry_t),
+	    offsetof(dsl_deadlist_entry_t, dle_node));
+	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+		dle->dle_mintxg = strtonum(za.za_name, NULL);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+		    za.za_first_integer));
+		avl_add(&dl->dl_tree, dle);
+	}
+	zap_cursor_fini(&zc);
+	dl->dl_havetree = B_TRUE;
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+
+	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+	dl->dl_os = os;
+	dl->dl_object = object;
+	VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+	dmu_object_info_from_db(dl->dl_dbuf, &doi);
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		dmu_buf_rele(dl->dl_dbuf, dl);
+		dl->dl_dbuf = NULL;
+		dl->dl_oldfmt = B_TRUE;
+		VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+		return;
+	}
+
+	dl->dl_oldfmt = B_FALSE;
+	dl->dl_phys = dl->dl_dbuf->db_data;
+	dl->dl_havetree = B_FALSE;
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+	void *cookie = NULL;
+	dsl_deadlist_entry_t *dle;
+
+	if (dl->dl_oldfmt) {
+		dl->dl_oldfmt = B_FALSE;
+		bpobj_close(&dl->dl_bpobj);
+		return;
+	}
+
+	if (dl->dl_havetree) {
+		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+		    != NULL) {
+			bpobj_close(&dle->dle_bpobj);
+			kmem_free(dle, sizeof (*dle));
+		}
+		avl_destroy(&dl->dl_tree);
+	}
+	dmu_buf_rele(dl->dl_dbuf, dl);
+	mutex_destroy(&dl->dl_lock);
+	dl->dl_dbuf = NULL;
+	dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+		return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+	    sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+	dmu_object_info_t doi;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_free(os, dlobj, tx);
+		return;
+	}
+
+	for (zap_cursor_init(&zc, os, dlobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc))
+		bpobj_free(os, za.za_first_integer, tx);
+	zap_cursor_fini(&zc);
+	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+		return;
+	}
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	mutex_enter(&dl->dl_lock);
+	dl->dl_phys->dl_used +=
+	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+	mutex_exit(&dl->dl_lock);
+
+	dle_tofind.dle_mintxg = bp->blk_birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	else
+		dle = AVL_PREV(&dl->dl_tree, dle);
+	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	uint64_t obj;
+	dsl_deadlist_entry_t *dle;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	dsl_deadlist_load_tree(dl);
+
+	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+	dle->dle_mintxg = mintxg;
+	obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+	avl_add(&dl->dl_tree, dle);
+
+	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+	    mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle, *dle_prev;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+	dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+	bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
+	    dle->dle_bpobj.bpo_object, tx);
+
+	avl_remove(&dl->dl_tree, dle);
+	bpobj_close(&dle->dle_bpobj);
+	kmem_free(dle, sizeof (*dle));
+
+	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
+
+/*
+ * Walk ds's snapshots to regenerate generate ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_t dl;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	dsl_deadlist_open(&dl, os, dlobj);
+	if (dl.dl_oldfmt) {
+		dsl_deadlist_close(&dl);
+		return;
+	}
+
+	while (mrs_obj != 0) {
+		dsl_dataset_t *ds;
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+		dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
+		mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t *dle;
+	uint64_t newobj;
+
+	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+	if (dl->dl_oldfmt) {
+		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+		return (newobj);
+	}
+
+	dsl_deadlist_load_tree(dl);
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t obj;
+
+		if (dle->dle_mintxg >= maxtxg)
+			break;
+
+		obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+		    dle->dle_mintxg, obj, tx));
+	}
+	return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	if (dl->dl_oldfmt) {
+		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+		    usedp, compp, uncompp));
+		return;
+	}
+
+	mutex_enter(&dl->dl_lock);
+	*usedp = dl->dl_phys->dl_used;
+	*compp = dl->dl_phys->dl_comp;
+	*uncompp = dl->dl_phys->dl_uncomp;
+	mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+		    mintxg, maxtxg, usedp, compp, uncompp));
+		return;
+	}
+
+	dsl_deadlist_load_tree(dl);
+	*usedp = *compp = *uncompp = 0;
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	/*
+	 * If we don't find this mintxg, there shouldn't be anything
+	 * after it either.
+	 */
+	ASSERT(dle != NULL ||
+	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+	for (; dle && dle->dle_mintxg < maxtxg;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t used, comp, uncomp;
+
+		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
+	}
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+	uint64_t used, comp, uncomp;
+	bpobj_t bpo;
+
+	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+	bpobj_close(&bpo);
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	mutex_enter(&dl->dl_lock);
+	dl->dl_phys->dl_used += used;
+	dl->dl_phys->dl_comp += comp;
+	dl->dl_phys->dl_uncomp += uncomp;
+	mutex_exit(&dl->dl_lock);
+
+	dle_tofind.dle_mintxg = birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	dmu_buf_t *bonus;
+	dsl_deadlist_phys_t *dlp;
+	dmu_object_info_t doi;
+
+	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_t bpo;
+		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+		VERIFY3U(0, ==, bpobj_iterate(&bpo,
+		    dsl_deadlist_insert_cb, dl, tx));
+		bpobj_close(&bpo);
+		return;
+	}
+
+	for (zap_cursor_init(&zc, dl->dl_os, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		uint64_t mintxg = strtonum(za.za_name, NULL);
+		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+	}
+	zap_cursor_fini(&zc);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+	dlp = bonus->db_data;
+	dmu_buf_will_dirty(bonus, tx);
+	bzero(dlp, sizeof (*dlp));
+	dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	ASSERT(!dl->dl_oldfmt);
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+	while (dle) {
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_entry_t *dle_next;
+
+		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+		mutex_enter(&dl->dl_lock);
+		ASSERT3U(dl->dl_phys->dl_used, >=, used);
+		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+		dl->dl_phys->dl_used -= used;
+		dl->dl_phys->dl_comp -= comp;
+		dl->dl_phys->dl_uncomp -= uncomp;
+		mutex_exit(&dl->dl_lock);
+
+		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, tx));
+
+		dle_next = AVL_NEXT(&dl->dl_tree, dle);
+		avl_remove(&dl->dl_tree, dle);
+		bpobj_close(&dle->dle_bpobj);
+		kmem_free(dle, sizeof (*dle));
+		dle = dle_next;
+	}
+}
--- a/module/zfs/dsl_deleg.c
+++ b/module/zfs/dsl_deleg.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -75,8 +74,6 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/zio_checksum.h> /* for the default checksum value */
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/cred.h>
@ -150,7 +147,7 @@ dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
 }

 static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	nvlist_t *nvp = arg2;
@ -185,8 +182,8 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)

 			VERIFY(zap_update(mos, jumpobj,
 			    perm, 8, 1, &n, tx) == 0);
-			spa_history_internal_log(LOG_DS_PERM_UPDATE,
-			    dd->dd_pool->dp_spa, tx, cr,
+			spa_history_log_internal(LOG_DS_PERM_UPDATE,
+			    dd->dd_pool->dp_spa, tx,
 			    "%s %s dataset = %llu", whokey, perm,
 			    dd->dd_phys->dd_head_dataset_obj);
 		}
@ -194,7 +191,7 @@ dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 }

 static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	nvlist_t *nvp = arg2;
@ -217,8 +214,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 				(void) zap_remove(mos, zapobj, whokey, tx);
 				VERIFY(0 == zap_destroy(mos, jumpobj, tx));
 			}
-			spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
-			    dd->dd_pool->dp_spa, tx, cr,
+			spa_history_log_internal(LOG_DS_PERM_WHO_REMOVE,
+			    dd->dd_pool->dp_spa, tx,
 			    "%s dataset = %llu", whokey,
 			    dd->dd_phys->dd_head_dataset_obj);
 			continue;
@ -238,8 +235,8 @@ dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 				VERIFY(0 == zap_destroy(mos,
 				    jumpobj, tx));
 			}
-			spa_history_internal_log(LOG_DS_PERM_REMOVE,
-			    dd->dd_pool->dp_spa, tx, cr,
+			spa_history_log_internal(LOG_DS_PERM_REMOVE,
+			    dd->dd_pool->dp_spa, tx,
 			    "%s %s dataset = %llu", whokey, perm,
 			    dd->dd_phys->dd_head_dataset_obj);
 		}
@ -589,7 +586,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)

 			if (dsl_prop_get_dd(dd,
 			    zfs_prop_to_name(ZFS_PROP_ZONED),
-			    8, 1, &zoned, NULL) != 0)
+			    8, 1, &zoned, NULL, B_FALSE) != 0)
 				break;
 			if (!zoned)
 				break;
@ -739,7 +736,7 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
 boolean_t
 dsl_delegation_on(objset_t *os)
 {
-	return (os->os->os_spa->spa_delegation);
+	return (!!spa_delegation(os->os_spa));
 }

 #if defined(_KERNEL) && defined(HAVE_SPL)
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/dmu.h>
@ -32,6 +31,7 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
+#include <sys/metaslab.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
@ -39,8 +39,7 @@
 #include "zfs_namecheck.h"

 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx);
+static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx);


 /* ARGSUSED */
@ -63,8 +62,8 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
 	spa_close(dd->dd_pool->dp_spa, dd);

 	/*
-	 * The props callback list should be empty since they hold the
-	 * dir open.
+	 * The props callback list should have been cleaned up by
+	 * objset_evict().
 	 */
 	list_destroy(&dd->dd_prop_cbs);
 	mutex_destroy(&dd->dd_lock);
@ -107,6 +106,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
 		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
 		    offsetof(dsl_prop_cb_record_t, cbr_node));

+		dsl_dir_snap_cmtime_update(dd);
+
 		if (dd->dd_phys->dd_parent_obj) {
 			err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
 			    NULL, dd, &dd->dd_parent);
@ -133,6 +134,25 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 		}

+		if (dsl_dir_is_clone(dd)) {
+			dmu_buf_t *origin_bonus;
+			dsl_dataset_phys_t *origin_phys;
+
+			/*
+			 * We can't open the origin dataset, because
+			 * that would require opening this dsl_dir.
+			 * Just look at its phys directly instead.
+			 */
+			err = dmu_bonus_hold(dp->dp_meta_objset,
+			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+			if (err)
+				goto errout;
+			origin_phys = origin_bonus->db_data;
+			dd->dd_origin_txg =
+			    origin_phys->ds_creation_txg;
+			dmu_buf_rele(origin_bonus, FTAG);
+		}
+
 		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
 		    dsl_dir_evict);
 		if (winner) {
@ -392,7 +412,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t ddobj;
-	dsl_dir_phys_t *dsphys;
+	dsl_dir_phys_t *ddphys;
 	dmu_buf_t *dbuf;

 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
@ -407,17 +427,17 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 	}
 	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
-	dsphys = dbuf->db_data;
+	ddphys = dbuf->db_data;

-	dsphys->dd_creation_time = gethrestime_sec();
+	ddphys->dd_creation_time = gethrestime_sec();
 	if (pds)
-		dsphys->dd_parent_obj = pds->dd_object;
-	dsphys->dd_props_zapobj = zap_create(mos,
+		ddphys->dd_parent_obj = pds->dd_object;
+	ddphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
-	dsphys->dd_child_dir_zapobj = zap_create(mos,
+	ddphys->dd_child_dir_zapobj = zap_create(mos,
 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
-		dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 	dmu_buf_rele(dbuf, FTAG);

 	return (ddobj);
@ -427,7 +447,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 int
 dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
+	dsl_dataset_t *ds = arg1;
+	dsl_dir_t *dd = ds->ds_dir;
 	dsl_pool_t *dp = dd->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	int err;
@ -454,19 +475,27 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }

 void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
+	dsl_dataset_t *ds = arg1;
+	dsl_dir_t *dd = ds->ds_dir;
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	uint64_t val, obj;
+	dsl_prop_setarg_t psa;
+	uint64_t value = 0;
+	uint64_t obj;
 	dd_used_t t;

 	ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
 	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);

 	/* Remove our reservation. */
-	val = 0;
-	dsl_dir_set_reservation_sync(dd, &val, cr, tx);
+	dsl_prop_setarg_init_uint64(&psa, "reservation",
+	    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+	    &value);
+	psa.psa_effective_value = 0;	/* predict default value */
+
+	dsl_dir_set_reservation_sync(ds, &psa, tx);
+
 	ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
 	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
 	for (t = 0; t < DD_USED_NUM; t++)
@ -640,15 +669,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
 	if (used > quota) {
 		/* over quota */
 		myspace = 0;
-
-		/*
-		 * While it's OK to be a little over quota, if
-		 * we think we are using more space than there
-		 * is in the pool (which is already 1.6% more than
-		 * dsl_pool_adjustedsize()), something is very
-		 * wrong.
-		 */
-		ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa));
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
@ -676,8 +696,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 {
 	uint64_t txg = tx->tx_txg;
 	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
+	uint64_t deferred = 0;
 	struct tempreserve *tr;
-	int enospc = EDQUOT;
+	int retval = EDQUOT;
 	int txgidx = txg & TXG_MASK;
 	int i;
 	uint64_t ref_rsrv = 0;
@ -703,7 +724,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 	 */
 	if (first && tx->tx_objset) {
 		int error;
-		dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;
+		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

 		error = dsl_dataset_check_quota(ds, checkrefquota,
 		    asize, est_inflight, &used_on_disk, &ref_rsrv);
@ -723,7 +744,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 		quota = dd->dd_phys->dd_quota;

 	/*
-	 * Adjust the quota against the actual pool size at the root.
+	 * Adjust the quota against the actual pool size at the root
+	 * minus any outstanding deferred frees.
 	 * To ensure that it's possible to remove files from a full
 	 * pool without inducing transient overcommits, we throttle
 	 * netfree transactions against a quota that is slightly larger,
@ -732,10 +754,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 	 * removes to get through.
 	 */
 	if (dd->dd_parent == NULL) {
+		spa_t *spa = dd->dd_pool->dp_spa;
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
-		if (poolsize < quota) {
-			quota = poolsize;
-			enospc = ENOSPC;
+		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
+		if (poolsize - deferred < quota) {
+			quota = poolsize - deferred;
+			retval = ENOSPC;
 		}
 	}

@ -745,15 +769,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 	 * on-disk is over quota and there are no pending changes (which
 	 * may free up space for us).
 	 */
-	if (used_on_disk + est_inflight > quota) {
-		if (est_inflight > 0 || used_on_disk < quota)
-			enospc = ERESTART;
+	if (used_on_disk + est_inflight >= quota) {
+		if (est_inflight > 0 || used_on_disk < quota ||
+		    (retval == ENOSPC && used_on_disk < quota + deferred))
+			retval = ERESTART;
 		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 		    "quota=%lluK tr=%lluK err=%d\n",
 		    used_on_disk>>10, est_inflight>>10,
-		    quota>>10, asize>>10, enospc);
+		    quota>>10, asize>>10, retval);
 		mutex_exit(&dd->dd_lock);
-		return (enospc);
+		return (retval);
 	}

 	/* We need to up our estimated delta before dropping dd_lock */
@ -989,13 +1014,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 static int
 dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	uint64_t *quotap = arg2;
-	uint64_t new_quota = *quotap;
-	int err = 0;
+	dsl_dataset_t *ds = arg1;
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_setarg_t *psa = arg2;
+	int err;
 	uint64_t towrite;

-	if (new_quota == 0)
+	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+		return (err);
+
+	if (psa->psa_effective_value == 0)
 		return (0);

 	mutex_enter(&dd->dd_lock);
@ -1007,64 +1035,88 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	 */
 	towrite = dsl_dir_space_towrite(dd);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-	    (new_quota < dd->dd_phys->dd_reserved ||
-	    new_quota < dd->dd_phys->dd_used_bytes + towrite)) {
+	    (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
+	    psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
 		err = ENOSPC;
 	}
 	mutex_exit(&dd->dd_lock);
 	return (err);
 }

-/* ARGSUSED */
+extern dsl_syncfunc_t dsl_prop_set_sync;
+
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	uint64_t *quotap = arg2;
-	uint64_t new_quota = *quotap;
+	dsl_dataset_t *ds = arg1;
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_setarg_t *psa = arg2;
+	uint64_t effective_value = psa->psa_effective_value;
+
+	dsl_prop_set_sync(ds, psa, tx);
+	DSL_PROP_CHECK_PREDICTION(dd, psa);

 	dmu_buf_will_dirty(dd->dd_dbuf, tx);

 	mutex_enter(&dd->dd_lock);
-	dd->dd_phys->dd_quota = new_quota;
+	dd->dd_phys->dd_quota = effective_value;
 	mutex_exit(&dd->dd_lock);

-	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
-	    tx, cr, "%lld dataset = %llu ",
-	    (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj);
+	spa_history_log_internal(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
+	    tx, "%lld dataset = %llu ",
+	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }

 int
-dsl_dir_set_quota(const char *ddname, uint64_t quota)
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 {
 	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
+	dsl_prop_setarg_t psa;
 	int err;

-	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
+
+	err = dsl_dataset_hold(ddname, FTAG, &ds);
 	if (err)
 		return (err);

-	if (quota != dd->dd_phys->dd_quota) {
+	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	if (err) {
+		dsl_dataset_rele(ds, FTAG);
+		return (err);
+	}
+
+	ASSERT(ds->ds_dir == dd);
+
 	/*
-		 * If someone removes a file, then tries to set the quota, we
-		 * want to make sure the file freeing takes effect.
+	 * If someone removes a file, then tries to set the quota, we want to
+	 * make sure the file freeing takes effect.
 	 */
 	txg_wait_open(dd->dd_pool, 0);

 	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-		    dsl_dir_set_quota_sync, dd, &quota, 0);
-	}
+	    dsl_dir_set_quota_sync, ds, &psa, 0);
+
 	dsl_dir_close(dd, FTAG);
+	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }

 int
 dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	uint64_t *reservationp = arg2;
-	uint64_t new_reservation = *reservationp;
+	dsl_dataset_t *ds = arg1;
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_setarg_t *psa = arg2;
+	uint64_t effective_value;
 	uint64_t used, avail;
+	int err;
+
+	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
+		return (err);
+
+	effective_value = psa->psa_effective_value;

 	/*
 	 * If we are doing the preliminary check in open context, the
@ -1084,37 +1136,40 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
 	}

-	if (MAX(used, new_reservation) > MAX(used, dd->dd_phys->dd_reserved)) {
-		uint64_t delta = MAX(used, new_reservation) -
+	if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
+		uint64_t delta = MAX(used, effective_value) -
 		    MAX(used, dd->dd_phys->dd_reserved);

 		if (delta > avail)
 			return (ENOSPC);
 		if (dd->dd_phys->dd_quota > 0 &&
-		    new_reservation > dd->dd_phys->dd_quota)
+		    effective_value > dd->dd_phys->dd_quota)
 			return (ENOSPC);
 	}

 	return (0);
 }

-/* ARGSUSED */
 static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	uint64_t *reservationp = arg2;
-	uint64_t new_reservation = *reservationp;
+	dsl_dataset_t *ds = arg1;
+	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_setarg_t *psa = arg2;
+	uint64_t effective_value = psa->psa_effective_value;
 	uint64_t used;
 	int64_t delta;

+	dsl_prop_set_sync(ds, psa, tx);
+	DSL_PROP_CHECK_PREDICTION(dd, psa);
+
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);

 	mutex_enter(&dd->dd_lock);
 	used = dd->dd_phys->dd_used_bytes;
-	delta = MAX(used, new_reservation) -
+	delta = MAX(used, effective_value) -
 	    MAX(used, dd->dd_phys->dd_reserved);
-	dd->dd_phys->dd_reserved = new_reservation;
+	dd->dd_phys->dd_reserved = effective_value;

 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
@ -1123,23 +1178,39 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	}
 	mutex_exit(&dd->dd_lock);

-	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
-	    tx, cr, "%lld dataset = %llu",
-	    (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj);
+	spa_history_log_internal(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
+	    tx, "%lld dataset = %llu",
+	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }

 int
-dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+    uint64_t reservation)
 {
 	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
+	dsl_prop_setarg_t psa;
 	int err;

-	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
+
+	err = dsl_dataset_hold(ddname, FTAG, &ds);
 	if (err)
 		return (err);
+
+	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+	if (err) {
+		dsl_dataset_rele(ds, FTAG);
+		return (err);
+	}
+
+	ASSERT(ds->ds_dir == dd);
+
 	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
-	    dsl_dir_set_reservation_sync, dd, &reservation, 0);
+	    dsl_dir_set_reservation_sync, ds, &psa, 0);
+
 	dsl_dir_close(dd, FTAG);
+	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }

@ -1177,7 +1248,6 @@ struct renamearg {
 	const char *mynewname;
 };

-/*ARGSUSED*/
 static int
 dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 {
@ -1188,8 +1258,14 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 	int err;
 	uint64_t val;

-	/* There should be 2 references: the open and the dirty */
-	if (dmu_buf_refcount(dd->dd_dbuf) > 2)
+	/*
+	 * There should only be one reference, from dmu_objset_rename().
+	 * Fleeting holds are also possible (eg, from "zfs list" getting
+	 * stats), but any that are present in open context will likely
+	 * be gone by syncing context, so only fail from syncing
+	 * context.
+	 */
+	if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
 		return (EBUSY);

 	/* check for existing name */
@ -1218,7 +1294,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
 }

 static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 {
 	dsl_dir_t *dd = arg1;
 	struct renamearg *ra = arg2;
@ -1267,8 +1343,8 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
 	ASSERT3U(err, ==, 0);

-	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
-	    tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+	spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa,
+	    tx, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
 }

 int
@ -1318,6 +1394,29 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
 	return (0);
 }

+timestruc_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+	timestruc_t t;
+
+	mutex_enter(&dd->dd_lock);
+	t = dd->dd_snap_cmtime;
+	mutex_exit(&dd->dd_lock);
+
+	return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+	timestruc_t t;
+
+	gethrestime(&t);
+	mutex_enter(&dd->dd_lock);
+	dd->dd_snap_cmtime = t;
+	mutex_exit(&dd->dd_lock);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 EXPORT_SYMBOL(dsl_dir_set_quota);
 EXPORT_SYMBOL(dsl_dir_set_reservation);
--- a/module/zfs/dsl_pool.c
+++ b/module/zfs/dsl_pool.c
@ -19,14 +19,16 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
@ -36,10 +38,11 @@
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>

 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
-int zfs_txg_synctime = 5;			/* target secs to sync a txg */
+int zfs_txg_synctime_ms = 5000;		/* target millisecs to sync a txg */

 uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
 uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
@ -50,7 +53,7 @@ kmutex_t zfs_write_limit_lock;

 static pgcnt_t old_physmem = 0;

-static int
+int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
@ -88,7 +91,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	    offsetof(dsl_dataset_t, ds_synced_link));

 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);

 	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 	    1, 4, 0);
@ -103,13 +105,13 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
-	objset_impl_t *osi;
+	uint64_t obj;

 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
-	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+	    &dp->dp_meta_objset);
 	if (err)
 		goto out;
-	dp->dp_meta_objset = &osi->os;

 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
@ -143,52 +145,29 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 			goto out;
 	}

-	/* get scrub status */
-	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-	    &dp->dp_scrub_func);
-	if (err == 0) {
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_queue_obj);
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+		    &dp->dp_free_dir);
 		if (err)
 			goto out;
+
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_min_txg);
+		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err)
 			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_max_txg);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
-		    &dp->dp_scrub_bookmark);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-		    &spa->spa_scrub_errors);
-		if (err)
-			goto out;
-		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-			/*
-			 * A new-type scrub was in progress on an old
-			 * pool.  Restart from the beginning, since the
-			 * old software may have changed the pool in the
-			 * meantime.
-			 */
-			dsl_pool_scrub_restart(dp);
+		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+		    dp->dp_meta_objset, obj));
 	}
-	} else {
-		/*
-		 * It's OK if there is no scrub in progress (and if
-		 * there was an I/O error, ignore it).
-		 */
+
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+	    &dp->dp_tmp_userrefs_obj);
+	if (err == ENOENT)
 		err = 0;
-	}
+	if (err)
+		goto out;
+
+	err = dsl_scan_init(dp, txg);

 out:
 	rw_exit(&dp->dp_config_rwlock);
@ -214,22 +193,27 @@ dsl_pool_close(dsl_pool_t *dp)
 		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
 		dsl_dir_close(dp->dp_mos_dir, dp);
+	if (dp->dp_free_dir)
+		dsl_dir_close(dp->dp_free_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_close(dp->dp_root_dir, dp);

+	bpobj_close(&dp->dp_free_bpobj);
+
 	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 	if (dp->dp_meta_objset)
-		dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+		dmu_objset_evict(dp->dp_meta_objset);

 	txg_list_destroy(&dp->dp_dirty_datasets);
+	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 	list_destroy(&dp->dp_synced_datasets);

 	arc_flush(dp->dp_spa);
 	txg_fini(dp);
+	dsl_scan_fini(dp);
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
-	mutex_destroy(&dp->dp_scrub_cancel_lock);
 	taskq_destroy(dp->dp_vnrele_taskq);
 	if (dp->dp_blkstats)
 		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@ -242,19 +226,22 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
-	objset_impl_t *osip;
+	objset_t *os;
 	dsl_dataset_t *ds;
-	uint64_t dsobj;
+	uint64_t obj;

 	/* create and open the MOS (meta-objset) */
-	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
-	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
+	dp->dp_meta_objset = dmu_objset_create_impl(spa,
+	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 	ASSERT3U(err, ==, 0);

+	/* Initialize scan structures */
+	VERIFY3U(0, ==, dsl_scan_init(dp, txg));
+
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
@ -265,18 +252,33 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	VERIFY(0 == dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));

+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		/* create and open the free dir */
+		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+		    FREE_DIR_NAME, tx);
+		VERIFY(0 == dsl_pool_open_special_dir(dp,
+		    FREE_DIR_NAME, &dp->dp_free_dir));
+
+		/* create and open the free_bplist */
+		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+		    dp->dp_meta_objset, obj));
+	}
+
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);

 	/* create the root dataset */
-	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

 	/* create the root objset */
-	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-	osip = dmu_objset_create_impl(dp->dp_spa, ds,
+	VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 #ifdef _KERNEL
-	zfs_create_fs(&osip->os, kcred, zplprops, tx);
+	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
 	dsl_dataset_rele(ds, FTAG);

@ -285,6 +287,14 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 	return (dp);
 }

+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@ -293,11 +303,19 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	dsl_sync_task_group_t *dstg;
-	objset_impl_t *mosi = dp->dp_meta_objset->os;
+	objset_t *mos = dp->dp_meta_objset;
 	hrtime_t start, write_time;
 	uint64_t data_written;
 	int err;

+	/*
+	 * We need to copy dp_space_towrite() before doing
+	 * dsl_sync_task_group_sync(), because
+	 * dsl_dataset_snapshot_reserve_space() will increase
+	 * dp_space_towrite but not actually write anything.
+	 */
+	data_written = dp->dp_space_towrite[txg & TXG_MASK];
+
 	tx = dmu_tx_create_assigned(dp, txg);

 	dp->dp_read_overhead = 0;
@ -323,11 +341,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)

 	for (ds = list_head(&dp->dp_synced_datasets); ds;
 	    ds = list_next(&dp->dp_synced_datasets, ds))
-		dmu_objset_do_userquota_callbacks(ds->ds_user_ptr, tx);
+		dmu_objset_do_userquota_updates(ds->ds_objset, tx);

 	/*
 	 * Sync the datasets again to push out the changes due to
-	 * userquota updates.  This must be done before we process the
+	 * userspace updates.  This must be done before we process the
 	 * sync tasks, because that could cause a snapshot of a dataset
 	 * whose ds_bp will be rewritten when we do this 2nd sync.
 	 */
@ -339,6 +357,16 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	}
 	err = zio_wait(zio);

+	/*
+	 * Move dead blocks from the pending deadlist to the on-disk
+	 * deadlist.
+	 */
+	for (ds = list_head(&dp->dp_synced_datasets); ds;
+	    ds = list_next(&dp->dp_synced_datasets, ds)) {
+		bplist_iterate(&ds->ds_pending_deadlist,
+		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+	}
+
 	while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg))) {
 		/*
 		 * No more sync tasks should have been added while we
@ -354,14 +382,11 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 		dsl_dir_sync(dd, tx);
 	write_time += gethrtime() - start;

-	if (spa_sync_pass(dp->dp_spa) == 1)
-		dsl_pool_scrub_sync(dp, tx);
-
 	start = gethrtime();
-	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
-	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
+	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
+	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-		dmu_objset_sync(mosi, zio, tx);
+		dmu_objset_sync(mos, zio, tx);
 		err = zio_wait(zio);
 		ASSERT(err == 0);
 		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
@ -374,7 +399,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)

 	dmu_tx_commit(tx);

-	data_written = dp->dp_space_towrite[txg & TXG_MASK];
 	dp->dp_space_towrite[txg & TXG_MASK] = 0;
 	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);

@ -399,10 +423,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 	 * amount of write traffic allowed into each transaction group.
 	 * Weight the throughput calculation towards the current value:
 	 * 	thru = 3/4 old_thru + 1/4 new_thru
+	 *
+	 * Note: write_time is in nanosecs, so write_time/MICROSEC
+	 * yields millisecs
 	 */
 	ASSERT(zfs_write_limit_min > 0);
-	if (data_written > zfs_write_limit_min / 8 && write_time > 0) {
-		uint64_t throughput = (data_written * NANOSEC) / write_time;
+	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
+		uint64_t throughput = data_written / (write_time / MICROSEC);
+
 		if (dp->dp_throughput)
 			dp->dp_throughput = throughput / 4 +
 			    3 * dp->dp_throughput / 4;
@ -410,21 +438,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 			dp->dp_throughput = throughput;
 		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
 		    MAX(zfs_write_limit_min,
-		    dp->dp_throughput * zfs_txg_synctime));
+		    dp->dp_throughput * zfs_txg_synctime_ms));
 	}
 }

 void
-dsl_pool_zil_clean(dsl_pool_t *dp)
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
 	dsl_dataset_t *ds;
+	objset_t *os;

 	while ((ds = list_head(&dp->dp_synced_datasets))) {
 		list_remove(&dp->dp_synced_datasets, ds);
-		ASSERT(ds->ds_user_ptr != NULL);
-		zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil);
+		os = ds->ds_objset;
+		zil_clean(os->os_zil);
+		ASSERT(!dmu_objset_is_dirty(os, txg));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 	}
+	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }

 /*
@ -601,6 +632,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
 	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);

 	if (prev->ds_phys->ds_next_clones_obj == 0) {
+		dmu_buf_will_dirty(prev->ds_dbuf, tx);
 		prev->ds_phys->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
@ -620,8 +652,67 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);

-	(void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
-	    tx, DS_FIND_CHILDREN);
+	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+	    tx, DS_FIND_CHILDREN));
+}
+
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	objset_t *mos = dp->dp_meta_objset;
+
+	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+	if (ds->ds_dir->dd_phys->dd_origin_obj) {
+		dsl_dataset_t *origin;
+
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+
+		if (origin->ds_dir->dd_phys->dd_clones == 0) {
+			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
+			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+		}
+
+		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+
+		dsl_dataset_rele(origin, FTAG);
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	uint64_t obj;
+
+	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+	VERIFY(0 == dsl_pool_open_special_dir(dp,
+	    FREE_DIR_NAME, &dp->dp_free_dir));
+
+	/*
+	 * We can't use bpobj_alloc(), because spa_version() still
+	 * returns the old version, and we need a new-version bpobj with
+	 * subobj support.  So call dmu_object_alloc() directly.
+	 */
+	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+	VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+	VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+	    dp->dp_meta_objset, obj));
+
+	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 }

 void
@ -638,7 +729,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, tx);
 	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
 	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
@ -650,3 +741,108 @@ dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 {
 	return (dp->dp_vnrele_taskq);
 }
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+	zap_attribute_t za;
+	zap_cursor_t zc;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+
+	if (zapobj == 0)
+		return;
+	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+	for (zap_cursor_init(&zc, mos, zapobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		char *htag;
+		uint64_t dsobj;
+
+		htag = strchr(za.za_name, '-');
+		*htag = '\0';
+		++htag;
+		dsobj = strtonum(za.za_name, NULL);
+		(void) dsl_dataset_user_release_tmp(dp, dsobj, htag);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	objset_t *mos = dp->dp_meta_objset;
+
+	ASSERT(dp->dp_tmp_userrefs_obj == 0);
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
+	    DMU_OT_NONE, 0, tx);
+
+	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
+	    sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
+{
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+	char *name;
+	int error;
+
+	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	/*
+	 * If the pool was created prior to SPA_VERSION_USERREFS, the
+	 * zap object for temporary holds might not exist yet.
+	 */
+	if (zapobj == 0) {
+		if (holding) {
+			dsl_pool_user_hold_create_obj(dp, tx);
+			zapobj = dp->dp_tmp_userrefs_obj;
+		} else {
+			return (ENOENT);
+		}
+	}
+
+	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+	if (holding)
+		error = zap_add(mos, zapobj, name, 8, 1, now, tx);
+	else
+		error = zap_remove(mos, zapobj, name, tx);
+	strfree(name);
+
+	return (error);
+}
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+    uint64_t *now, dmu_tx_t *tx)
+{
+	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+    dmu_tx_t *tx)
+{
+	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
+	    tx, B_FALSE));
+}
--- a/module/zfs/dsl_prop.c
+++ b/module/zfs/dsl_prop.c
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
--- a/module/zfs/dsl_scrub.c
+++ b/module/zfs/dsl_scrub.c
--- a/module/zfs/dsl_synctask.c
+++ b/module/zfs/dsl_synctask.c
@ -19,18 +19,15 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

-
-
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
-#include <sys/cred.h>
+#include <sys/metaslab.h>

 #define	DST_AVG_BLKSHIFT 14

@ -50,7 +47,6 @@ dsl_sync_task_group_create(dsl_pool_t *dp)
 	list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
 	    offsetof(dsl_sync_task_t, dst_node));
 	dstg->dstg_pool = dp;
-	dstg->dstg_cr = CRED();

 	return (dstg);
 }
@ -112,14 +108,21 @@ top:
 		return (dstg->dstg_err);
 	}

-	VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+	/*
+	 * We don't generally have many sync tasks, so pay the price of
+	 * add_tail to get the tasks executed in the right order.
+	 */
+	VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+	    dstg, txg));

 	dmu_tx_commit(tx);

 	txg_wait_synced(dstg->dstg_pool, txg);

-	if (dstg->dstg_err == EAGAIN)
+	if (dstg->dstg_err == EAGAIN) {
+		txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
 		goto top;
+	}

 	return (dstg->dstg_err);
 }
@ -131,7 +134,12 @@ dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)

 	dstg->dstg_nowaiter = B_TRUE;
 	txg = dmu_tx_get_txg(tx);
-	VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+	/*
+	 * We don't generally have many sync tasks, so pay the price of
+	 * add_tail to get the tasks executed in the right order.
+	 */
+	VERIFY(0 == txg_list_add_tail(&dstg->dstg_pool->dp_sync_tasks,
+	    dstg, txg));
 }

 void
@ -150,25 +158,30 @@ void
 dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
 {
 	dsl_sync_task_t *dst;
-	void *tr_cookie;
+	dsl_pool_t *dp = dstg->dstg_pool;
+	uint64_t quota, used;

 	ASSERT3U(dstg->dstg_err, ==, 0);

 	/*
-	 * Check for sufficient space.
+	 * Check for sufficient space.  We just check against what's
+	 * on-disk; we don't want any in-flight accounting to get in our
+	 * way, because open context may have already used up various
+	 * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
 	 */
-	dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
-	    dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
-	/* don't bother trying again */
-	if (dstg->dstg_err == ERESTART)
-		dstg->dstg_err = EAGAIN;
-	if (dstg->dstg_err)
+	quota = dsl_pool_adjustedsize(dp, B_FALSE) -
+	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+	used = dp->dp_root_dir->dd_phys->dd_used_bytes;
+	/* MOS space is triple-dittoed, so we multiply by 3. */
+	if (dstg->dstg_space > 0 && used + dstg->dstg_space * 3 > quota) {
+		dstg->dstg_err = ENOSPC;
 		return;
+	}

 	/*
 	 * Check for errors by calling checkfuncs.
 	 */
-	rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 	for (dst = list_head(&dstg->dstg_tasks); dst;
 	    dst = list_next(&dstg->dstg_tasks, dst)) {
 		dst->dst_err =
@ -183,13 +196,10 @@ dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
 		 */
 		for (dst = list_head(&dstg->dstg_tasks); dst;
 		    dst = list_next(&dstg->dstg_tasks, dst)) {
-			dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
-			    dstg->dstg_cr, tx);
+			dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, tx);
 		}
 	}
-	rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
-	dsl_dir_tempreserve_clear(tr_cookie, tx);
+	rw_exit(&dp->dp_config_rwlock);

 	if (dstg->dstg_nowaiter)
 		dsl_sync_task_group_destroy(dstg);
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 /*
@ -94,6 +93,8 @@ static ulong_t ereport_qlen = 0;
 static size_t ereport_size = 0;
 static int ereport_cols = 80;

+extern void fastreboot_disable_highpil(void);
+
 /*
 * Common fault management kstats to record ereport generation
 * failures
@ -374,6 +375,9 @@ fm_panic(const char *format, ...)
 	va_list ap;

 	(void) casptr((void *)&fm_panicstr, NULL, (void *)format);
+#if defined(__i386) || defined(__amd64)
+	fastreboot_disable_highpil();
+#endif /* __i386 || __amd64 */
 	va_start(ap, format);
 	vpanic(format, ap);
 	va_end(ap);
@ -512,10 +516,10 @@ fm_ereport_post(nvlist_t *ereport, int evc_flag)
 	if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
 	    SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
 		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
-		sysevent_evc_unbind(error_chan);
+		(void) sysevent_evc_unbind(error_chan);
 		return;
 	}
-	sysevent_evc_unbind(error_chan);
+	(void) sysevent_evc_unbind(error_chan);
 }

 /*
@ -788,6 +792,14 @@ fm_payload_set(nvlist_t *payload, ...)
 *	detector		nvlist_t	<detector>
 *	ereport-payload		nvlist_t	<var args>
 *
+ * We don't actually add a 'version' member to the payload.  Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else.  Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
 */
 void
 fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
@ -920,46 +932,41 @@ fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	devpath			string		<devpath>
- *	devid			string		<devid>
+ *	[devid]			string		<devid>
+ *	[target-port-l0id]	string		<target-port-lun0-id>
 *
 * Note that auth and devid are optional members.
 */
 void
 fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
-    const char *devpath, const char *devid)
+    const char *devpath, const char *devid, const char *tpl0)
 {
+	int err = 0;
+
 	if (version != DEV_SCHEME_VERSION0) {
 		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
 		return;
 	}

-	if (nvlist_add_uint8(fmri_dev, FM_VERSION, version) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-		return;
-	}
-
-	if (nvlist_add_string(fmri_dev, FM_FMRI_SCHEME,
-	    FM_FMRI_SCHEME_DEV) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-		return;
-	}
+	err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+	err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);

 	if (auth != NULL) {
-		if (nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
-		    (nvlist_t *)auth) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-		}
+		err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+		    (nvlist_t *)auth);
 	}

-	if (nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
-	}
+	err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);

 	if (devid != NULL)
-		if (nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid) != 0)
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+	if (tpl0 != NULL)
+		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+	if (err)
+		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+
 }

 /*
@ -1264,3 +1271,102 @@ print_msg_hwerr(ctid_t ct_id, proc_t *p)
 	uprintf("Killed process %d (%s) in contract id %d "
 	    "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
 }
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+    nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+	nvlist_t *pairs[HC_MAXPAIRS];
+	nvlist_t **hcl;
+	uint_t n;
+	int i, j;
+	va_list ap;
+	char *hcname, *hcid;
+
+	if (!fm_fmri_hc_set_common(fmri, version, auth))
+		return;
+
+	/*
+	 * copy the bboard nvpairs to the pairs array
+	 */
+	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+	    != 0) {
+		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		return;
+	}
+
+	for (i = 0; i < n; i++) {
+		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+		    &hcname) != 0) {
+			atomic_add_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			return;
+		}
+		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+			atomic_add_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			return;
+		}
+
+		pairs[i] = fm_nvlist_create(nva);
+		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+			for (j = 0; j <= i; j++) {
+				if (pairs[j] != NULL)
+					fm_nvlist_destroy(pairs[j],
+					    FM_NVA_RETAIN);
+			}
+			atomic_add_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			return;
+		}
+	}
+
+	/*
+	 * create the pairs from passed in pairs
+	 */
+	npairs = MIN(npairs, HC_MAXPAIRS);
+
+	va_start(ap, npairs);
+	for (i = n; i < npairs + n; i++) {
+		const char *name = va_arg(ap, const char *);
+		uint32_t id = va_arg(ap, uint32_t);
+		char idstr[11];
+		(void) snprintf(idstr, sizeof (idstr), "%u", id);
+		pairs[i] = fm_nvlist_create(nva);
+		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+			for (j = 0; j <= i; j++) {
+				if (pairs[j] != NULL)
+					fm_nvlist_destroy(pairs[j],
+					    FM_NVA_RETAIN);
+			}
+			atomic_add_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			return;
+		}
+	}
+	va_end(ap);
+
+	/*
+	 * Create the fmri hc list
+	 */
+	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+	    npairs + n) != 0) {
+		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		return;
+	}
+
+	for (i = 0; i < npairs + n; i++) {
+			fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+	}
+
+	if (snvl != NULL) {
+		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+			atomic_add_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			return;
+		}
+	}
+}
--- a/module/zfs/include/sys/arc.h
+++ b/module/zfs/include/sys/arc.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_ARC_H
@ -48,7 +47,8 @@ arc_done_func_t arc_getbuf_func;
 struct arc_buf {
 	arc_buf_hdr_t		*b_hdr;
 	arc_buf_t		*b_next;
-	krwlock_t		b_lock;
+	kmutex_t		b_evict_lock;
+	krwlock_t		b_data_lock;
 	void			*b_data;
 	arc_evict_func_t	*b_efunc;
 	void			*b_private;
@ -87,10 +87,13 @@ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
    arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
+void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
 void arc_buf_add_ref(arc_buf_t *buf, void *tag);
 int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
+int arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
+    zbookmark_t *zb);
 int arc_released(arc_buf_t *buf);
 int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
@ -99,28 +102,16 @@ void arc_buf_thaw(arc_buf_t *buf);
 int arc_referenced(arc_buf_t *buf);
 #endif

-typedef struct writeprops {
-	dmu_object_type_t wp_type;
-	uint8_t wp_level;
-	uint8_t wp_copies;
-	uint8_t wp_dncompress, wp_oscompress;
-	uint8_t wp_dnchecksum, wp_oschecksum;
-} writeprops_t;
-
-void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp);
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
    arc_done_func_t *done, void *private, int priority, int zio_flags,
    uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t *arc_flags, const zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
-    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int zio_flags, const zbookmark_t *zb);
-int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags);
-int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private,
+    int priority, int zio_flags, const zbookmark_t *zb);

 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
 int arc_buf_evict(arc_buf_t *buf);
--- a/module/zfs/include/sys/bplist.h
+++ b/module/zfs/include/sys/bplist.h
@ -19,68 +19,36 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_BPLIST_H
 #define	_SYS_BPLIST_H

-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
 #include <sys/zfs_context.h>
+#include <sys/spa.h>

 #ifdef	__cplusplus
 extern "C" {
 #endif

-typedef struct bplist_phys {
-	/*
-	 * This is the bonus buffer for the dead lists.  The object's
-	 * contents is an array of bpl_entries blkptr_t's, representing
-	 * a total of bpl_bytes physical space.
-	 */
-	uint64_t	bpl_entries;
-	uint64_t	bpl_bytes;
-	uint64_t	bpl_comp;
-	uint64_t	bpl_uncomp;
-} bplist_phys_t;
-
-#define	BPLIST_SIZE_V0	(2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
-	blkptr_t	bpq_blk;
-	void		*bpq_next;
-} bplist_q_t;
+typedef struct bplist_entry {
+	blkptr_t	bpe_blk;
+	list_node_t	bpe_node;
+} bplist_entry_t;

 typedef struct bplist {
 	kmutex_t	bpl_lock;
-	objset_t	*bpl_mos;
-	uint64_t	bpl_object;
-	uint8_t		bpl_blockshift;
-	uint8_t		bpl_bpshift;
-	uint8_t		bpl_havecomp;
-	bplist_q_t	*bpl_queue;
-	bplist_phys_t	*bpl_phys;
-	dmu_buf_t	*bpl_dbuf;
-	dmu_buf_t	*bpl_cached_dbuf;
+	list_t		bpl_list;
 } bplist_t;

-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
-    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
-    uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+    void *arg, dmu_tx_t *tx);

 #ifdef	__cplusplus
 }
--- a/module/zfs/include/sys/bpobj.h
+++ b/module/zfs/include/sys/bpobj.h
@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_BPOBJ_H
+#define	_SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+	/*
+	 * This is the bonus buffer for the dead lists.  The object's
+	 * contents is an array of bpo_entries blkptr_t's, representing
+	 * a total of bpo_bytes physical space.
+	 */
+	uint64_t	bpo_num_blkptrs;
+	uint64_t	bpo_bytes;
+	uint64_t	bpo_comp;
+	uint64_t	bpo_uncomp;
+	uint64_t	bpo_subobjs;
+	uint64_t	bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define	BPOBJ_SIZE_V0	(2 * sizeof (uint64_t))
+#define	BPOBJ_SIZE_V1	(4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+	kmutex_t	bpo_lock;
+	objset_t	*bpo_os;
+	uint64_t	bpo_object;
+	int		bpo_epb;
+	uint8_t		bpo_havecomp;
+	uint8_t		bpo_havesubobj;
+	bpobj_phys_t	*bpo_phys;
+	dmu_buf_t	*bpo_dbuf;
+	dmu_buf_t	*bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
--- a/module/zfs/include/sys/dbuf.h
+++ b/module/zfs/include/sys/dbuf.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DBUF_H
@ -38,7 +37,6 @@
 extern "C" {
 #endif

-#define	DB_BONUS_BLKID (-1ULL)
 #define	IN_DMU_SYNC 2

 /*
@ -75,7 +73,6 @@ typedef enum dbuf_states {
 	DB_EVICTING
 } dbuf_states_t;

-struct objset_impl;
 struct dnode;
 struct dmu_tx;

@ -131,6 +128,7 @@ typedef struct dbuf_dirty_record {
 			arc_buf_t *dr_data;
 			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
+			uint8_t dr_copies;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;
@ -145,7 +143,7 @@ typedef struct dmu_buf_impl {
 	dmu_buf_t db;

 	/* the objset we belong to */
-	struct objset_impl *db_objset;
+	struct objset *db_objset;

 	/*
 	 * the dnode we belong to (NULL when evicted)
@ -239,6 +237,10 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);

 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);

 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
@ -252,6 +254,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);

 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);

 dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);

@ -263,6 +266,7 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);

 void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
@ -270,6 +274,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);

 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
    struct dmu_tx *);
@ -321,7 +326,7 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
+	sprintf_blkptr(__blkbuf, bp);				\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} 							\
--- a/module/zfs/include/sys/ddt.h
+++ b/module/zfs/include/sys/ddt.h
@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_DDT_H
+#define	_SYS_DDT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * On-disk DDT formats, in the desired search order (newest version first).
+ */
+enum ddt_type {
+	DDT_TYPE_ZAP = 0,
+	DDT_TYPES
+};
+
+/*
+ * DDT classes, in the desired search order (highest replication level first).
+ */
+enum ddt_class {
+	DDT_CLASS_DITTO = 0,
+	DDT_CLASS_DUPLICATE,
+	DDT_CLASS_UNIQUE,
+	DDT_CLASSES
+};
+
+#define	DDT_TYPE_CURRENT		0
+
+#define	DDT_COMPRESS_BYTEORDER_MASK	0x80
+#define	DDT_COMPRESS_FUNCTION_MASK	0x7f
+
+/*
+ * On-disk ddt entry:  key (name) and physical storage (value).
+ */
+typedef struct ddt_key {
+	zio_cksum_t	ddk_cksum;	/* 256-bit block checksum */
+	uint64_t	ddk_prop;	/* LSIZE, PSIZE, compression */
+} ddt_key_t;
+
+/*
+ * ddk_prop layout:
+ *
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *	|   0	|   0	|   0	| comp	|     PSIZE	|     LSIZE	|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+#define	DDK_GET_LSIZE(ddk)	\
+	BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
+#define	DDK_SET_LSIZE(ddk, x)	\
+	BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define	DDK_GET_PSIZE(ddk)	\
+	BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
+#define	DDK_SET_PSIZE(ddk, x)	\
+	BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+
+#define	DDK_GET_COMPRESS(ddk)		BF64_GET((ddk)->ddk_prop, 32, 8)
+#define	DDK_SET_COMPRESS(ddk, x)	BF64_SET((ddk)->ddk_prop, 32, 8, x)
+
+#define	DDT_KEY_WORDS	(sizeof (ddt_key_t) / sizeof (uint64_t))
+
+typedef struct ddt_phys {
+	dva_t		ddp_dva[SPA_DVAS_PER_BP];
+	uint64_t	ddp_refcnt;
+	uint64_t	ddp_phys_birth;
+} ddt_phys_t;
+
+enum ddt_phys_type {
+	DDT_PHYS_DITTO = 0,
+	DDT_PHYS_SINGLE = 1,
+	DDT_PHYS_DOUBLE = 2,
+	DDT_PHYS_TRIPLE = 3,
+	DDT_PHYS_TYPES
+};
+
+/*
+ * In-core ddt entry
+ */
+struct ddt_entry {
+	ddt_key_t	dde_key;
+	ddt_phys_t	dde_phys[DDT_PHYS_TYPES];
+	zio_t		*dde_lead_zio[DDT_PHYS_TYPES];
+	void		*dde_repair_data;
+	enum ddt_type	dde_type;
+	enum ddt_class	dde_class;
+	uint8_t		dde_loading;
+	uint8_t		dde_loaded;
+	kcondvar_t	dde_cv;
+	avl_node_t	dde_node;
+};
+
+/*
+ * In-core ddt
+ */
+struct ddt {
+	kmutex_t	ddt_lock;
+	avl_tree_t	ddt_tree;
+	avl_tree_t	ddt_repair_tree;
+	enum zio_checksum ddt_checksum;
+	spa_t		*ddt_spa;
+	objset_t	*ddt_os;
+	uint64_t	ddt_stat_object;
+	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];
+	ddt_histogram_t	ddt_histogram[DDT_TYPES][DDT_CLASSES];
+	ddt_histogram_t	ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+	ddt_object_t	ddt_object_stats[DDT_TYPES][DDT_CLASSES];
+	avl_node_t	ddt_node;
+};
+
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+	uint64_t	ddb_class;
+	uint64_t	ddb_type;
+	uint64_t	ddb_checksum;
+	uint64_t	ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
+typedef struct ddt_ops {
+	char ddt_op_name[32];
+	int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
+	    boolean_t prehash);
+	int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
+	int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+	void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+	    ddt_entry_t *dde);
+	int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+	    dmu_tx_t *tx);
+	int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+	    dmu_tx_t *tx);
+	int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
+	    uint64_t *walk);
+	uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+} ddt_ops_t;
+
+#define	DDT_NAMELEN	80
+
+extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, char *name);
+extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
+extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class);
+extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, dmu_object_info_t *);
+extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class);
+
+extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
+    uint64_t txg);
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
+    const ddt_phys_t *ddp, blkptr_t *bp);
+
+extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
+
+extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
+extern void ddt_phys_clear(ddt_phys_t *ddp);
+extern void ddt_phys_addref(ddt_phys_t *ddp);
+extern void ddt_phys_decref(ddt_phys_t *ddp);
+extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
+    uint64_t txg);
+extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
+extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);
+
+extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);
+
+extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
+extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
+extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
+extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
+extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
+extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+
+extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
+extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
+
+extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde,
+    ddt_phys_t *ddp_willref);
+extern int ddt_ditto_copies_present(ddt_entry_t *dde);
+
+extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
+extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
+
+extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
+extern void ddt_enter(ddt_t *ddt);
+extern void ddt_exit(ddt_t *ddt);
+extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
+extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
+
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+    const blkptr_t *bp);
+
+extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
+extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
+
+extern int ddt_entry_compare(const void *x1, const void *x2);
+
+extern void ddt_create(spa_t *spa);
+extern int ddt_load(spa_t *spa);
+extern void ddt_unload(spa_t *spa);
+extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx);
+
+extern const ddt_ops_t ddt_zap_ops;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DDT_H */
--- a/module/zfs/include/sys/dmu.h
+++ b/module/zfs/include/sys/dmu.h
@ -19,10 +19,11 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H

@ -38,12 +39,14 @@
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/cred.h>
+#include <sys/time.h>

 #ifdef	__cplusplus
 extern "C" {
 #endif

 struct uio;
+struct xuio;
 struct page;
 struct vnode;
 struct spa;
@ -59,8 +62,9 @@ struct drr_end;
 struct zbookmark;
 struct spa;
 struct nvlist;
-struct objset_impl;
 struct arc_buf;
+struct zio_prop;
+struct sa_handle;

 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
@ -73,8 +77,8 @@ typedef enum dmu_object_type {
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
-	DMU_OT_BPLIST,			/* UINT64 */
-	DMU_OT_BPLIST_HDR,		/* UINT64 */
+	DMU_OT_BPOBJ,			/* UINT64 */
+	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
@ -114,10 +118,22 @@ typedef enum dmu_object_type {
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
-	DMU_OT_SCRUB_QUEUE,		/* ZAP */
+	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
+	DMU_OT_DDT_ZAP,			/* ZAP */
+	DMU_OT_DDT_STATS,		/* ZAP */
+	DMU_OT_SA,			/* System attr */
+	DMU_OT_SA_MASTER_NODE,		/* ZAP */
+	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
+	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
+	DMU_OT_SCAN_XLATE,		/* ZAP */
+	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
+	DMU_OT_DEADLIST,		/* ZAP */
+	DMU_OT_DEADLIST_HDR,		/* UINT64 */
+	DMU_OT_DSL_CLONES,		/* ZAP */
+	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;

@ -140,16 +156,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);

-#define	DS_MODE_NOHOLD		0	/* internal use only */
-#define	DS_MODE_USER		1	/* simple access, no special needs */
-#define	DS_MODE_OWNER		2	/* the "main" access, e.g. a mount */
-#define	DS_MODE_TYPE_MASK	0x3
-#define	DS_MODE_TYPE(x)		((x) & DS_MODE_TYPE_MASK)
-#define	DS_MODE_READONLY	0x8
-#define	DS_MODE_IS_READONLY(x)	((x) & DS_MODE_READONLY)
-#define	DS_MODE_INCONSISTENT	0x10
-#define	DS_MODE_IS_INCONSISTENT(x)	((x) & DS_MODE_INCONSISTENT)
-
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)

@ -162,27 +168,35 @@ void zfs_znode_byteswap(void *buf, size_t size);

 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
+#define	DMU_DEADLIST_OBJECT	(-3ULL)

+/*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define	DMU_BONUS_BLKID		(-1ULL)
+#define	DMU_SPILL_BLKID		(-2ULL)
 /*
 * Public routines to create, destroy, open, and close objsets.
 */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
-    objset_t **osp);
-int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type,
-    objset_t **osp);
-void dmu_objset_close(objset_t *os);
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
+
 int dmu_objset_evict_dbufs(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+    uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
 int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
    boolean_t recursive);
 int dmu_objset_rename(const char *name, const char *newname,
    boolean_t recursive);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
    int flags);
 void dmu_objset_byteswap(void *buf, size_t size);

@ -201,7 +215,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
-#define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
+#define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
@ -209,19 +223,12 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
-
-/* 4x8 zbookmark_t */
-#define	DMU_POOL_SCRUB_BOOKMARK		"scrub_bookmark"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define	DMU_POOL_SCRUB_QUEUE		"scrub_queue"
-/* 1x8 txg */
-#define	DMU_POOL_SCRUB_MIN_TXG		"scrub_min_txg"
-/* 1x8 txg */
-#define	DMU_POOL_SCRUB_MAX_TXG		"scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define	DMU_POOL_SCRUB_FUNC		"scrub_func"
-/* 1x8 count */
-#define	DMU_POOL_SCRUB_ERRORS		"scrub_errors"
+#define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
+#define	DMU_POOL_DDT			"DDT-%s-%s-%s"
+#define	DMU_POOL_DDT_STATS		"DDT-statistics"
+#define	DMU_POOL_CREATION_VERSION	"creation_version"
+#define	DMU_POOL_SCAN			"scan"
+#define	DMU_POOL_FREE_BPOBJ		"free_bpobj"

 /*
 * Allocate an object from this objset.  The range of object numbers
@ -306,11 +313,14 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx);

 /*
- * Decide how many copies of a given block we should make.  Can be from
- * 1 to SPA_DVAS_PER_BP.
+ * Decide how to write a block: checksum, compression, number of copies, etc.
 */
-int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
-    dmu_object_type_t ot);
+#define	WP_NOFILL	0x1
+#define	WP_DMU_SYNC	0x2
+#define	WP_SPILL	0x4
+
+void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+    struct zio_prop *zp);
 /*
 * The bonus data is accessed more or less like a regular buffer.
 * You must dmu_bonus_hold() to get the buffer, which will give you a
@ -324,6 +334,17 @@ int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb,
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+    void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);

 /*
 * Obtain the DMU buffer from the specified object which contains the
@ -340,7 +361,7 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 * The object number must be a valid, allocated object number.
 */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    void *tag, dmu_buf_t **);
+    void *tag, dmu_buf_t **, int flags);
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
@ -437,11 +458,34 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
    uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);

+/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter. The caller is responsible for properly allocating and
+ * freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted. It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+    void *dcb_data);
+
 /*
 * Free up the data blocks for a defined range of a file.  If size is
 * zero, the range from offset to end-of-file is freed.
@ -469,12 +513,23 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
    dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+    dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, struct page *pp, dmu_tx_t *tx);
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
    dmu_tx_t *tx);
+int dmu_xuio_init(struct xuio *uio, int niov);
+void dmu_xuio_fini(struct xuio *uio);
+int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
+    size_t n);
+int dmu_xuio_cnt(struct xuio *uio);
+struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
+void dmu_xuio_clear(struct xuio *uio, int i);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();

 extern int zfs_prefetch_disable;

@ -485,19 +540,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t len);

 typedef struct dmu_object_info {
-	/* All sizes are in bytes. */
+	/* All sizes are in bytes unless otherwise indicated. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
-	uint64_t doi_bonus_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
+	uint64_t doi_bonus_size;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_pad[5];
-	/* Values below are number of 512-byte blocks. */
-	uint64_t doi_physical_blks;		/* data + metadata */
-	uint64_t doi_max_block_offset;
+	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
+	uint64_t doi_max_offset;
+	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;

 typedef void arc_byteswap_func_t(void *buf, size_t size);
@ -566,6 +621,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
 */
 uint64_t dmu_objset_fsid_guid(objset_t *os);

+/*
+ * Get the [cm]time for an objset's snapshot dir
+ */
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+
 int dmu_objset_is_snapshot(objset_t *os);

 extern struct spa *dmu_objset_spa(objset_t *os);
@ -575,6 +635,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_syncprop(objset_t *os);
+extern uint64_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@ -582,9 +644,8 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp);

-typedef void objset_used_cb_t(objset_t *os, dmu_object_type_t bonustype,
-    void *oldbonus, void *newbonus, uint64_t oldused, uint64_t newused,
-    dmu_tx_t *tx);
+typedef int objset_used_cb_t(dmu_object_type_t bonustype,
+    void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
    objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
@ -605,9 +666,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 * storage when the write completes this new data does not become a
 * permanent part of the file until the associated transaction commits.
 */
-typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
-int dmu_sync(struct zio *zio, dmu_buf_t *db,
-    struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
+
+/*
+ * {zfs,zvol,ztest}_get_done() args
+ */
+typedef struct zgd {
+	struct zilog	*zgd_zilog;
+	struct blkptr	*zgd_bp;
+	dmu_buf_t	*zgd_db;
+	struct rl	*zgd_rl;
+	void		*zgd_private;
+} zgd_t;
+
+typedef void dmu_sync_cb_t(zgd_t *arg, int error);
+int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);

 /*
 * Find the next hole or data block in file starting at *off
@ -642,11 +714,12 @@ typedef struct dmu_recv_cookie {
 	struct dsl_dataset *drc_real_ds;
 	struct drr_begin *drc_drrb;
 	char *drc_tosnap;
+	char *drc_top_ds;
 	boolean_t drc_newfs;
 	boolean_t drc_force;
 } dmu_recv_cookie_t;

-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *,
+int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
 int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
 int dmu_recv_end(dmu_recv_cookie_t *drc);
--- a/module/zfs/include/sys/dmu_impl.h
+++ b/module/zfs/include/sys/dmu_impl.h
@ -19,7 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

@ -210,8 +210,7 @@ extern "C" {
 *
 * ds_lock
 *    protects:
- *    	ds_user_ptr
- *    	ds_user_evict_func
+ *    	ds_objset
 *    	ds_open_refcount
 *    	ds_snapname
 *    	ds_phys accounting
@ -233,6 +232,39 @@ extern "C" {
 struct objset;
 struct dmu_pool;

+typedef struct dmu_xuio {
+	int next;
+	int cnt;
+	struct arc_buf **bufs;
+	iovec_t *iovp;
+} dmu_xuio_t;
+
+typedef struct xuio_stats {
+	/* loaned yet not returned arc_buf */
+	kstat_named_t xuiostat_onloan_rbuf;
+	kstat_named_t xuiostat_onloan_wbuf;
+	/* whether a copy is made when loaning out a read buffer */
+	kstat_named_t xuiostat_rbuf_copied;
+	kstat_named_t xuiostat_rbuf_nocopy;
+	/* whether a copy is made when assigning a write buffer */
+	kstat_named_t xuiostat_wbuf_copied;
+	kstat_named_t xuiostat_wbuf_nocopy;
+} xuio_stats_t;
+
+static xuio_stats_t xuio_stats = {
+	{ "onloan_read_buf",	KSTAT_DATA_UINT64 },
+	{ "onloan_write_buf",	KSTAT_DATA_UINT64 },
+	{ "read_buf_copied",	KSTAT_DATA_UINT64 },
+	{ "read_buf_nocopy",	KSTAT_DATA_UINT64 },
+	{ "write_buf_copied",	KSTAT_DATA_UINT64 },
+	{ "write_buf_nocopy",	KSTAT_DATA_UINT64 }
+};
+
+#define	XUIOSTAT_INCR(stat, val)	\
+	atomic_add_64(&xuio_stats.stat.value.ui64, (val))
+#define	XUIOSTAT_BUMP(stat)	XUIOSTAT_INCR(stat, 1)
+
+
 #ifdef	__cplusplus
 }
 #endif
--- a/module/zfs/include/sys/dmu_objset.h
+++ b/module/zfs/include/sys/dmu_objset.h
@ -19,10 +19,11 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_DMU_OBJSET_H
 #define	_SYS_DMU_OBJSET_H

@ -33,6 +34,7 @@
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
+#include <sys/sa.h>

 #ifdef	__cplusplus
 extern "C" {
@ -40,11 +42,13 @@ extern "C" {

 struct dsl_dataset;
 struct dmu_tx;
-struct objset_impl;

 #define	OBJSET_PHYS_SIZE 2048
 #define	OBJSET_OLD_PHYS_SIZE 1024

+#define	OBJSET_BUF_HAS_USERUSED(buf) \
+	(arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
 #define	OBJSET_FLAG_USERACCOUNTING_COMPLETE	(1ULL<<0)

 typedef struct objset_phys {
@ -59,11 +63,6 @@ typedef struct objset_phys {
 } objset_phys_t;

 struct objset {
-	struct objset_impl *os;
-	int os_mode;
-};
-
-typedef struct objset_impl {
 	/* Immutable: */
 	struct dsl_dataset *os_dsl_dataset;
 	spa_t *os_spa;
@ -73,12 +72,17 @@ typedef struct objset_impl {
 	dnode_t *os_userused_dnode;
 	dnode_t *os_groupused_dnode;
 	zilog_t *os_zil;
-	objset_t os;
-	uint8_t os_checksum;	/* can change, under dsl_dir's locks */
-	uint8_t os_compress;	/* can change, under dsl_dir's locks */
-	uint8_t os_copies;	/* can change, under dsl_dir's locks */
-	uint8_t os_primary_cache;	/* can change, under dsl_dir's locks */
-	uint8_t os_secondary_cache;	/* can change, under dsl_dir's locks */
+
+	/* can change, under dsl_dir's locks: */
+	uint8_t os_checksum;
+	uint8_t os_compress;
+	uint8_t os_copies;
+	uint8_t os_dedup_checksum;
+	uint8_t os_dedup_verify;
+	uint8_t os_logbias;
+	uint8_t os_primary_cache;
+	uint8_t os_secondary_cache;
+	uint8_t os_sync;

 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
@ -101,8 +105,12 @@ typedef struct objset_impl {
 	/* stuff we store for the user */
 	kmutex_t os_user_ptr_lock;
 	void *os_user_ptr;
-} objset_impl_t;

+	/* SA layout/attribute registration */
+	sa_os_t *os_sa;
+};
+
+#define	DMU_META_OBJSET		0
 #define	DMU_META_DNODE_OBJECT	0
 #define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)

@ -111,14 +119,18 @@ typedef struct objset_impl {
 	(os)->os_secondary_cache == ZFS_CACHE_METADATA)

 /* called from zpl */
-int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
-    objset_t **osp);
-void dmu_objset_close(objset_t *os);
-int dmu_objset_create(const char *name, dmu_objset_type_t type,
-    objset_t *clone_parent, uint64_t flags,
+int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
+int dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_rele(objset_t *os, void *tag);
+void dmu_objset_disown(objset_t *os, void *tag);
+int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
+
+int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
+int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
+    uint64_t flags);
 int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_objset_rollback(objset_t *os);
 int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
    boolean_t recursive);
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
@ -126,23 +138,26 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(char *, void *), void *arg,
+int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
    int flags);
 int dmu_objset_find_spa(spa_t *spa, const char *name,
    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
-int dmu_objset_prefetch(char *name, void *arg);
+int dmu_objset_prefetch(const char *name, void *arg);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dmu_objset_evict_dbufs(objset_t *os);
+timestruc_t dmu_objset_snap_cmtime(objset_t *os);

 /* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
+void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
+boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
+objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
    blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
-    objset_impl_t **osip);
-void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
-void dmu_objset_do_userquota_callbacks(objset_impl_t *os, dmu_tx_t *tx);
-boolean_t dmu_objset_userused_enabled(objset_impl_t *os);
+    objset_t **osp);
+void dmu_objset_evict(objset_t *os);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
+boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);

--- a/module/zfs/include/sys/dmu_traverse.h
+++ b/module/zfs/include/sys/dmu_traverse.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DMU_TRAVERSE_H
@ -36,19 +35,24 @@ extern "C" {

 struct dnode_phys;
 struct dsl_dataset;
+struct zilog;
+struct arc_buf;

-typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
-    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    struct arc_buf *pbuf, const zbookmark_t *zb, const struct dnode_phys *dnp,
+    void *arg);

 #define	TRAVERSE_PRE			(1<<0)
 #define	TRAVERSE_POST			(1<<1)
 #define	TRAVERSE_PREFETCH_METADATA	(1<<2)
 #define	TRAVERSE_PREFETCH_DATA		(1<<3)
 #define	TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+#define	TRAVERSE_HARD			(1<<4)

-int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
-    int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
+int traverse_dataset(struct dsl_dataset *ds,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);

 #ifdef	__cplusplus
 }
--- a/module/zfs/include/sys/dmu_tx.h
+++ b/module/zfs/include/sys/dmu_tx.h
@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef	_SYS_DMU_TX_H
 #define	_SYS_DMU_TX_H

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/inttypes.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
@ -59,6 +57,7 @@ struct dmu_tx {
 	txg_handle_t tx_txgh;
 	void *tx_tempreserve_cookie;
 	struct dmu_tx_hold *tx_needassign_txh;
+	list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
 	uint8_t tx_anyobj;
 	int tx_err;
 #ifdef ZFS_DEBUG
@ -78,6 +77,7 @@ enum dmu_tx_hold_type {
 	THT_FREE,
 	THT_ZAP,
 	THT_SPACE,
+	THT_SPILL,
 	THT_NUMTYPES
 };

@ -98,6 +98,11 @@ typedef struct dmu_tx_hold {
 #endif
 } dmu_tx_hold_t;

+typedef struct dmu_tx_callback {
+	list_node_t		dcb_node;    /* linked to tx_callbacks list */
+	dmu_tx_callback_func_t	*dcb_func;   /* caller function pointer */
+	void			*dcb_data;   /* caller private data */
+} dmu_tx_callback_t;

 /*
 * These routines are defined in dmu.h, and are called by the user.
@ -109,6 +114,10 @@ void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 void dmu_tx_wait(dmu_tx_t *tx);

+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+    void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
 /*
 * These routines are defined in dmu_spa.h, and are called by the SPA.
 */
--- a/module/zfs/include/sys/dmu_zfetch.h
+++ b/module/zfs/include/sys/dmu_zfetch.h
@ -19,15 +19,13 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

 #ifndef	_DFETCH_H
 #define	_DFETCH_H

-
-
 #include <sys/zfs_context.h>

 #ifdef	__cplusplus
@ -63,6 +61,9 @@ typedef struct zfetch {
 	uint64_t	zf_alloc_fail;	/* # of failed attempts to alloc strm */
 } zfetch_t;

+void		zfetch_init(void);
+void		zfetch_fini(void);
+
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
 void		dmu_zfetch_rele(zfetch_t *);
 void		dmu_zfetch_cons(zfetch_t *);
--- a/module/zfs/include/sys/dnode.h
+++ b/module/zfs/include/sys/dnode.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DNODE_H
@ -62,6 +61,18 @@ extern "C" {
 #define	DN_MAX_OBJECT_SHIFT	48	/* 256 trillion (zfs_fid_t limit) */
 #define	DN_MAX_OFFSET_SHIFT	64	/* 2^64 bytes in a dnode */

+/*
+ * dnode id flags
+ *
+ * Note: a file will never ever have its
+ * ids moved from bonus->spill
+ * and only in a crypto environment would it be on spill
+ */
+#define	DN_ID_CHKED_BONUS	0x1
+#define	DN_ID_CHKED_SPILL	0x2
+#define	DN_ID_OLD_EXIST		0x4
+#define	DN_ID_NEW_EXIST		0x8
+
 /*
 * Derived constants.
 */
@ -70,10 +81,12 @@ extern "C" {
 #define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
 #define	DN_MAX_OBJECT	(1ULL << DN_MAX_OBJECT_SHIFT)
 #define	DN_ZERO_BONUSLEN	(DN_MAX_BONUSLEN + 1)
+#define	DN_KILL_SPILLBLK (1)

 #define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
 #define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
 #define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+#define	DNODES_PER_LEVEL	(1ULL << DNODES_PER_LEVEL_SHIFT)

 /* The +2 here is a cheesy way to round up */
 #define	DN_MAX_LEVELS	(2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
@ -88,7 +101,7 @@ extern "C" {
 #define	EPB(blkshift, typeshift)	(1 << (blkshift - typeshift))

 struct dmu_buf_impl;
-struct objset_impl;
+struct objset;
 struct zio;

 enum dnode_dirtycontext {
@ -101,6 +114,9 @@ enum dnode_dirtycontext {
 #define	DNODE_FLAG_USED_BYTES		(1<<0)
 #define	DNODE_FLAG_USERUSED_ACCOUNTED	(1<<1)

+/* Does dnode have a SA spill blkptr in bonus? */
+#define	DNODE_FLAG_SPILL_BLKPTR	(1<<2)
+
 typedef struct dnode_phys {
 	uint8_t dn_type;		/* dmu_object_type_t */
 	uint8_t dn_indblkshift;		/* ln2(indirect block size) */
@ -121,7 +137,8 @@ typedef struct dnode_phys {
 	uint64_t dn_pad3[4];

 	blkptr_t dn_blkptr[1];
-	uint8_t dn_bonus[DN_MAX_BONUSLEN];
+	uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+	blkptr_t dn_spill;
 } dnode_phys_t;

 typedef struct dnode {
@ -136,7 +153,7 @@ typedef struct dnode {
 	list_node_t dn_link;

 	/* immutable: */
-	struct objset_impl *dn_objset;
+	struct objset *dn_objset;
 	uint64_t dn_object;
 	struct dmu_buf_impl *dn_dbuf;
 	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
@ -161,6 +178,8 @@ typedef struct dnode {
 	uint8_t dn_next_nblkptr[TXG_SIZE];
 	uint8_t dn_next_nlevels[TXG_SIZE];
 	uint8_t dn_next_indblkshift[TXG_SIZE];
+	uint8_t dn_next_bonustype[TXG_SIZE];
+	uint8_t dn_rm_spillblk[TXG_SIZE];	/* for removing spill blk */
 	uint16_t dn_next_bonuslen[TXG_SIZE];
 	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */

@ -185,12 +204,17 @@ typedef struct dnode {
 	kmutex_t dn_dbufs_mtx;
 	list_t dn_dbufs;		/* linked list of descendent dbuf_t's */
 	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
+	boolean_t dn_have_spill;	/* have spill or are spilling */

 	/* parent IO for current sync write */
 	zio_t *dn_zio;

 	/* used in syncing context */
-	dnode_phys_t *dn_oldphys;
+	uint64_t dn_oldused;	/* old phys used bytes */
+	uint64_t dn_oldflags;	/* old phys dn_flags */
+	uint64_t dn_olduid, dn_oldgid;
+	uint64_t dn_newuid, dn_newgid;
+	int dn_id_flags;

 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
@ -202,14 +226,17 @@ typedef struct free_range {
 	uint64_t fr_nblks;
 } free_range_t;

-dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
+dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
    uint64_t object);
 void dnode_special_close(dnode_t *dn);

 void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
-int dnode_hold(struct objset_impl *dd, uint64_t object,
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
+int dnode_hold(struct objset *dd, uint64_t object,
    void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
    void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
--- a/module/zfs/include/sys/dsl_dataset.h
+++ b/module/zfs/include/sys/dsl_dataset.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DSL_DATASET_H
@ -33,6 +32,7 @@
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>

 #ifdef	__cplusplus
 extern "C" {
@ -42,8 +42,6 @@ struct dsl_dataset;
 struct dsl_dir;
 struct dsl_pool;

-typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *);
-
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 #define	DS_IS_INCONSISTENT(ds)	\
 	((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
@ -85,7 +83,7 @@ typedef struct dsl_dataset_phys {
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
-	uint64_t ds_deadlist_obj;	/* DMU_OT_BPLIST */
+	uint64_t ds_deadlist_obj;	/* DMU_OT_DEADLIST */
 	uint64_t ds_used_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
@ -115,10 +113,10 @@ typedef struct dsl_dataset {

 	/* only used in syncing context, only valid for non-snapshots: */
 	struct dsl_dataset *ds_prev;
-	uint64_t ds_origin_txg;

 	/* has internal locking: */
-	bplist_t ds_deadlist;
+	dsl_deadlist_t ds_deadlist;
+	bplist_t ds_pending_deadlist;

 	/* to protect against multiple concurrent incremental recv */
 	kmutex_t ds_recvlock;
@ -132,8 +130,7 @@ typedef struct dsl_dataset {
 	 * Protected by ds_lock:
 	 */
 	kmutex_t ds_lock;
-	void *ds_user_ptr;
-	dsl_dataset_evict_func_t *ds_user_evict_func;
+	objset_t *ds_objset;
 	uint64_t ds_userrefs;

 	/*
@ -174,17 +171,17 @@ struct dsl_ds_destroyarg {
 int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
    void *tag, dsl_dataset_t **);
-int dsl_dataset_own(const char *name, int flags, void *owner,
-    dsl_dataset_t **dsp);
+int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+    void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
-    int flags, void *owner, dsl_dataset_t **);
+    boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_disown(dsl_dataset_t *ds, void *owner);
+void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
-    void *owner);
-void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner);
+    void *tag);
+void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
    dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@ -195,21 +192,18 @@ dsl_checkfunc_t dsl_dataset_destroy_check;
 dsl_syncfunc_t dsl_dataset_destroy_sync;
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost);
 int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
+int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
    boolean_t force);
 int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive);
+    boolean_t recursive, boolean_t temphold);
 int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
    boolean_t recursive);
+int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
+    char *htag);
 int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);

-void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds,
-    void *p, dsl_dataset_evict_func_t func);
-void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
-
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);

@ -219,10 +213,12 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);

 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);

-void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
    dmu_tx_t *tx);
-boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
+    dmu_tx_t *tx, boolean_t async);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+    uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);

 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
@ -238,13 +234,13 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
    uint64_t asize, uint64_t inflight, uint64_t *used,
    uint64_t *ref_rsrv);
-int dsl_dataset_set_quota(const char *dsname, uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx);
-int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation);
-void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags);
-int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation,
-    dmu_tx_t *tx);
+int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+    uint64_t quota);
+dsl_syncfunc_t dsl_dataset_set_quota_sync;
+int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+    uint64_t reservation);
+
+int dsl_destroy_inconsistent(const char *dsname, void *arg);

 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
--- a/module/zfs/include/sys/dsl_deadlist.h
+++ b/module/zfs/include/sys/dsl_deadlist.h
@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_DEADLIST_H
+#define	_SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+	uint64_t dl_used;
+	uint64_t dl_comp;
+	uint64_t dl_uncomp;
+	uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+	objset_t *dl_os;
+	uint64_t dl_object;
+	avl_tree_t dl_tree;
+	boolean_t dl_havetree;
+	struct dmu_buf *dl_dbuf;
+	dsl_deadlist_phys_t *dl_phys;
+	kmutex_t dl_lock;
+
+	/* if it's the old on-disk format: */
+	bpobj_t dl_bpobj;
+	boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+	avl_node_t dle_node;
+	uint64_t dle_mintxg;
+	bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+    uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
--- a/module/zfs/include/sys/dsl_dir.h
+++ b/module/zfs/include/sys/dsl_dir.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DSL_DIR_H
@ -70,7 +69,8 @@ typedef struct dsl_dir_phys {
 	uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
 	uint64_t dd_flags;
 	uint64_t dd_used_breakdown[DD_USED_NUM];
-	uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+	uint64_t dd_clones; /* dsl_dir objects */
+	uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
 } dsl_dir_phys_t;

 struct dsl_dir {
@ -89,6 +89,8 @@ struct dsl_dir {
 	/* Protected by dd_lock */
 	kmutex_t dd_lock;
 	list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+	timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+	uint64_t dd_origin_txg;

 	/* gross estimate of space used by in-flight tx's */
 	uint64_t dd_tempreserved[TXG_SIZE];
@ -125,18 +127,24 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx);
 void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx);
-int dsl_dir_set_quota(const char *ddname, uint64_t quota);
-int dsl_dir_set_reservation(const char *ddname, uint64_t reservation);
+int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
+    uint64_t quota);
+int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+    uint64_t reservation);
 int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
 int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
 int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
    uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
+void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
+timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);

 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
 #define	ORIGIN_DIR_NAME "$ORIGIN"
+#define	XLATION_DIR_NAME "$XLATION"
+#define	FREE_DIR_NAME "$FREE"

 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
--- a/module/zfs/include/sys/dsl_pool.h
+++ b/module/zfs/include/sys/dsl_pool.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DSL_POOL_H
@ -32,6 +31,9 @@
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/dnode.h>
+#include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>

 #ifdef	__cplusplus
 extern "C" {
@ -42,12 +44,7 @@ struct dsl_dir;
 struct dsl_dataset;
 struct dsl_pool;
 struct dmu_tx;
-
-enum scrub_func {
-	SCRUB_FUNC_NONE,
-	SCRUB_FUNC_CLEAN,
-	SCRUB_FUNC_NUMFUNCS
-};
+struct dsl_scan;

 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define	DMU_OT_DEFERRED	DMU_OT_NONE
@ -75,6 +72,7 @@ typedef struct dsl_pool {
 	struct objset *dp_meta_objset;
 	struct dsl_dir *dp_root_dir;
 	struct dsl_dir *dp_mos_dir;
+	struct dsl_dir *dp_free_dir;
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_vnrele_taskq;
@ -83,25 +81,18 @@ typedef struct dsl_pool {
 	blkptr_t dp_meta_rootbp;
 	list_t dp_synced_datasets;
 	hrtime_t dp_read_overhead;
-	uint64_t dp_throughput;
+	uint64_t dp_throughput; /* bytes per millisec */
 	uint64_t dp_write_limit;
+	uint64_t dp_tmp_userrefs_obj;
+	bpobj_t dp_free_bpobj;
+
+	struct dsl_scan *dp_scan;

 	/* Uses dp_lock */
 	kmutex_t dp_lock;
 	uint64_t dp_space_towrite[TXG_SIZE];
 	uint64_t dp_tempreserved[TXG_SIZE];

-	enum scrub_func dp_scrub_func;
-	uint64_t dp_scrub_queue_obj;
-	uint64_t dp_scrub_min_txg;
-	uint64_t dp_scrub_max_txg;
-	zbookmark_t dp_scrub_bookmark;
-	boolean_t dp_scrub_pausing;
-	boolean_t dp_scrub_isresilver;
-	uint64_t dp_scrub_start_time;
-	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
-	boolean_t dp_scrub_restart;
-
 	/* Has its own locking */
 	tx_state_t dp_tx;
 	txg_list_t dp_dirty_datasets;
@ -123,29 +114,36 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
 void dsl_pool_close(dsl_pool_t *dp);
 dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
 void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
-void dsl_pool_zil_clean(dsl_pool_t *dp);
+void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg);
 int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
+uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
 int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
 void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_memory_pressure(dsl_pool_t *dp);
 void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
-    zio_done_func_t *done, void *private, uint32_t arc_flags);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
-    struct dmu_tx *tx);
+void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+    const blkptr_t *bpp);
+int dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
+int dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
+    arc_done_func_t *done, void *private, int priority, int zio_flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);

 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);

+extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, uint64_t *now, dmu_tx_t *tx);
+extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, dmu_tx_t *tx);
+extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+
 #ifdef	__cplusplus
 }
 #endif
--- a/module/zfs/include/sys/dsl_prop.h
+++ b/module/zfs/include/sys/dsl_prop.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DSL_PROP_H
@ -49,6 +48,25 @@ typedef struct dsl_prop_cb_record {
 	void *cbr_arg;
 } dsl_prop_cb_record_t;

+typedef struct dsl_props_arg {
+	nvlist_t *pa_props;
+	zprop_source_t pa_source;
+} dsl_props_arg_t;
+
+typedef struct dsl_prop_set_arg {
+	const char *psa_name;
+	zprop_source_t psa_source;
+	int psa_intsz;
+	int psa_numints;
+	const void *psa_value;
+
+	/*
+	 * Used to handle the special requirements of the quota and reservation
+	 * properties.
+	 */
+	uint64_t psa_effective_value;
+} dsl_prop_setarg_t;
+
 int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
    dsl_prop_changed_cb_t *callback, void *cbarg);
 int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
@ -59,18 +77,36 @@ int dsl_prop_get(const char *ddname, const char *propname,
    int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_integer(const char *ddname, const char *propname,
    uint64_t *valuep, char *setpoint);
-int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local);
+int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
 int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
    int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
-    int intsz, int numints, void *buf, char *setpoint);
+    int intsz, int numints, void *buf, char *setpoint,
+    boolean_t snapshot);

 dsl_syncfunc_t dsl_props_set_sync;
 int dsl_prop_set(const char *ddname, const char *propname,
-    int intsz, int numints, const void *buf);
-int dsl_props_set(const char *dsname, nvlist_t *nvl);
+    zprop_source_t source, int intsz, int numints, const void *buf);
+int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
 void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx);
+    dmu_tx_t *tx);
+
+void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
+    zprop_source_t source, uint64_t *value);
+int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#ifdef	ZFS_DEBUG
+void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
+#define	DSL_PROP_CHECK_PREDICTION(dd, psa)	\
+	dsl_prop_check_prediction((dd), (psa))
+#else
+#define	DSL_PROP_CHECK_PREDICTION(dd, psa)	/* nothing */
+#endif
+
+/* flag first receive on or after SPA_VERSION_RECVD_PROPS */
+boolean_t dsl_prop_get_hasrecvd(objset_t *os);
+void dsl_prop_set_hasrecvd(objset_t *os);
+void dsl_prop_unset_hasrecvd(objset_t *os);

 void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
 void dsl_prop_nvlist_add_string(nvlist_t *nv,
--- a/module/zfs/include/sys/dsl_scan.h
+++ b/module/zfs/include/sys/dsl_scan.h
@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_SCAN_H
+#define	_SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+	uint64_t scn_func; /* pool_scan_func_t */
+	uint64_t scn_state; /* dsl_scan_state_t */
+	uint64_t scn_queue_obj;
+	uint64_t scn_min_txg;
+	uint64_t scn_max_txg;
+	uint64_t scn_cur_min_txg;
+	uint64_t scn_cur_max_txg;
+	uint64_t scn_start_time;
+	uint64_t scn_end_time;
+	uint64_t scn_to_examine; /* total bytes to be scanned */
+	uint64_t scn_examined; /* bytes scanned so far */
+	uint64_t scn_to_process;
+	uint64_t scn_processed;
+	uint64_t scn_errors;	/* scan I/O error count */
+	uint64_t scn_ddt_class_max;
+	ddt_bookmark_t scn_ddt_bookmark;
+	zbookmark_t scn_bookmark;
+	uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define	SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+	DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+typedef struct dsl_scan {
+	struct dsl_pool *scn_dp;
+
+	boolean_t scn_pausing;
+	uint64_t scn_restart_txg;
+	uint64_t scn_sync_start_time;
+	zio_t *scn_zio_root;
+
+	/* for debugging / information */
+	uint64_t scn_visited_this_txg;
+
+	dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+    struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
--- a/module/zfs/include/sys/dsl_synctask.h
+++ b/module/zfs/include/sys/dsl_synctask.h
@ -19,15 +19,12 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_DSL_SYNCTASK_H
 #define	_SYS_DSL_SYNCTASK_H

-
-
 #include <sys/txg.h>
 #include <sys/zfs_context.h>

@ -38,7 +35,7 @@ extern "C" {
 struct dsl_pool;

 typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, void *, dmu_tx_t *);

 typedef struct dsl_sync_task {
 	list_node_t dst_node;
@ -53,7 +50,6 @@ typedef struct dsl_sync_task_group {
 	txg_node_t dstg_node;
 	list_t dstg_tasks;
 	struct dsl_pool *dstg_pool;
-	cred_t *dstg_cr;
 	uint64_t dstg_txg;
 	int dstg_err;
 	int dstg_space;
--- a/module/zfs/include/sys/fm/fs/zfs.h
+++ b/module/zfs/include/sys/fm/fs/zfs.h
@ -68,6 +68,18 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET	"zio_offset"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE		"zio_size"
 #define	FM_EREPORT_PAYLOAD_ZFS_PREV_STATE	"prev_state"
+#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED	"cksum_expected"
+#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL	"cksum_actual"
+#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO	"cksum_algorithm"
+#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP	"cksum_byteswap"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS	"bad_range_sets"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS	"bad_range_clears"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS	"bad_set_bits"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS	"bad_cleared_bits"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
+#define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"

 #define	FM_EREPORT_FAILMODE_WAIT		"wait"
 #define	FM_EREPORT_FAILMODE_CONTINUE		"continue"
@ -75,6 +87,7 @@ extern "C" {

 #define	FM_RESOURCE_REMOVED			"removed"
 #define	FM_RESOURCE_AUTOREPLACE			"autoreplace"
+#define	FM_RESOURCE_STATECHANGE			"statechange"

 #ifdef	__cplusplus
 }
--- a/module/zfs/include/sys/fm/protocol.h
+++ b/module/zfs/include/sys/fm/protocol.h
@ -20,8 +20,7 @@
 */

 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_FM_PROTOCOL_H
@ -47,6 +46,7 @@ extern "C" {
 /* FM event class values */
 #define	FM_EREPORT_CLASS		"ereport"
 #define	FM_FAULT_CLASS			"fault"
+#define	FM_DEFECT_CLASS			"defect"
 #define	FM_RSRC_CLASS			"resource"
 #define	FM_LIST_EVENT			"list"

@ -83,6 +83,7 @@ extern "C" {
 #define	FM_SUSPECT_FAULT_LIST		"fault-list"
 #define	FM_SUSPECT_FAULT_SZ		"fault-list-sz"
 #define	FM_SUSPECT_FAULT_STATUS		"fault-status"
+#define	FM_SUSPECT_INJECTED		"__injected"
 #define	FM_SUSPECT_MESSAGE		"message"
 #define	FM_SUSPECT_RETIRE		"retire"
 #define	FM_SUSPECT_RESPONSE		"response"
@ -122,6 +123,7 @@ extern "C" {
 #define	FM_RSRC_ASRU_REPAIRED		"repaired"
 #define	FM_RSRC_ASRU_REPLACED		"replaced"
 #define	FM_RSRC_ASRU_ACQUITTED		"acquitted"
+#define	FM_RSRC_ASRU_RESOLVED		"resolved"
 #define	FM_RSRC_ASRU_UNUSABLE		"unusable"
 #define	FM_RSRC_ASRU_EVENT		"event"

@ -170,6 +172,7 @@ extern "C" {

 /* FMRI authority-type member names */
 #define	FM_FMRI_AUTH_CHASSIS		"chassis-id"
+#define	FM_FMRI_AUTH_PRODUCT_SN		"product-sn"
 #define	FM_FMRI_AUTH_PRODUCT		"product-id"
 #define	FM_FMRI_AUTH_DOMAIN		"domain-id"
 #define	FM_FMRI_AUTH_SERVER		"server-id"
@ -243,6 +246,7 @@ extern "C" {

 /* dev scheme member names */
 #define	FM_FMRI_DEV_ID			"devid"
+#define	FM_FMRI_DEV_TGTPTLUN0		"target-port-l0id"
 #define	FM_FMRI_DEV_PATH		"device-path"

 /* pkg scheme member names */
@ -311,7 +315,7 @@ extern int i_fm_payload_set(nvlist_t *, const char *, va_list);
 extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
    int, ...);
 extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
-    const char *);
+    const char *, const char *);
 extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
 extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
    uint8_t *, const char *);
@ -320,6 +324,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
 extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
    const char *, const char *);
 extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+    nvlist_t *, int, ...);

 extern uint64_t fm_ena_increment(uint64_t);
 extern uint64_t fm_ena_generate(uint64_t, uchar_t);
--- a/module/zfs/include/sys/metaslab.h
+++ b/module/zfs/include/sys/metaslab.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef _SYS_METASLAB_H
@ -36,9 +35,6 @@
 extern "C" {
 #endif

-typedef struct metaslab_class metaslab_class_t;
-typedef struct metaslab_group metaslab_group_t;
-
 extern space_map_ops_t *zfs_metaslab_ops;

 extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
@ -46,6 +42,7 @@ extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
 extern void metaslab_fini(metaslab_t *msp);
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);

 #define	METASLAB_HINTBP_FAVOR	0x0
 #define	METASLAB_HINTBP_AVOID	0x1
@ -57,14 +54,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
    boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);

-extern metaslab_class_t *metaslab_class_create(space_map_ops_t *ops);
+extern metaslab_class_t *metaslab_class_create(spa_t *spa,
+    space_map_ops_t *ops);
 extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg);
-extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg);
+extern int metaslab_class_validate(metaslab_class_t *mc);
+
+extern void metaslab_class_space_update(metaslab_class_t *mc,
+    int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta, int64_t dspace_delta);
+extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
+extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);

 extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
    vdev_t *vd);
 extern void metaslab_group_destroy(metaslab_group_t *mg);
+extern void metaslab_group_activate(metaslab_group_t *mg);
+extern void metaslab_group_passivate(metaslab_group_t *mg);

 #ifdef	__cplusplus
 }
--- a/module/zfs/include/sys/metaslab_impl.h
+++ b/module/zfs/include/sys/metaslab_impl.h
@ -37,16 +37,23 @@ extern "C" {
 #endif

 struct metaslab_class {
+	spa_t			*mc_spa;
 	metaslab_group_t	*mc_rotor;
-	uint64_t		mc_allocated;
 	space_map_ops_t		*mc_ops;
+	uint64_t		mc_aliquot;
+	uint64_t		mc_alloc;	/* total allocated space */
+	uint64_t		mc_deferred;	/* total deferred frees */
+	uint64_t		mc_space;	/* total space (alloc + free) */
+	uint64_t		mc_dspace;	/* total deflated space */
 };

 struct metaslab_group {
 	kmutex_t		mg_lock;
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
+	uint64_t		mg_bonus_area;
 	int64_t			mg_bias;
+	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
 	metaslab_group_t	*mg_prev;
@ -66,7 +73,9 @@ struct metaslab {
 	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
 	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
 	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
+	space_map_t	ms_defermap[TXG_DEFER_SIZE]; /* deferred frees	*/
 	space_map_t	ms_map;		/* in-core free space map	*/
+	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
 	metaslab_group_t *ms_group;	/* metaslab group		*/
 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
--- a/module/zfs/include/sys/refcount.h
+++ b/module/zfs/include/sys/refcount.h
@ -19,15 +19,12 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef	_SYS_REFCOUNT_H
 #define	_SYS_REFCOUNT_H

-
-
 #include <sys/inttypes.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
@ -91,6 +88,11 @@ typedef struct refcount {
 	atomic_add_64_nv(&(rc)->rc_count, number)
 #define	refcount_remove_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, -number)
+#define	refcount_transfer(dst, src) { \
+	uint64_t __tmp = (src)->rc_count; \
+	atomic_add_64(&(src)->rc_count, -__tmp); \
+	atomic_add_64(&(dst)->rc_count, __tmp); \
+}

 #define	refcount_init()
 #define	refcount_fini()
--- a/module/zfs/include/sys/sa.h
+++ b/module/zfs/include/sys/sa.h
@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_SA_H
+#define	_SYS_SA_H
+
+#include <sys/dmu.h>
+
+/*
+ * Currently available byteswap functions.
+ * If it all possible new attributes should used
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+	SA_UINT64_ARRAY,
+	SA_UINT32_ARRAY,
+	SA_UINT16_ARRAY,
+	SA_UINT8_ARRAY,
+	SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t	sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+	char 			*sa_name;	/* attribute name */
+	uint16_t 		sa_length;
+	sa_bswap_type_t		sa_byteswap;	/* bswap functon enum */
+	sa_attr_type_t 		sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+    boolean_t, void *userptr);
+
+/*
+ * array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_BULK_ADD_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer
+ *
+ */
+typedef struct sa_bulk_attr {
+	void			*sa_data;
+	sa_data_locator_t	*sa_data_func;
+	uint16_t		sa_length;
+	sa_attr_type_t		sa_attr;
+	/* the following are private to the sa framework */
+	void 			*sa_addr;
+	uint16_t		sa_buftype;
+	uint16_t		sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * special macro for adding entries for bulk attr support
+ * bulk - sa_bulk_attr_t
+ * count - integer that will be incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data.
+ * data - pointer to data.
+ * len - length of data
+ */
+
+#define	SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+	b[idx].sa_attr = attr;\
+	b[idx].sa_data_func = func; \
+	b[idx].sa_data = data; \
+	b[idx++].sa_length = len; \
+}
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+	SA_HDL_SHARED,
+	SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+    sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+    sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+    uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+    uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void sa_update_user(sa_handle_t *, sa_handle_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+sa_attr_type_t *sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+    int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+    int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init();
+void sa_cache_fini();
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SA_H */
--- a/module/zfs/include/sys/sa_impl.h
+++ b/module/zfs/include/sys/sa_impl.h
@ -0,0 +1,288 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_SA_IMPL_H
+#define	_SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+	sa_attr_type_t	sa_attr;
+	uint8_t sa_registered;
+	uint16_t sa_length;
+	sa_bswap_type_t sa_byteswap;
+	char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64      56      48      40      32      24      16      8       0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |        unused         |      len      | bswap |   attr num    |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * layout information is stored as an array of attribute numbers
+ * The name of the attribute is the layout number (0, 1, 2, ...)
+ *
+ * 16       0
+ * +---- ---+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--- ----+
+ *  ......
+ *
+ */
+
+#define	ATTR_BSWAP(x)	BF32_GET(x, 16, 8)
+#define	ATTR_LENGTH(x)	BF32_GET(x, 24, 16)
+#define	ATTR_NUM(x)	BF32_GET(x, 0, 16)
+#define	ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+	BF64_SET(x, 24, 16, length); \
+	BF64_SET(x, 16, 8, bswap); \
+	BF64_SET(x, 0, 16, attr); \
+}
+
+#define	TOC_OFF(x)		BF32_GET(x, 0, 23)
+#define	TOC_ATTR_PRESENT(x)	BF32_GET(x, 31, 1)
+#define	TOC_LEN_IDX(x)		BF32_GET(x, 24, 4)
+#define	TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+	BF32_SET(x, 31, 1, 1); \
+	BF32_SET(x, 24, 7, len_idx); \
+	BF32_SET(x, 0, 24, offset); \
+}
+
+#define	SA_LAYOUTS	"LAYOUTS"
+#define	SA_REGISTRY	"REGISTRY"
+
+/*
+ * Each unique layout will have their own table
+ * sa_lot (layout_table)
+ */
+typedef struct sa_lot {
+	avl_node_t lot_num_node;
+	avl_node_t lot_hash_node;
+	uint64_t lot_num;
+	uint64_t lot_hash;
+	sa_attr_type_t *lot_attrs;	/* array of attr #'s */
+	uint32_t lot_var_sizes;	/* how many aren't fixed size */
+	uint32_t lot_attr_count;	/* total attr count */
+	list_t 	lot_idx_tab;	/* should be only a couple of entries */
+	int	lot_instance;	/* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+	list_node_t	sa_next;
+	sa_lot_t	*sa_layout;
+	uint16_t	*sa_variable_lengths;
+	refcount_t	sa_refcount;
+	uint32_t	*sa_idx_tab;	/* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different table of
+ * contents if you had a several variable sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * one is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read.  The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout.  Both
+ * of these tree's are interconnected.  The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+	kmutex_t 	sa_lock;
+	boolean_t	sa_need_attr_registration;
+	boolean_t	sa_force_spill;
+	uint64_t	sa_master_obj;
+	uint64_t	sa_reg_attr_obj;
+	uint64_t	sa_layout_attr_obj;
+	int		sa_num_attrs;
+	sa_attr_table_t *sa_attr_table;	 /* private attr table */
+	sa_update_cb_t	*sa_update_cb;
+	avl_tree_t	sa_layout_num_tree;  /* keyed by layout number */
+	avl_tree_t	sa_layout_hash_tree; /* keyed by layout hash value */
+	int		sa_user_table_sz;
+	sa_attr_type_t	*sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * header for all bonus and spill buffers.
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable sized
+ * attribues which are determined by the "layout number"
+ */
+
+#define	SA_MAGIC	0x2F505A  /* ZFS SA */
+typedef struct sa_hdr_phys {
+	uint32_t sa_magic;
+	uint16_t sa_layout_info;  /* Encoded with hdrsize and layout number */
+	uint16_t sa_lengths[1];	/* optional sizes for variable length attrs */
+	/* ... Data follows the lengths.  */
+} sa_hdr_phys_t;
+
+/*
+ * sa_hdr_phys -> sa_layout_info
+ *
+ * 16      10       0
+ * +--------+-------+
+ * | hdrsz  |layout |
+ * +--------+-------+
+ *
+ * Bits 0-10 are the layout number
+ * Bits 11-16 are the size of the header.
+ * The hdrsize is the number * 8
+ *
+ * For example.
+ * hdrsz of 1 ==> 8 byte header
+ *          2 ==> 16 byte header
+ *
+ */
+
+#define	SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define	SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0)
+#define	SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+	BF32_SET_SB(x, 10, 6, 3, 0, size); \
+	BF32_SET(x, 0, 10, num); \
+}
+
+typedef enum sa_buf_type {
+	SA_BONUS = 1,
+	SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+	SA_LOOKUP,
+	SA_UPDATE,
+	SA_ADD,
+	SA_REPLACE,
+	SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+	kmutex_t	sa_lock;
+	dmu_buf_t	*sa_bonus;
+	dmu_buf_t	*sa_spill;
+	objset_t	*sa_os;
+	void 		*sa_userp;
+	sa_idx_tab_t	*sa_bonus_tab;	 /* idx of bonus */
+	sa_idx_tab_t	*sa_spill_tab; /* only present if spill activated */
+};
+
+#define	SA_GET_DB(hdl, type)	\
+	(dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define	SA_GET_HDR(hdl, type) \
+	((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+	type))->db.db_data))
+
+#define	SA_IDX_TAB_GET(hdl, type) \
+	(type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define	IS_SA_BONUSTYPE(a)	\
+	((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define	SA_BONUSTYPE_FROM_DB(db) \
+	(((dmu_buf_impl_t *)db)->db_dnode->dn_bonustype)
+
+#define	SA_BLKPTR_SPACE	(DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define	SA_LAYOUT_NUM(x, type) \
+	((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+	((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define	SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define	SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+	hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+	SA_REGISTERED_LEN(sa, attr))
+
+#define	SA_SET_HDR(hdr, num, size) \
+	{ \
+		hdr->sa_magic = SA_MAGIC; \
+		SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+	}
+
+#define	SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+	{ \
+		bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+		bulk.sa_buftype = type; \
+		bulk.sa_addr = \
+		    (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+		    (uintptr_t)hdr); \
+}
+
+#define	SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+	(SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+	(tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+	sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+    uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+    uint16_t *, sa_hdr_phys_t *);
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SA_IMPL_H */
--- a/module/zfs/include/sys/spa.h
+++ b/module/zfs/include/sys/spa.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef _SYS_SPA_H
@ -43,8 +42,13 @@ extern "C" {
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
+typedef struct metaslab_group metaslab_group_t;
+typedef struct metaslab_class metaslab_class_t;
+typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
+typedef struct ddt ddt_t;
+typedef struct ddt_entry ddt_entry_t;
 struct dsl_pool;

 /*
@ -134,15 +138,15 @@ typedef struct zio_cksum {
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 * 5	|G|			 offset3				|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 6	|E| lvl | type	| cksum | comp	|     PSIZE	|     LSIZE	|
+ * 6	|BDX|lvl| type	| cksum | comp	|     PSIZE	|     LSIZE	|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 * 7	|			padding					|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 * 8	|			padding					|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 9	|			padding					|
+ * 9	|			physical birth txg			|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * a	|			birth txg				|
+ * a	|			logical birth txg			|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 * b	|			fill count				|
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
@ -166,25 +170,29 @@ typedef struct zio_cksum {
 * cksum	checksum function
 * comp		compression function
 * G		gang block indicator
- * E		endianness
- * type		DMU object type
+ * B		byteorder (endianness)
+ * D		dedup
+ * X		unused
 * lvl		level of indirection
- * birth txg	transaction group in which the block was born
+ * type		DMU object type
+ * phys birth	txg of block allocation; zero if same as logical birth txg
+ * log. birth	transaction group in which the block was logically born
 * fill count	number of non-zero blocks under this bp
 * checksum[4]	256-bit checksum of the data this bp describes
 */
+#define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
+#define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
+
 typedef struct blkptr {
-	dva_t		blk_dva[3];	/* 128-bit Data Virtual Address	*/
+	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
-	uint64_t	blk_pad[3];	/* Extra space for the future	*/
+	uint64_t	blk_pad[2];	/* Extra space for the future	    */
+	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
 	uint64_t	blk_birth;	/* transaction group at birth	    */
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;

-#define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
-#define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
-
 /*
 * Macros to get and set fields in a bp or DVA.
 */
@ -208,8 +216,7 @@ typedef struct blkptr {
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)

 #define	BP_GET_LSIZE(bp)	\
-	(BP_IS_HOLE(bp) ? 0 : \
-	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1))
+	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
 #define	BP_SET_LSIZE(bp, x)	\
 	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)

@ -230,16 +237,31 @@ typedef struct blkptr {
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)

+#define	BP_GET_PROP_BIT_61(bp)		BF64_GET((bp)->blk_prop, 61, 1)
+#define	BP_SET_PROP_BIT_61(bp, x)	BF64_SET((bp)->blk_prop, 61, 1, x)
+
+#define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
+#define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
+
 #define	BP_GET_BYTEORDER(bp)		(0 - BF64_GET((bp)->blk_prop, 63, 1))
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)

+#define	BP_PHYSICAL_BIRTH(bp)		\
+	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+
+#define	BP_SET_BIRTH(bp, logical, physical)	\
+{						\
+	(bp)->blk_birth = (logical);		\
+	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
+}
+
 #define	BP_GET_ASIZE(bp)	\
 	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 		DVA_GET_ASIZE(&(bp)->blk_dva[2]))

 #define	BP_GET_UCSIZE(bp) \
 	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
-	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))

 #define	BP_GET_NDVAS(bp)	\
 	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@ -255,6 +277,12 @@ typedef struct blkptr {
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])

+#define	BP_EQUAL(bp1, bp2)	\
+	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
+	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
+
 #define	ZIO_CHECKSUM_EQUAL(zc1, zc2) \
 	(0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \
 	((zc1).zc_word[1] - (zc2).zc_word[1]) | \
@ -274,7 +302,10 @@ typedef struct blkptr {
 #define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
-#define	BP_IS_OLDER(bp, txg)	(!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+
+/* BP_IS_RAIDZ(bp) assumes no block compression */
+#define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
+				BP_GET_PSIZE(bp))

 #define	BP_ZERO(bp)				\
 {						\
@ -287,14 +318,12 @@ typedef struct blkptr {
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
-	(bp)->blk_pad[2] = 0;			\
+	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }

-#define	BLK_FILL_ALREADY_FREED	(-1ULL)
-
 /*
 * Note: the byteorder is either 0 or -1, both of which are palindromes.
 * This simplifies the endianness handling a bit.
@ -309,17 +338,82 @@ typedef struct blkptr {

 #define	BP_SPRINTF_LEN	320

+/*
+ * This macro allows code sharing between zfs, libzpool, and mdb.
+ * 'func' is either snprintf() or mdb_snprintf().
+ * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
+ */
+#define	SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress)	\
+{									\
+	static const char *copyname[] =					\
+	    { "zero", "single", "double", "triple" };			\
+	int size = BP_SPRINTF_LEN;					\
+	int len = 0;							\
+	int copies = 0;							\
+	int d;								\
+									\
+	if (bp == NULL) {						\
+		len = func(buf + len, size - len, "<NULL>");		\
+	} else if (BP_IS_HOLE(bp)) {					\
+		len = func(buf + len, size - len, "<hole>");		\
+	} else {							\
+		for (d = 0; d < BP_GET_NDVAS(bp); d++) {		\
+			const dva_t *dva = &bp->blk_dva[d];		\
+			if (DVA_IS_VALID(dva))				\
+				copies++;				\
+			len += func(buf + len, size - len,		\
+			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
+			    (u_longlong_t)DVA_GET_VDEV(dva),		\
+			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
+			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
+			    ws);					\
+		}							\
+		if (BP_IS_GANG(bp) &&					\
+		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
+		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
+			copies--;					\
+		len += func(buf + len, size - len,			\
+		    "[L%llu %s] %s %s %s %s %s %s%c"			\
+		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
+		    "cksum=%llx:%llx:%llx:%llx",			\
+		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+		    type,						\
+		    checksum,						\
+		    compress,						\
+		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
+		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
+		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
+		    copyname[copies],					\
+		    ws,							\
+		    (u_longlong_t)BP_GET_LSIZE(bp),			\
+		    (u_longlong_t)BP_GET_PSIZE(bp),			\
+		    (u_longlong_t)bp->blk_birth,			\
+		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
+		    (u_longlong_t)bp->blk_fill,				\
+		    ws,							\
+		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
+		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
+		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
+		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
+	}								\
+	ASSERT(len < size);						\
+}
+
 #include <sys/dmu.h>

 #define	BP_GET_BUFC_TYPE(bp)						\
 	(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
 	ARC_BUFC_METADATA : ARC_BUFC_DATA);
-/*
- * Routines found in spa.c
- */
+
+typedef enum spa_import_type {
+	SPA_IMPORT_EXISTING,
+	SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;

 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
+extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
+    nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config,
    char *altroot, size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
@ -338,6 +432,8 @@ extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);

 #define	SPA_ASYNC_CONFIG_UPDATE	0x01
 #define	SPA_ASYNC_REMOVE	0x02
@ -345,6 +441,14 @@ extern void spa_inject_delref(spa_t *spa);
 #define	SPA_ASYNC_RESILVER_DONE	0x08
 #define	SPA_ASYNC_RESILVER	0x10
 #define	SPA_ASYNC_AUTOEXPAND	0x20
+#define	SPA_ASYNC_REMOVE_DONE	0x40
+#define	SPA_ASYNC_REMOVE_STOP	0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define	SPA_REMOVE_UNSPARE	0x01
+#define	SPA_REMOVE_DONE		0x02

 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@ -353,8 +457,11 @@ extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
    int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
+extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+    nvlist_t *props, boolean_t exp);

 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
@ -368,15 +475,23 @@ extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
-extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);

-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);

 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);

+/*
+ * DEFERRED_FREE must be large enough that regular blocks are not
+ * deferred.  XXX so can't we change it back to 1?
+ */
+#define	SYNC_PASS_DEFERRED_FREE	2	/* defer frees after this pass */
+#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
+#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
+
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;

@ -394,7 +509,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
    int getstats);
 extern void spa_config_update(spa_t *spa, int what);
-extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);

 /*
 * Miscellaneous SPA routines in spa_misc.c
@ -402,7 +516,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot);

 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
-extern spa_t *spa_add(const char *name, const char *altroot);
+extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);

@ -411,6 +525,7 @@ extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);

+#define	SCL_NONE	0x00
 #define	SCL_CONFIG	0x01
 #define	SCL_STATE	0x02
 #define	SCL_L2ARC	0x04		/* hack until L2ARC 2.0 */
@ -430,12 +545,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);

 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_config_enter(spa_t *spa);
+extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
+    int error, char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);

 /* Pool vdev state change lock */
-extern void spa_vdev_state_enter(spa_t *spa);
+extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);

+/* Log state */
+typedef enum spa_log_state {
+	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
+	SPA_LOG_MISSING,	/* missing log(s) */
+	SPA_LOG_CLEAR,		/* clear the log(s) */
+	SPA_LOG_GOOD,		/* log(s) are good */
+} spa_log_state_t;
+
+extern spa_log_state_t spa_get_log_state(spa_t *spa);
+extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
+extern int spa_offline_log(spa_t *spa);
+
+/* Log claim callback */
+extern void spa_claim_notify(zio_t *zio);
+
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
@ -447,18 +580,26 @@ extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
+extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
+extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
-extern uint64_t spa_get_alloc(spa_t *spa);
-extern uint64_t spa_get_space(spa_t *spa);
-extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_get_dspace(spa_t *spa);
+extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_deflate(spa_t *spa);
+extern metaslab_class_t *spa_normal_class(spa_t *spa);
+extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
+extern uint64_t spa_bootfs(spa_t *spa);
+extern uint64_t spa_delegation(spa_t *spa);
+extern objset_t *spa_meta_objset(spa_t *spa);

 /* Miscellaneous support routines */
 extern int spa_rename(const char *oldname, const char *newname);
@ -466,18 +607,24 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
-extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp);
+extern uint64_t spa_generate_guid(spa_t *spa);
+extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
    boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
-extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp);
+extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
+extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
+extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
+extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to);
+
 extern int spa_mode(spa_t *spa);
+extern uint64_t strtonum(const char *str, char **nptr);

 /* history logging */
 typedef enum history_log_type {
@ -487,10 +634,11 @@ typedef enum history_log_type {
 } history_log_type_t;

 typedef struct history_arg {
-	const char *ha_history_str;
+	char *ha_history_str;
 	history_log_type_t ha_log_type;
 	history_internal_events_t ha_event;
-	char ha_zone[MAXPATHLEN];
+	char *ha_zone;
+	uid_t ha_uid;
 } history_arg_t;

 extern char *spa_his_ievent_table[];
@ -500,17 +648,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
    char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf,
    history_log_type_t what);
-extern void spa_history_internal_log(history_internal_events_t event,
-    spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
+extern void spa_history_log_internal(history_internal_events_t event,
+    spa_t *spa, dmu_tx_t *tx, const char *fmt, ...);
 extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);

 /* error handling */
 struct zbookmark;
-struct zio;
-extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void spa_log_error(spa_t *spa, zio_t *zio);
 extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
-    struct zio *zio, uint64_t stateoroffset, uint64_t length);
+    zio_t *zio, uint64_t stateoroffset, uint64_t length);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
+extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_get_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
@ -541,7 +689,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { 			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));		\
+	sprintf_blkptr(__blkbuf, (bp));				\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
--- a/module/zfs/include/sys/spa_impl.h
+++ b/module/zfs/include/sys/spa_impl.h
@ -19,8 +19,7 @@
 * CDDL HEADER END
 */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

 #ifndef _SYS_SPA_IMPL_H
@ -36,6 +35,7 @@
 #include <sys/avl.h>
 #include <sys/refcount.h>
 #include <sys/bplist.h>
+#include <sys/bpobj.h>

 #ifdef	__cplusplus
 extern "C" {
@ -78,19 +78,33 @@ typedef struct spa_config_dirent {
 	char		*scd_path;
 } spa_config_dirent_t;

-typedef enum spa_log_state {
-	SPA_LOG_UNKNOWN = 0,	/* unknown log state */
-	SPA_LOG_MISSING,	/* missing log(s) */
-	SPA_LOG_CLEAR,		/* clear the log(s) */
-	SPA_LOG_GOOD,		/* log(s) are good */
-} spa_log_state_t;
-
 enum zio_taskq_type {
 	ZIO_TASKQ_ISSUE = 0,
+	ZIO_TASKQ_ISSUE_HIGH,
 	ZIO_TASKQ_INTERRUPT,
+	ZIO_TASKQ_INTERRUPT_HIGH,
 	ZIO_TASKQ_TYPES
 };

+/*
+ * State machine for the zpool-pooname process.  The states transitions
+ * are done as follows:
+ *
+ *	From		   To			Routine
+ *	PROC_NONE	-> PROC_CREATED		spa_activate()
+ *	PROC_CREATED	-> PROC_ACTIVE		spa_thread()
+ *	PROC_ACTIVE	-> PROC_DEACTIVATE	spa_deactivate()
+ *	PROC_DEACTIVATE	-> PROC_GONE		spa_thread()
+ *	PROC_GONE	-> PROC_NONE		spa_deactivate()
+ */
+typedef enum spa_proc_state {
+	SPA_PROC_NONE,		/* spa_proc = &p0, no process created */
+	SPA_PROC_CREATED,	/* spa_activate() has proc, is waiting */
+	SPA_PROC_ACTIVE,	/* taskqs created, spa_proc set */
+	SPA_PROC_DEACTIVATE,	/* spa_deactivate() requests process exit */
+	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
@ -99,6 +113,7 @@ struct spa {
 	avl_node_t	spa_avl;		/* node in spa_namespace_avl */
 	nvlist_t	*spa_config;		/* last synced config */
 	nvlist_t	*spa_config_syncing;	/* currently syncing config */
+	nvlist_t	*spa_config_splitting;	/* config for splitting */
 	uint64_t	spa_config_txg;		/* txg of last config change */
 	int		spa_sync_pass;		/* iterate-to-convergence */
 	pool_state_t	spa_state;		/* pool state */
@ -113,6 +128,8 @@ struct spa {
 	uint64_t	spa_first_txg;		/* first txg after spa_open() */
 	uint64_t	spa_final_txg;		/* txg of export/destroy */
 	uint64_t	spa_freeze_txg;		/* freeze pool at this txg */
+	uint64_t	spa_load_max_txg;	/* best initial ub_txg */
+	uint64_t	spa_claim_max_txg;	/* highest claimed birth txg */
 	objset_t	*spa_meta_objset;	/* copy of dp->dp_meta_objset */
 	txg_list_t	spa_vdev_txg_list;	/* per-txg dirty vdev list */
 	vdev_t		*spa_root_vdev;		/* top-level vdev container */
@ -122,21 +139,24 @@ struct spa {
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
 	uint64_t	spa_config_object;	/* MOS object for pool config */
+	uint64_t	spa_config_generation;	/* config generation number */
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
-	uint64_t	spa_sync_bplist_obj;	/* object for deferred frees */
-	bplist_t	spa_sync_bplist;	/* deferred-free bplist */
+	bpobj_t		spa_deferred_bpobj;	/* deferred-free bplist */
+	bplist_t	spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
+	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
 	uint64_t	spa_scrub_maxinflight;	/* max in-flight scrub I/Os */
-	uint64_t	spa_scrub_errors;	/* scrub I/O error count */
 	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
 	uint8_t		spa_scrub_active;	/* active or suspended? */
 	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
 	uint8_t		spa_scrub_finished;	/* indicator to rotate logs */
 	uint8_t		spa_scrub_started;	/* started since last boot */
 	uint8_t		spa_scrub_reopen;	/* scrub doing vdev_reopen */
+	uint64_t	spa_scan_pass_start;	/* start time per pass/reboot */
+	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
 	int		spa_async_suspended;	/* async tasks suspended */
@ -144,7 +164,14 @@ struct spa {
 	uint16_t	spa_async_tasks;	/* async task mask */
 	char		*spa_root;		/* alternate root directory */
 	uint64_t	spa_ena;		/* spa-wide ereport ENA */
-	boolean_t	spa_last_open_failed;	/* true if last open faled */
+	int		spa_last_open_failed;	/* error if last open failed */
+	uint64_t	spa_last_ubsync_txg;	/* "best" uberblock txg */
+	uint64_t	spa_last_ubsync_txg_ts;	/* timestamp from that ub */
+	uint64_t	spa_load_txg;		/* ub txg that loaded */
+	uint64_t	spa_load_txg_ts;	/* timestamp from that ub */
+	uint64_t	spa_load_meta_errors;	/* verify metadata err count */
+	uint64_t	spa_load_data_errors;	/* verify data err count */
+	uint64_t	spa_verify_min_txg;	/* start txg of verify scrub */
 	kmutex_t	spa_errlog_lock;	/* error log lock */
 	uint64_t	spa_errlog_last;	/* last error log object */
 	uint64_t	spa_errlog_scrub;	/* scrub error log object */
@ -166,11 +193,27 @@ struct spa {
 	kmutex_t	spa_suspend_lock;	/* protects suspend_zio_root */
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
+	uint8_t		spa_claiming;		/* pool is doing zil_claim() */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
 	int		spa_mode;		/* FREAD | FWRITE */
 	spa_log_state_t spa_log_state;		/* log state */
 	uint64_t	spa_autoexpand;		/* lun expansion on/off */
+	ddt_t		*spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
+	uint64_t	spa_ddt_stat_object;	/* DDT statistics */
+	uint64_t	spa_dedup_ditto;	/* dedup ditto threshold */
+	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
+	uint64_t	spa_dspace;		/* dspace in normal class */
+	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
+	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
+	kcondvar_t	spa_proc_cv;		/* spa_proc_state transitions */
+	spa_proc_state_t spa_proc_state;	/* see definition */
+	struct proc	*spa_proc;		/* "zpool-poolname" process */
+	uint64_t	spa_did;		/* if procp != p0, did of t1 */
+	boolean_t	spa_autoreplace;	/* autoreplace set in open */
+	int		spa_vdev_locks;		/* locks grabbed */
+	uint64_t	spa_creation_version;	/* version at pool creation */
+	uint64_t	spa_prev_software_version;
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
@ -183,12 +226,6 @@ struct spa {

 extern char *spa_config_path;

-#define	BOOTFS_COMPRESS_VALID(compress) \
-	((compress) == ZIO_COMPRESS_LZJB || \
-	((compress) == ZIO_COMPRESS_ON && \
-	ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
-	(compress) == ZIO_COMPRESS_OFF)
-
 #ifdef	__cplusplus
 }
 #endif
--- a/module/zfs/include/sys/space_map.h
+++ b/module/zfs/include/sys/space_map.h
@ -77,6 +77,7 @@ struct space_map_ops {
 	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
 	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
 	uint64_t (*smop_max)(space_map_t *sm);
+	boolean_t (*smop_fragmented)(space_map_t *sm);
 };

 /*
--- a/Show More
+++ b/Show More