OpenZFS 2605, 6980, 6902

2605 want to resume interrupted zfs send
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Reviewed by: Arne Jansen <sensille@gmx.net>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/2605
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/9c3fd12

6980 6902 causes zfs send to break due to 32-bit/64-bit struct mismatch
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Ported by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/6980
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/ea4a67f

Porting notes:
- All rsend and snapshot tests enabled and updated for Linux.
- Fix misuse of input argument in traverse_visitbp().
- Fix ISO C90 warnings and errors.
- Fix gcc 'missing braces around initializer' warning for
  'struct send_thread_arg to_arg ='.
- Replace the 4-argument fletcher_4_native() with the 3-argument
  version; this change was made in OpenZFS 4185, which has not yet
  been ported.
- Parts of the 'zfs receive' and 'zfs send' sections were rewritten
  and reordered to approximate upstream.
- Fix mktree xattr creation, 'user.' prefix required.
- Minor fixes to newly enabled test cases.
- Long holds for volumes allowed during receive for minor registration.
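
Editor's note (not part of the original commit message): at the command line the new
functionality is reached with 'zfs receive -s' on the receiving side and
'zfs send -t <receive_resume_token>' on the sending side. The C sketch below is
illustrative only; the helper names resume_send_side() and resume_recv_side() are
invented for the example, while the lzc_send_resume() and lzc_receive_resumable()
signatures match the declarations this change adds to libzfs_core.h. Decoding the
receiver's receive_resume_token into resumeobj/resumeoff is handled by the new
zfs_send_resume_token_to_nvlist() in libzfs.

/*
 * Illustrative sketch only -- not part of this commit.  Shows the call
 * shapes of the resumable libzfs_core interfaces added below.
 */
#include <libzfs_core.h>
#include <libnvpair.h>

/* Sender side: emit a stream that restarts at <resumeobj, resumeoff>. */
static int
resume_send_side(const char *tosnap, const char *fromsnap, int outfd,
    uint64_t resumeobj, uint64_t resumeoff)
{
	int err;

	if ((err = libzfs_core_init()) != 0)
		return (err);
	/*
	 * 0 means no extra lzc_send_flags; LZC_SEND_FLAG_EMBED_DATA or
	 * LZC_SEND_FLAG_LARGE_BLOCK could be OR'd in to match the
	 * options used for the original, interrupted send.
	 */
	err = lzc_send_resume(tosnap, fromsnap, outfd, 0,
	    resumeobj, resumeoff);
	libzfs_core_fini();
	return (err);
}

/* Receiver side: like lzc_receive(), but partial state is kept on disk. */
static int
resume_recv_side(const char *snapname, int infd)
{
	int err;

	if ((err = libzfs_core_init()) != 0)
		return (err);
	/* NULL props/origin, force == B_FALSE; ECKSUM leaves resumable state. */
	err = lzc_receive_resumable(snapname, NULL, NULL, B_FALSE, infd);
	libzfs_core_fini();
	return (err);
}

In practice the two sides run in separate processes connected by a pipe or network
stream, exactly as the zfs(8) send/receive commands do.
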
This commit is contained in:
Matthew Ahrens 2016-01-06 22:22:48 +01:00 committed by Brian Behlendorf
parent 669cf0ab29
commit 47dfff3b86
40 changed files with 1483 additions and 337 deletions

View File

@ -248,10 +248,11 @@ get_usage(zfs_help_t idx)
case HELP_PROMOTE:
return (gettext("\tpromote <clone-filesystem>\n"));
case HELP_RECEIVE:
return (gettext("\treceive [-vnFu] <filesystem|volume|"
return (gettext("\treceive [-vnsFu] <filesystem|volume|"
"snapshot>\n"
"\treceive [-vnFu] [-o origin=<snapshot>] [-d | -e] "
"<filesystem>\n"));
"\treceive [-vnsFu] [-o origin=<snapshot>] [-d | -e] "
"<filesystem>\n"
"\treceive -A <filesystem|volume>\n"));
case HELP_RENAME:
return (gettext("\trename [-f] <filesystem|volume|snapshot> "
"<filesystem|volume|snapshot>\n"
@ -263,7 +264,8 @@ get_usage(zfs_help_t idx)
return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"));
"<filesystem|volume|snapshot>\n"
"\tsend [-nvPe] -t <receive_resume_token>\n"));
case HELP_SET:
return (gettext("\tset <property=value> ... "
"<filesystem|volume|snapshot> ...\n"));
@ -3707,6 +3709,7 @@ zfs_do_send(int argc, char **argv)
{
char *fromname = NULL;
char *toname = NULL;
char *resume_token = NULL;
char *cp;
zfs_handle_t *zhp;
sendflags_t flags = { 0 };
@ -3715,7 +3718,7 @@ zfs_do_send(int argc, char **argv)
boolean_t extraverbose = B_FALSE;
/* check options */
while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
switch (c) {
case 'i':
if (fromname)
@ -3756,6 +3759,9 @@ zfs_do_send(int argc, char **argv)
case 'e':
flags.embed_data = B_TRUE;
break;
case 't':
resume_token = optarg;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@ -3771,14 +3777,28 @@ zfs_do_send(int argc, char **argv)
argc -= optind;
argv += optind;
/* check number of arguments */
if (argc < 1) {
(void) fprintf(stderr, gettext("missing snapshot argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
if (resume_token != NULL) {
if (fromname != NULL || flags.replicate || flags.props ||
flags.dedup) {
(void) fprintf(stderr,
gettext("invalid flags combined with -t\n"));
usage(B_FALSE);
}
if (argc != 0) {
(void) fprintf(stderr, gettext("no additional "
"arguments are permitted with -t\n"));
usage(B_FALSE);
}
} else {
if (argc < 1) {
(void) fprintf(stderr,
gettext("missing snapshot argument\n"));
usage(B_FALSE);
}
if (argc > 1) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
}
if (!flags.dryrun && isatty(STDOUT_FILENO)) {
@ -3788,6 +3808,11 @@ zfs_do_send(int argc, char **argv)
return (1);
}
if (resume_token != NULL) {
return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
resume_token));
}
/*
* Special case sending a filesystem, or from a bookmark.
*/
@ -3893,8 +3918,6 @@ zfs_do_send(int argc, char **argv)
}
/*
* zfs receive [-vnFu] [-d | -e] <fs@snap>
*
* Restore a backup stream from stdin.
*/
static int
@ -3902,6 +3925,8 @@ zfs_do_receive(int argc, char **argv)
{
int c, err;
recvflags_t flags = { 0 };
boolean_t abort_resumable = B_FALSE;
nvlist_t *props;
nvpair_t *nvp = NULL;
@ -3909,7 +3934,7 @@ zfs_do_receive(int argc, char **argv)
nomem();
/* check options */
while ((c = getopt(argc, argv, ":o:denuvF")) != -1) {
while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) {
switch (c) {
case 'o':
if (parseprop(props, optarg) != 0)
@ -3931,9 +3956,15 @@ zfs_do_receive(int argc, char **argv)
case 'v':
flags.verbose = B_TRUE;
break;
case 's':
flags.resumable = B_TRUE;
break;
case 'F':
flags.force = B_TRUE;
break;
case 'A':
abort_resumable = B_TRUE;
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@ -3966,6 +3997,44 @@ zfs_do_receive(int argc, char **argv)
}
}
if (abort_resumable) {
if (flags.isprefix || flags.istail || flags.dryrun ||
flags.resumable || flags.nomount) {
(void) fprintf(stderr, gettext("invalid option"));
usage(B_FALSE);
}
char namebuf[ZFS_MAXNAMELEN];
(void) snprintf(namebuf, sizeof (namebuf),
"%s/%%recv", argv[0]);
if (zfs_dataset_exists(g_zfs, namebuf,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
zfs_handle_t *zhp = zfs_open(g_zfs,
namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
return (1);
err = zfs_destroy(zhp, B_FALSE);
} else {
zfs_handle_t *zhp = zfs_open(g_zfs,
argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
usage(B_FALSE);
if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
(void) fprintf(stderr,
gettext("'%s' does not have any "
"resumable receive state to abort\n"),
argv[0]);
return (1);
}
err = zfs_destroy(zhp, B_FALSE);
}
return (err != 0);
}
if (isatty(STDIN_FILENO)) {
(void) fprintf(stderr,
gettext("Error: Backup stream can not be read "
@ -3973,7 +4042,6 @@ zfs_do_receive(int argc, char **argv)
"You must redirect standard input.\n"));
return (1);
}
err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
return (err != 0);
@ -5803,6 +5871,24 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
return (0);
}
/*
* If this filesystem is inconsistent and has a receive resume
* token, we can not mount it.
*/
if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
if (!explicit)
return (0);
(void) fprintf(stderr, gettext("cannot %s '%s': "
"Contains partially-completed state from "
"\"zfs receive -r\", which can be resumed with "
"\"zfs send -t\"\n"),
cmdname, zfs_get_name(zhp));
return (1);
}
/*
* At this point, we have verified that the mountpoint and/or
* shareopts are appropriate for auto management. If the

View File

@ -127,7 +127,7 @@ read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
(longlong_t)saved_cksum.zc_word[1],
(longlong_t)saved_cksum.zc_word[2],
(longlong_t)saved_cksum.zc_word[3]);
exit(1);
return (0);
}
return (sizeof (*drr));
}
@ -347,8 +347,7 @@ main(int argc, char *argv[])
if (verbose)
(void) printf("\n");
if ((DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM) && drr->drr_payloadlen != 0) {
if (drr->drr_payloadlen != 0) {
nvlist_t *nv;
int sz = drr->drr_payloadlen;

View File

@ -67,6 +67,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_COMMON], [
AC_PATH_TOOL(SHUF, shuf, "")
AC_PATH_TOOL(SLEEP, sleep, "")
AC_PATH_TOOL(SORT, sort, "")
AC_PATH_TOOL(STAT, stat, "")
AC_PATH_TOOL(STRINGS, strings, "")
AC_PATH_TOOL(SU, su, "")
AC_PATH_TOOL(SUM, sum, "")
@ -75,6 +76,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_COMMON], [
AC_PATH_TOOL(TAR, tar, "")
AC_PATH_TOOL(TOUCH, touch, "")
AC_PATH_TOOL(TR, tr, "")
AC_PATH_TOOL(TRUNCATE, truncate, "")
AC_PATH_TOOL(TRUE, true, "")
AC_PATH_TOOL(UMASK, umask, "")
AC_PATH_TOOL(UMOUNT, umount, "")
@ -103,7 +105,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_LINUX], [
AC_PATH_TOOL(SHARE, exportfs, "")
AC_PATH_TOOL(SWAP, swapon, "")
AC_PATH_TOOL(SWAPADD, swapon, "")
AC_PATH_TOOL(TRUNCATE, truncate, "")
AC_PATH_TOOL(UDEVADM, udevadm, "")
AC_PATH_TOOL(UFSDUMP, dump, "")
AC_PATH_TOOL(UFSRESTORE, restore, "")

View File

@ -640,6 +640,10 @@ typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
extern int zfs_send(zfs_handle_t *, const char *, const char *,
sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd,
const char *);
extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl,
const char *token);
extern int zfs_promote(zfs_handle_t *);
extern int zfs_hold(zfs_handle_t *, const char *, const char *,
@ -680,6 +684,12 @@ typedef struct recvflags {
/* set "canmount=off" on all modified filesystems */
boolean_t canmountoff;
/*
* Mark the file systems as "resumable" and do not destroy them if the
* receive is interrupted
*/
boolean_t resumable;
/* byteswap flag is used internally; callers need not specify */
boolean_t byteswap;

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _LIBZFS_CORE_H
@ -58,7 +58,11 @@ enum lzc_send_flags {
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_send_resume(const char *, const char *, int,
enum lzc_send_flags, uint64_t, uint64_t);
int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
int lzc_receive_resumable(const char *, nvlist_t *, const char *,
boolean_t, int);
int lzc_send_space(const char *, const char *, uint64_t *);
boolean_t lzc_exists(const char *);

View File

@ -24,7 +24,7 @@
*/
/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@ -272,6 +272,8 @@ typedef struct dmu_sendarg {
uint64_t dsa_featureflags;
uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset;
uint64_t dsa_resume_object;
uint64_t dsa_resume_offset;
} dmu_sendarg_t;
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);

View File

@ -36,10 +36,13 @@ struct vnode;
struct dsl_dataset;
struct drr_begin;
struct avl_tree;
struct dmu_replay_record;
int dmu_send(const char *tosnap, const char *fromsnap,
boolean_t embedok, boolean_t large_block_ok,
int outfd, struct vnode *vp, offset_t *off);
extern const char *recv_clone_name;
int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
struct vnode *vp, offset_t *off);
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
@ -50,12 +53,14 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
struct dmu_replay_record *drc_drr_begin;
struct drr_begin *drc_drrb;
const char *drc_tofs;
const char *drc_tosnap;
boolean_t drc_newfs;
boolean_t drc_byteswap;
boolean_t drc_force;
boolean_t drc_resumable;
struct avl_tree *drc_guid_to_ds_map;
zio_cksum_t drc_cksum;
uint64_t drc_newsnapobj;
@ -63,8 +68,9 @@ typedef struct dmu_recv_cookie {
cred_t *drc_cred;
} dmu_recv_cookie_t;
int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
boolean_t force, char *origin, dmu_recv_cookie_t *drc);
int dmu_recv_begin(char *tofs, char *tosnap,
struct dmu_replay_record *drr_begin,
boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
int cleanup_fd, uint64_t *action_handlep);
int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);

View File

@ -54,6 +54,8 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
int traverse_dataset(struct dsl_dataset *ds,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
uint64_t txg_start, zbookmark_phys_t *resume, int flags,
blkptr_cb_t func, void *arg);

View File

@ -98,6 +98,18 @@ struct dsl_pool;
*/
#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode"
/*
* These fields are set on datasets that are in the middle of a resumable
* receive, and allow the sender to resume the send if it is interrupted.
*/
#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
@ -191,6 +203,14 @@ typedef struct dsl_dataset {
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
/*
* When in the middle of a resumable receive, tracks how much
* progress we have made.
*/
uint64_t ds_resume_object[TXG_SIZE];
uint64_t ds_resume_offset[TXG_SIZE];
uint64_t ds_resume_bytes[TXG_SIZE];
/* Protected by our dsl_dir's dd_lock */
list_t ds_prop_cbs;
@ -242,6 +262,7 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@ -322,6 +343,8 @@ int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
zprop_source_t source, uint64_t value, dmu_tx_t *tx);
void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
void dsl_dataset_deactivate_feature(uint64_t dsobj,

View File

@ -158,6 +158,7 @@ typedef enum {
ZFS_PROP_REDUNDANT_METADATA,
ZFS_PROP_OVERLAY,
ZFS_PROP_PREV_SNAP,
ZFS_PROP_RECEIVE_RESUME_TOKEN,
ZFS_NUM_PROPS
} zfs_prop_t;

View File

@ -90,16 +90,16 @@ typedef enum drr_headertype {
* Feature flags for zfs send streams (flags in drr_versioninfo)
*/
#define DMU_BACKUP_FEATURE_DEDUP (1<<0)
#define DMU_BACKUP_FEATURE_DEDUPPROPS (1<<1)
#define DMU_BACKUP_FEATURE_SA_SPILL (1<<2)
#define DMU_BACKUP_FEATURE_DEDUP (1 << 0)
#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1)
#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
/* flags #3 - #15 are reserved for incompatible closed-source implementations */
#define DMU_BACKUP_FEATURE_EMBED_DATA (1<<16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1<<17)
#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17)
/* flag #18 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1<<19)
/* flag #20 is reserved for resumable streams */
#define DMU_BACKUP_FEATURE_LARGE_DNODE (1<<21)
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 21)
/*
* Mask of all supported backup features
@ -107,11 +107,16 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE)
DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \
DMU_BACKUP_FEATURE_LARGE_DNODE)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
typedef enum dmu_send_resume_token_version {
ZFS_SEND_RESUME_TOKEN_VERSION = 1
} dmu_send_resume_token_version_t;
/*
* The drr_versioninfo field of the dmu_replay_record has the
* following layout:
@ -335,6 +340,12 @@ typedef enum zfs_case {
ZFS_CASE_MIXED
} zfs_case_t;
/*
* Note: this struct must have the same layout in 32-bit and 64-bit, so
* that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
* kernel. Therefore, we add padding to it so that no "hidden" padding
* is automatically added on 64-bit (but not on 32-bit).
*/
typedef struct zfs_cmd {
char zc_name[MAXPATHLEN]; /* name of pool or dataset */
uint64_t zc_nvlist_src; /* really (char *) */
@ -363,14 +374,15 @@ typedef struct zfs_cmd {
uint64_t zc_iflags; /* internal to zfs(7fs) */
zfs_share_t zc_share;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
dmu_replay_record_t zc_begin_record;
zinject_record_t zc_inject_record;
uint32_t zc_defer_destroy;
uint32_t zc_flags;
uint64_t zc_action_handle;
int zc_cleanup_fd;
uint8_t zc_simple;
uint8_t zc_pad[3]; /* alignment */
boolean_t zc_resumable;
uint8_t zc_pad[2]; /* alignment */
uint64_t zc_sendobj;
uint64_t zc_fromobj;
uint64_t zc_createtxg;

View File

@ -32,6 +32,8 @@
#define ZVOL_OBJ 1ULL
#define ZVOL_ZAP_OBJ 2ULL
extern void *zvol_tag;
extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async);
extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async);
extern void zvol_rename_minors(spa_t *spa, const char *oldname,

View File

@ -1865,22 +1865,21 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
return (value);
}
static char *
static const char *
getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
{
nvlist_t *nv;
char *value;
const char *value;
*source = NULL;
if (nvlist_lookup_nvlist(zhp->zfs_props,
zfs_prop_to_name(prop), &nv) == 0) {
verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
value = fnvlist_lookup_string(nv, ZPROP_VALUE);
(void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
} else {
verify(!zhp->zfs_props_table ||
zhp->zfs_props_table[prop] == B_TRUE);
if ((value = (char *)zfs_prop_default_string(prop)) == NULL)
value = "";
value = zfs_prop_default_string(prop);
*source = "";
}
@ -2301,7 +2300,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
{
char *source = NULL;
uint64_t val;
char *str;
const char *str;
const char *strval;
boolean_t received = zfs_is_recvd_props_mode(zhp);
@ -2407,14 +2406,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
break;
case ZFS_PROP_ORIGIN:
(void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
proplen);
/*
* If there is no parent at all, return failure to indicate that
* it doesn't apply to this dataset.
*/
if (propbuf[0] == '\0')
str = getprop_string(zhp, prop, &source);
if (str == NULL)
return (-1);
(void) strlcpy(propbuf, str, proplen);
break;
case ZFS_PROP_CLONES:
@ -2596,8 +2591,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
break;
case PROP_TYPE_STRING:
(void) strlcpy(propbuf,
getprop_string(zhp, prop, &source), proplen);
str = getprop_string(zhp, prop, &source);
if (str == NULL)
return (-1);
(void) strlcpy(propbuf, str, proplen);
break;
case PROP_TYPE_INDEX:

View File

@ -1051,6 +1051,17 @@ mount_cb(zfs_handle_t *zhp, void *data)
return (0);
}
/*
* If this filesystem is inconsistent and has a receive resume
* token, we can not mount it.
*/
if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
zfs_close(zhp);
return (0);
}
libzfs_add_handle(cbp, zhp);
if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
zfs_close(zhp);

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved
@ -56,6 +56,7 @@
#include "zfs_prop.h"
#include "zfs_fletcher.h"
#include "libzfs_impl.h"
#include <zlib.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/socket.h>
@ -66,6 +67,8 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
uint64_t *);
static int guid_to_name(libzfs_handle_t *, const char *,
uint64_t, boolean_t, char *);
static const zio_cksum_t zero_cksum = { { 0 } };
@ -234,7 +237,7 @@ cksummer(void *arg)
{
dedup_arg_t *dda = arg;
char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
dmu_replay_record_t thedrr;
dmu_replay_record_t thedrr = { 0 };
dmu_replay_record_t *drr = &thedrr;
FILE *ofp;
int outfd;
@ -283,8 +286,7 @@ cksummer(void *arg)
DMU_BACKUP_FEATURE_DEDUPPROPS);
DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
if (drr->drr_payloadlen != 0) {
sz = drr->drr_payloadlen;
if (sz > SPA_MAXBLOCKSIZE) {
@ -1013,17 +1015,14 @@ static void *
send_progress_thread(void *arg)
{
progress_arg_t *pa = arg;
zfs_cmd_t zc = {"\0"};
zfs_handle_t *zhp = pa->pa_zhp;
libzfs_handle_t *hdl = zhp->zfs_hdl;
unsigned long long bytes;
char buf[16];
time_t t;
struct tm *tm;
assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
if (!pa->pa_parsable)
@ -1056,6 +1055,51 @@ send_progress_thread(void *arg)
}
}
static void
send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap,
uint64_t size, boolean_t parsable)
{
if (parsable) {
if (fromsnap != NULL) {
(void) fprintf(fout, "incremental\t%s\t%s",
fromsnap, tosnap);
} else {
(void) fprintf(fout, "full\t%s",
tosnap);
}
} else {
if (fromsnap != NULL) {
if (strchr(fromsnap, '@') == NULL &&
strchr(fromsnap, '#') == NULL) {
(void) fprintf(fout, dgettext(TEXT_DOMAIN,
"send from @%s to %s"),
fromsnap, tosnap);
} else {
(void) fprintf(fout, dgettext(TEXT_DOMAIN,
"send from %s to %s"),
fromsnap, tosnap);
}
} else {
(void) fprintf(fout, dgettext(TEXT_DOMAIN,
"full send of %s"),
tosnap);
}
}
if (size != 0) {
if (parsable) {
(void) fprintf(fout, "\t%llu",
(longlong_t)size);
} else {
char buf[16];
zfs_nicenum(size, buf, sizeof (buf));
(void) fprintf(fout, dgettext(TEXT_DOMAIN,
" estimated size is %s"), buf);
}
}
(void) fprintf(fout, "\n");
}
static int
dump_snapshot(zfs_handle_t *zhp, void *arg)
{
@ -1135,37 +1179,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
(sdd->fromorigin || sdd->replicate);
if (sdd->verbose) {
uint64_t size;
err = estimate_ioctl(zhp, sdd->prevsnap_obj,
uint64_t size = 0;
(void) estimate_ioctl(zhp, sdd->prevsnap_obj,
fromorigin, &size);
if (sdd->parsable) {
if (sdd->prevsnap[0] != '\0') {
(void) fprintf(fout, "incremental\t%s\t%s",
sdd->prevsnap, zhp->zfs_name);
} else {
(void) fprintf(fout, "full\t%s",
zhp->zfs_name);
}
} else {
(void) fprintf(fout, dgettext(TEXT_DOMAIN,
"send from @%s to %s"),
sdd->prevsnap, zhp->zfs_name);
}
if (err == 0) {
if (sdd->parsable) {
(void) fprintf(fout, "\t%llu\n",
(longlong_t)size);
} else {
char buf[16];
zfs_nicenum(size, buf, sizeof (buf));
(void) fprintf(fout, dgettext(TEXT_DOMAIN,
" estimated size is %s\n"), buf);
}
sdd->size += size;
} else {
(void) fprintf(fout, "\n");
}
send_print_verbose(fout, zhp->zfs_name,
sdd->prevsnap[0] ? sdd->prevsnap : NULL,
size, sdd->parsable);
sdd->size += size;
}
if (!sdd->dryrun) {
@ -1376,6 +1397,231 @@ again:
return (0);
}
nvlist_t *
zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token)
{
unsigned int version;
int nread, i;
unsigned long long checksum, packed_len;
/*
* Decode token header, which is:
* <token version>-<checksum of payload>-<uncompressed payload length>
* Note that the only supported token version is 1.
*/
nread = sscanf(token, "%u-%llx-%llx-",
&version, &checksum, &packed_len);
if (nread != 3) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt (invalid format)"));
return (NULL);
}
if (version != ZFS_SEND_RESUME_TOKEN_VERSION) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt (invalid version %u)"),
version);
return (NULL);
}
/* convert hexadecimal representation to binary */
token = strrchr(token, '-') + 1;
int len = strlen(token) / 2;
unsigned char *compressed = zfs_alloc(hdl, len);
for (i = 0; i < len; i++) {
nread = sscanf(token + i * 2, "%2hhx", compressed + i);
if (nread != 1) {
free(compressed);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt "
"(payload is not hex-encoded)"));
return (NULL);
}
}
/* verify checksum */
zio_cksum_t cksum;
fletcher_4_native(compressed, len, &cksum);
if (cksum.zc_word[0] != checksum) {
free(compressed);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt (incorrect checksum)"));
return (NULL);
}
/* uncompress */
void *packed = zfs_alloc(hdl, packed_len);
uLongf packed_len_long = packed_len;
if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK ||
packed_len_long != packed_len) {
free(packed);
free(compressed);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt (decompression failed)"));
return (NULL);
}
/* unpack nvlist */
nvlist_t *nv;
int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP);
free(packed);
free(compressed);
if (error != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt (nvlist_unpack failed)"));
return (NULL);
}
return (nv);
}
int
zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
const char *resume_token)
{
char errbuf[1024];
char *toname;
char *fromname = NULL;
uint64_t resumeobj, resumeoff, toguid, fromguid, bytes;
zfs_handle_t *zhp;
int error = 0;
char name[ZFS_MAXNAMELEN];
enum lzc_send_flags lzc_flags = 0;
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot resume send"));
nvlist_t *resume_nvl =
zfs_send_resume_token_to_nvlist(hdl, resume_token);
if (resume_nvl == NULL) {
/*
* zfs_error_aux has already been set by
* zfs_send_resume_token_to_nvlist
*/
return (zfs_error(hdl, EZFS_FAULT, errbuf));
}
if (flags->verbose) {
(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
"resume token contents:\n"));
nvlist_print(stderr, resume_nvl);
}
if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 ||
nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 ||
nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 ||
nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 ||
nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"resume token is corrupt"));
return (zfs_error(hdl, EZFS_FAULT, errbuf));
}
fromguid = 0;
(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is no longer the same snapshot used in "
"the initial send"), toname);
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' used in the initial send no longer exists"),
toname);
}
return (zfs_error(hdl, EZFS_BADPATH, errbuf));
}
zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
if (zhp == NULL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"unable to access '%s'"), name);
return (zfs_error(hdl, EZFS_BADPATH, errbuf));
}
if (fromguid != 0) {
if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"incremental source %#llx no longer exists"),
(longlong_t)fromguid);
return (zfs_error(hdl, EZFS_BADPATH, errbuf));
}
fromname = name;
}
if (flags->verbose) {
uint64_t size = 0;
error = lzc_send_space(zhp->zfs_name, fromname, &size);
if (error == 0)
size = MAX(0, (int64_t)(size - bytes));
send_print_verbose(stderr, zhp->zfs_name, fromname,
size, flags->parsable);
}
if (!flags->dryrun) {
progress_arg_t pa = { 0 };
pthread_t tid;
/*
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress) {
pa.pa_zhp = zhp;
pa.pa_fd = outfd;
pa.pa_parsable = flags->parsable;
error = pthread_create(&tid, NULL,
send_progress_thread, &pa);
if (error != 0) {
zfs_close(zhp);
return (error);
}
}
error = lzc_send_resume(zhp->zfs_name, fromname, outfd,
lzc_flags, resumeobj, resumeoff);
if (flags->progress) {
(void) pthread_cancel(tid);
(void) pthread_join(tid, NULL);
}
char errbuf[1024];
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"warning: cannot send '%s'"), zhp->zfs_name);
zfs_close(zhp);
switch (error) {
case 0:
return (0);
case EXDEV:
case ENOENT:
case EDQUOT:
case EFBIG:
case EIO:
case ENOLINK:
case ENOSPC:
case ENOSTR:
case ENXIO:
case EPIPE:
case ERANGE:
case EFAULT:
case EROFS:
zfs_error_aux(hdl, strerror(errno));
return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
default:
return (zfs_standard_error(hdl, errno, errbuf));
}
}
zfs_close(zhp);
return (error);
}
/*
* Generate a send stream for the dataset identified by the argument zhp.
*
@ -1451,7 +1697,9 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
dmu_replay_record_t drr = { 0 };
char *packbuf = NULL;
size_t buflen = 0;
zio_cksum_t zc = { { 0 } };
zio_cksum_t zc;
ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
if (flags->replicate || flags->props) {
nvlist_t *hdrnv;
@ -1913,6 +2161,7 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
typedef struct guid_to_name_data {
uint64_t guid;
boolean_t bookmark_ok;
char *name;
char *skip;
} guid_to_name_data_t;
@ -1921,20 +2170,25 @@ static int
guid_to_name_cb(zfs_handle_t *zhp, void *arg)
{
guid_to_name_data_t *gtnd = arg;
const char *slash;
int err;
if (gtnd->skip != NULL &&
strcmp(zhp->zfs_name, gtnd->skip) == 0) {
(slash = strrchr(zhp->zfs_name, '/')) != NULL &&
strcmp(slash + 1, gtnd->skip) == 0) {
zfs_close(zhp);
return (0);
}
if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) {
(void) strcpy(gtnd->name, zhp->zfs_name);
zfs_close(zhp);
return (EEXIST);
}
err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
if (err != EEXIST && gtnd->bookmark_ok)
err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd);
zfs_close(zhp);
return (err);
}
@ -1948,45 +2202,48 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg)
*/
static int
guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
char *name)
boolean_t bookmark_ok, char *name)
{
/* exhaustive search all local snapshots */
char pname[ZFS_MAXNAMELEN];
guid_to_name_data_t gtnd;
int err = 0;
zfs_handle_t *zhp;
char *cp;
gtnd.guid = guid;
gtnd.bookmark_ok = bookmark_ok;
gtnd.name = name;
gtnd.skip = NULL;
(void) strlcpy(pname, parent, sizeof (pname));
/*
* Search progressively larger portions of the hierarchy. This will
* Search progressively larger portions of the hierarchy, starting
* with the filesystem specified by 'parent'. This will
* select the "most local" version of the origin snapshot in the case
* that there are multiple matching snapshots in the system.
*/
while ((cp = strrchr(pname, '/')) != NULL) {
(void) strlcpy(pname, parent, sizeof (pname));
char *cp = strrchr(pname, '@');
if (cp == NULL)
cp = strchr(pname, '\0');
for (; cp != NULL; cp = strrchr(pname, '/')) {
/* Chop off the last component and open the parent */
*cp = '\0';
zhp = make_dataset_handle(hdl, pname);
zfs_handle_t *zhp = make_dataset_handle(hdl, pname);
if (zhp == NULL)
continue;
err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd);
if (err != EEXIST)
err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
if (err != EEXIST && bookmark_ok)
err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd);
zfs_close(zhp);
if (err == EEXIST)
return (0);
/*
* Remember the dataset that we already searched, so we
* skip it next time through.
* Remember the last portion of the dataset so we skip it next
* time through (as we've already searched that portion of the
* hierarchy).
*/
gtnd.skip = pname;
gtnd.skip = strrchr(pname, '/') + 1;
}
return (ENOENT);
@ -2586,11 +2843,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
switch (drr->drr_type) {
case DRR_BEGIN:
/* NB: not to be used on v2 stream packages */
if (drr->drr_payloadlen != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid substream header"));
return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
(void) recv_read(hdl, fd, buf,
drr->drr_payloadlen, B_FALSE, NULL);
}
break;
@ -2651,6 +2906,40 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
return (-1);
}
static void
recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap,
boolean_t resumable)
{
char target_fs[ZFS_MAXNAMELEN];
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"checksum mismatch or incomplete stream"));
if (!resumable)
return;
(void) strlcpy(target_fs, target_snap, sizeof (target_fs));
*strchr(target_fs, '@') = '\0';
zfs_handle_t *zhp = zfs_open(hdl, target_fs,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
if (zhp == NULL)
return;
char token_buf[ZFS_MAXPROPLEN];
int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
token_buf, sizeof (token_buf),
NULL, NULL, 0, B_TRUE);
if (error == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"checksum mismatch or incomplete stream.\n"
"Partially received snapshot is saved.\n"
"A resuming stream can be generated on the sending "
"system by running:\n"
" zfs send -t %s"),
token_buf);
}
zfs_close(zhp);
}
/*
* Restores a backup of tosnap from the file descriptor specified by infd.
*/
@ -2799,7 +3088,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
*/
if (drrb->drr_flags & DRR_FLAG_CLONE) {
if (guid_to_name(hdl, zc.zc_value,
drrb->drr_fromguid, zc.zc_string) != 0) {
drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) {
zcmd_free_nvlists(&zc);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"local origin for clone %s does not exist"),
@ -2815,8 +3104,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
zc.zc_string);
}
boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_RESUMING;
stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
(drrb->drr_flags & DRR_FLAG_CLONE) || originsnap);
(drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming;
if (stream_wantsnewfs) {
/*
@ -2835,7 +3126,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
char suffix[ZFS_MAXNAMELEN];
(void) strcpy(suffix, strrchr(zc.zc_value, '/'));
if (guid_to_name(hdl, zc.zc_name, parent_snapguid,
zc.zc_value) == 0) {
B_FALSE, zc.zc_value) == 0) {
*strchr(zc.zc_value, '@') = '\0';
(void) strcat(zc.zc_value, suffix);
}
@ -2862,7 +3153,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
char snap[ZFS_MAXNAMELEN];
(void) strcpy(snap, strchr(zc.zc_value, '@'));
if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid,
zc.zc_value) == 0) {
B_FALSE, zc.zc_value) == 0) {
*strchr(zc.zc_value, '@') = '\0';
(void) strcat(zc.zc_value, snap);
}
@ -2876,11 +3167,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
zfs_handle_t *zhp;
/*
* Destination fs exists. Therefore this should either
* be an incremental, or the stream specifies a new fs
* (full stream or clone) and they want us to blow it
* away (and have therefore specified -F and removed any
* snapshots).
* Destination fs exists. It must be one of these cases:
* - an incremental send stream
* - the stream specifies a new fs (full stream or clone)
* and they want us to blow away the existing fs (and
* have therefore specified -F and removed any snapshots)
* - we are resuming a failed receive.
*/
if (stream_wantsnewfs) {
if (!flags->force) {
@ -2935,6 +3227,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
return (-1);
}
}
/*
* If we are resuming a newfs, set newfs here so that we will
* mount it if the recv succeeds this time. We can tell
* that it was a newfs on the first recv because the fs
* itself will be inconsistent (if the fs existed when we
* did the first recv, we would have received it into
* .../%recv).
*/
if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT))
newfs = B_TRUE;
zfs_close(zhp);
} else {
/*
@ -2967,9 +3271,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
newfs = B_TRUE;
}
zc.zc_begin_record = drr_noswap->drr_u.drr_begin;
zc.zc_begin_record = *drr_noswap;
zc.zc_cookie = infd;
zc.zc_guid = flags->force;
zc.zc_resumable = flags->resumable;
if (flags->verbose) {
(void) printf("%s %s stream of %s into %s\n",
flags->dryrun ? "would receive" : "receiving",
@ -3106,8 +3411,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
case ECKSUM:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid stream (checksum mismatch)"));
recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable);
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
case ENOTSUP:
@ -3309,7 +3613,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
* Restores a backup of tosnap from the file descriptor specified by infd.
* Return 0 on total success, -2 if some things couldn't be
* destroyed/renamed/promoted, -1 if some things couldn't be received.
* (-1 will override -2).
* (-1 will override -2, if -1 and the resumable flag was specified the
* transfer can be resumed if the sending side supports it).
*/
int
zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,

View File

@ -467,6 +467,13 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
int
lzc_send(const char *snapname, const char *from, int fd,
enum lzc_send_flags flags)
{
return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
}
int
lzc_send_resume(const char *snapname, const char *from, int fd,
enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
{
nvlist_t *args;
int err;
@ -479,6 +486,10 @@ lzc_send(const char *snapname, const char *from, int fd,
fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
if (resumeobj != 0 || resumeoff != 0) {
fnvlist_add_uint64(args, "resume_object", resumeobj);
fnvlist_add_uint64(args, "resume_offset", resumeoff);
}
err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
nvlist_free(args);
return (err);
@ -536,22 +547,9 @@ recv_read(int fd, void *buf, int ilen)
return (0);
}
/*
* The simplest receive case: receive from the specified fd, creating the
* specified snapshot. Apply the specified properties a "received" properties
* (which can be overridden by locally-set properties). If the stream is a
* clone, its origin snapshot must be specified by 'origin'. The 'force'
* flag will cause the target filesystem to be rolled back or destroyed if
* necessary to receive.
*
* Return 0 on success or an errno on failure.
*
* Note: this interface does not work on dedup'd streams
* (those with DMU_BACKUP_FEATURE_DEDUP).
*/
int
lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
static int
lzc_receive_impl(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, boolean_t resumable, int fd)
{
/*
* The receive ioctl is still legacy, so we need to construct our own
@ -561,7 +559,6 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
char *atp;
char *packed = NULL;
size_t size;
dmu_replay_record_t drr;
int error;
ASSERT3S(g_refcount, >, 0);
@ -597,10 +594,9 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
(void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
/* zc_begin_record is non-byteswapped BEGIN record */
error = recv_read(fd, &drr, sizeof (drr));
error = recv_read(fd, &zc.zc_begin_record, sizeof (zc.zc_begin_record));
if (error != 0)
goto out;
zc.zc_begin_record = drr.drr_u.drr_begin;
/* zc_cookie is fd to read from */
zc.zc_cookie = fd;
@ -608,6 +604,8 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
/* zc guid is force flag */
zc.zc_guid = force;
zc.zc_resumable = resumable;
/* zc_cleanup_fd is unused */
zc.zc_cleanup_fd = -1;
@ -622,6 +620,39 @@ out:
return (error);
}
/*
* The simplest receive case: receive from the specified fd, creating the
* specified snapshot. Apply the specified properties as "received" properties
* (which can be overridden by locally-set properties). If the stream is a
* clone, its origin snapshot must be specified by 'origin'. The 'force'
* flag will cause the target filesystem to be rolled back or destroyed if
* necessary to receive.
*
* Return 0 on success or an errno on failure.
*
* Note: this interface does not work on dedup'd streams
* (those with DMU_BACKUP_FEATURE_DEDUP).
*/
int
lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
{
return (lzc_receive_impl(snapname, props, origin, force, B_FALSE, fd));
}
/*
* Like lzc_receive, but if the receive fails due to premature stream
* termination, the intermediate state will be preserved on disk. In this
* case, ECKSUM will be returned. The receive may subsequently be resumed
* with a resuming send stream generated by lzc_send_resume().
*/
int
lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
boolean_t force, int fd)
{
return (lzc_receive_impl(snapname, props, origin, force, B_TRUE, fd));
}
/*
* Roll back this filesystem or volume to its most recent snapshot.
* If snapnamebuf is not NULL, it will be filled in with the name

View File

@ -1408,6 +1408,8 @@ spl_fstrans_check(void)
return (0);
}
void *zvol_tag = "zvol_tag";
void
zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
{

View File

@ -180,17 +180,27 @@ zfs \- configures ZFS file systems
.LP
.nf
\fBzfs\fR \fBsend\fR [\fB-eL\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
\fBzfs\fR \fBsend\fR [\fB-Le\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.fi
.LP
.nf
\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
\fBzfs\fR \fBsend\fR [\fB-Penv\fR] \fB-t\fR \fIreceive_resume_token\fR
.fi
.LP
.nf
\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR
\fBzfs\fR \fBreceive\fR [\fB-Fnsuv\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.fi
.LP
.nf
\fBzfs\fR \fBreceive\fR [\fB-Fnsuv\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR
.fi
.LP
.nf
\fBzfs\fR \fBreceive\fR \fB-A\fR \fIfilesystem\fR|\fIvolume\fR
.fi
.LP
@ -532,6 +542,17 @@ For cloned file systems or volumes, the snapshot from which the clone was create
.sp
.ne 2
.na
\fB\fBreceive_resume_token\fR\fR
.ad
.sp .6
.RS 4n
For filesystems or volumes which have saved partially-completed state from \fBzfs receive -s\fR , this opaque token can be provided to \fBzfs send -t\fR to resume and complete the \fBzfs receive\fR.
.RE
.sp
.ne 2
.mk
.na
\fB\fBreferenced\fR\fR
.ad
.sp .6
@ -2799,7 +2820,7 @@ The format of the stream is committed. You will be able to receive your streams
.sp
.ne 2
.na
\fBzfs send\fR [\fB-eL\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
\fBzfs send\fR [\fB-Le\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
.ad
.sp .6
.RS 4n
@ -2809,24 +2830,6 @@ the pool must be read-only, or the filesystem must not be mounted. When the
stream generated from a filesystem or volume is received, the default snapshot
name will be "--head--".
.sp
.ne 2
.na
\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR
.ad
.sp .6
.RS 4n
Generate an incremental send stream. The incremental source must be an earlier
snapshot in the destination's history. It will commonly be an earlier
snapshot in the destination's filesystem, in which case it can be
specified as the last component of the name (the \fB#\fR or \fB@\fR character
and following).
.sp
If the incremental target is a clone, the incremental source can
be the origin snapshot, or an earlier snapshot in the origin's filesystem,
or the origin's origin, etc.
.RE
.sp
.ne 2
.na
@ -2859,15 +2862,39 @@ then the receiving system must have that feature enabled as well. See
\fBembedded_data\fR feature.
.RE
.sp
.ne 2
.na
\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR
.ad
.sp .6
.RS 4n
Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's filesystem, in which case it can be specified as the last component of the name (the \fB#\fR or \fB@\fR character and following).
.sp
If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc.
.RE
.RE
.sp
.ne 2
.na
\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
\fB\fBzfs send\fR [\fB-Penv\fR] \fB-t\fR \fIreceive_resume_token\fR\fR
.ad
.sp .6
.RS 4n
Creates a send stream which resumes an interrupted receive. The \fIreceive_resume_token\fR is the value of this property on the filesystem or volume that was being received into. See the documentation for \fBzfs receive -s\fR for more details.
.RE
.RE
.sp
.ne 2
.na
\fB\fBzfs receive\fR [\fB-Fnsuv\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
.ad
.br
.na
\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR
\fB\fBzfs receive\fR [\fB-Fnsuv\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR
.ad
.sp .6
.RS 4n
@ -2885,21 +2912,36 @@ The \fB-d\fR and \fB-e\fR options cause the file system name of the target snaps
.sp
.ne 2
.na
\fB\fB-d\fR\fR
\fB\fB-F\fR\fR
.ad
.sp .6
.RS 4n
Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above.
Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
.RE
.sp
.ne 2
.mk
.na
\fB\fB-n\fR\fR
.ad
.sp .6
.RS 4n
Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use.
.RE
.sp
.ne 2
.na
\fB\fB-e\fR\fR
\fB\fB-s\fR\fR
.ad
.sp .6
.RS 4n
Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above.
If the receive is interrupted, save the partially received state, rather than deleting it. Interruption may be due to premature termination of the stream (e.g. due to network failure or failure of the remote system if the stream is being read over a network connection), a checksum error in the stream, termination of the \fBzfs receive\fR process, or unclean shutdown of the system.
.sp
The receive can be resumed with a stream generated by \fBzfs send -t\fR token, where the \fItoken\fR is the value of the \fBreceive_resume_token\fR property of the filesystem or volume which is received into.
.sp
To use this flag, the storage pool must have the \fBextensible_dataset\fR feature enabled. See \fBzpool-features\fR(5) for details on ZFS feature flags.
.RE
.sp
@ -2925,11 +2967,21 @@ Print verbose information about the stream and the time required to perform the
.sp
.ne 2
.na
\fB\fB-n\fR\fR
\fB\fB-d\fR\fR
.ad
.sp .6
.RS 4n
Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use.
Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above.
.RE
.sp
.ne 2
.na
\fB\fB-e\fR\fR
.ad
.sp .6
.RS 4n
Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above.
.RE
.sp
@ -2942,15 +2994,16 @@ Do not actually receive the stream. This can be useful in conjunction with the \
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
.RE
.RE
.sp
.ne 2
.na
\fB\fB-F\fR\fR
\fB\fBzfs receive\fR [\fB-A\fR] \fIfilesystem\fR|\fIvolume\fR
.ad
.sp .6
.RS 4n
Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
.RE
Abort an interrupted \fBzfs receive \fB-s\fR\fR, deleting its saved partially received state.
.RE

View File

@ -375,6 +375,10 @@ zfs_prop_init(void)
zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext",
"none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux rootcontext>",
"ROOTCONTEXT");
zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
"receive_resume_token",
NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<string token>", "RESUMETOK");
/* readonly number properties */
zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,

View File

@ -405,6 +405,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
* checksum/compression/copies.
*/
if (ds != NULL) {
boolean_t needlock = B_FALSE;
/*
* Note: it's valid to open the objset if the dataset is
* long-held, in which case the pool_config lock will not
* be held.
*/
if (!dsl_pool_config_held(dmu_objset_pool(os))) {
needlock = B_TRUE;
dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
}
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@ -461,6 +472,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
dnodesize_changed_cb, os);
}
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
&os->os_phys_buf));
@ -520,6 +533,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
{
int err = 0;
/*
* We shouldn't be doing anything with dsl_dataset_t's unless the
* pool_config lock is held, or the dataset is long-held.
*/
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
dsl_dataset_long_held(ds));
mutex_enter(&ds->ds_opening_lock);
if (ds->ds_objset == NULL) {
objset_t *os;

View File

@ -23,6 +23,7 @@
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/
@ -64,12 +65,14 @@ int zfs_send_queue_length = 16 * 1024 * 1024;
int zfs_recv_queue_length = 16 * 1024 * 1024;
static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";
const char *recv_clone_name = "%recv";
#define BP_SPAN(datablkszsec, indblkshift, level) \
(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (indblkshift - SPA_BLKPTRSHIFT)))
static void byteswap_record(dmu_replay_record_t *drr);
struct send_thread_arg {
bqueue_t q;
dsl_dataset_t *ds; /* Dataset to traverse */
@ -77,6 +80,7 @@ struct send_thread_arg {
int flags; /* flags to pass to traverse_dataset */
int error_code;
boolean_t cancel;
zbookmark_phys_t resume;
};
struct send_block_record {
@ -99,7 +103,7 @@ dump_bytes_cb(void *arg)
{
dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
dmu_sendarg_t *dsp = dbi->dbi_dsp;
dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
ssize_t resid; /* have to get resid to get detailed errno */
ASSERT0(dbi->dbi_len % 8);
@ -180,7 +184,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
* that the receiving system doesn't have any dbufs in the range
* being freed. This is always true because there is a one-record
* constraint: we only send one WRITE record for any given
* object+offset. We know that the one-record constraint is
* object,offset. We know that the one-record constraint is
* true because we always send data in increasing order by
* object,offset.
*
@ -428,6 +432,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
if (object < dsp->dsa_resume_object) {
/*
* Note: when resuming, we will visit all the dnodes in
* the block of dnodes that we are resuming from. In
* this case it's unnecessary to send the dnodes prior to
* the one we are resuming from. We should be at most one
* block's worth of dnodes behind the resume point.
*/
ASSERT3U(dsp->dsa_resume_object - object, <,
1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
return (0);
}
if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
return (dump_freeobjects(dsp, object, 1));
@ -509,6 +526,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
uint64_t record_size;
int err = 0;
ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
zb->zb_object >= sta->resume.zb_object);
if (sta->cancel)
return (SET_ERROR(EINTR));
@ -545,8 +565,10 @@ send_traverse_thread(void *arg)
struct send_block_record *data;
if (st_arg->ds != NULL) {
err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
st_arg->flags, send_cb, arg);
err = traverse_dataset_resume(st_arg->ds,
st_arg->fromtxg, &st_arg->resume,
st_arg->flags, send_cb, st_arg);
if (err != EINTR)
st_arg->error_code = err;
}
@ -575,6 +597,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
ASSERT3U(zb->zb_level, >=, 0);
ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
zb->zb_object >= dsa->dsa_resume_object);
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
@ -637,6 +662,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
uint64_t offset;
ASSERT0(zb->zb_level);
ASSERT(zb->zb_object > dsa->dsa_resume_object ||
(zb->zb_object == dsa->dsa_resume_object &&
zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&aflags, zb) != 0) {
@ -697,8 +726,10 @@ get_next_record(bqueue_t *bq, struct send_block_record *data)
*/
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
zfs_bookmark_phys_t *ancestor_zb,
boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
objset_t *os;
dmu_replay_record_t *drr;
@ -707,6 +738,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
uint64_t fromtxg = 0;
uint64_t featureflags = 0;
struct send_thread_arg to_arg;
void *payload = NULL;
size_t payload_len = 0;
struct send_block_record *to_data;
err = dmu_objset_from_ds(to_ds, &os);
@ -721,6 +754,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
DMU_SUBSTREAM);
bzero(&to_arg, sizeof (to_arg));
#ifdef _KERNEL
if (dmu_objset_type(os) == DMU_OST_ZFS) {
uint64_t version;
@ -746,6 +781,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
}
if (resumeobj != 0 || resumeoff != 0) {
featureflags |= DMU_BACKUP_FEATURE_RESUMING;
}
DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
featureflags);
@ -781,6 +820,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj;
dsp->dsa_resume_offset = resumeoff;
mutex_enter(&to_ds->ds_sendstream_lock);
list_insert_head(&to_ds->ds_sendstreams, dsp);
@ -789,7 +830,26 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsl_dataset_long_hold(to_ds, FTAG);
dsl_pool_rele(dp, tag);
if (dump_record(dsp, NULL, 0) != 0) {
if (resumeobj != 0 || resumeoff != 0) {
dmu_object_info_t to_doi;
nvlist_t *nvl;
err = dmu_object_info(os, resumeobj, &to_doi);
if (err != 0)
goto out;
SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
resumeoff / to_doi.doi_data_block_size);
nvl = fnvlist_alloc();
fnvlist_add_uint64(nvl, "resume_object", resumeobj);
fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
payload = fnvlist_pack(nvl, &payload_len);
drr->drr_payloadlen = payload_len;
fnvlist_free(nvl);
}
err = dump_record(dsp, payload, payload_len);
fnvlist_pack_free(payload, payload_len);
if (err != 0) {
err = dsp->dsa_err;
goto out;
}
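The resume payload built above is a two-entry nvlist keyed by "resume_object" and "resume_offset", and the traversal bookmark is derived by dividing the byte offset by the object's data block size. A hedged userland sketch of packing an equivalent payload, assuming libnvpair is available (the numeric values are illustrative only):

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl = fnvlist_alloc();
	size_t len;
	char *buf;

	/* Same keys the BEGIN record payload carries above. */
	fnvlist_add_uint64(nvl, "resume_object", 7);
	fnvlist_add_uint64(nvl, "resume_offset", 1048576);

	buf = fnvlist_pack(nvl, &len);
	(void) printf("packed BEGIN payload is %zu bytes\n", len);

	fnvlist_pack_free(buf, len);
	fnvlist_free(nvl);
	return (0);
}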
@ -899,19 +959,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
is_clone = (fromds->ds_dir != ds->ds_dir);
dsl_dataset_rele(fromds, FTAG);
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
embedok, large_block_ok, outfd, vp, off);
embedok, large_block_ok, outfd, 0, 0, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
embedok, large_block_ok, outfd, vp, off);
embedok, large_block_ok, outfd, 0, 0, vp, off);
}
dsl_dataset_rele(ds, FTAG);
return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap,
boolean_t embedok, boolean_t large_block_ok,
int outfd, vnode_t *vp, offset_t *off)
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
vnode_t *vp, offset_t *off)
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
@ -978,10 +1038,12 @@ dmu_send(const char *tosnap, const char *fromsnap,
return (err);
}
err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
embedok, large_block_ok, outfd, vp, off);
embedok, large_block_ok,
outfd, resumeobj, resumeoff, vp, off);
} else {
err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
embedok, large_block_ok, outfd, vp, off);
embedok, large_block_ok,
outfd, resumeobj, resumeoff, vp, off);
}
if (owned)
dsl_dataset_disown(ds, FTAG);
@ -1221,6 +1283,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM ||
@ -1233,6 +1296,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
spa_version(dp->dp_spa) < SPA_VERSION_SA)
return (SET_ERROR(ENOTSUP));
if (drba->drba_cookie->drc_resumable &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
return (SET_ERROR(ENOTSUP));
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
* record to a plain WRITE record, so the pool must have the
@ -1345,15 +1412,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
dmu_recv_begin_arg_t *drba = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
objset_t *mos = dp->dp_meta_objset;
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
const char *tofs = drba->drba_cookie->drc_tofs;
dsl_dataset_t *ds, *newds;
uint64_t dsobj;
int error;
uint64_t crflags;
uint64_t crflags = 0;
crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
DS_FLAG_CI_DATASET : 0;
if (drrb->drr_flags & DRR_FLAG_CI_DATA)
crflags |= DS_FLAG_CI_DATASET;
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@ -1391,6 +1459,32 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
if (drba->drba_cookie->drc_resumable) {
uint64_t one = 1;
uint64_t zero = 0;
dsl_dataset_zapify(newds, tx);
if (drrb->drr_fromguid != 0) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
8, 1, &drrb->drr_fromguid, tx));
}
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
8, 1, &drrb->drr_toguid, tx));
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
8, 1, &one, tx));
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
8, 1, &zero, tx));
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
8, 1, &zero, tx));
if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_EMBED_DATA) {
VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
8, 1, &one, tx));
}
}
dmu_buf_will_dirty(newds->ds_dbuf, tx);
dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
@ -1408,56 +1502,191 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
spa_history_log_internal_ds(newds, "receive", tx, "");
}
static int
dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
{
dmu_recv_begin_arg_t *drba = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
int error;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
char recvname[ZFS_MAXNAMELEN];
uint64_t val;
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM ||
drrb->drr_type >= DMU_OST_NUMTYPES)
return (SET_ERROR(EINVAL));
/* Verify pool version supports SA if SA_SPILL feature set */
if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
spa_version(dp->dp_spa) < SPA_VERSION_SA)
return (SET_ERROR(ENOTSUP));
/*
* The receiving code doesn't know how to translate a WRITE_EMBEDDED
* record to a plain WRITE record, so the pool must have the
* EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
* records. Same with WRITE_EMBEDDED records that use LZ4 compression.
*/
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
return (SET_ERROR(ENOTSUP));
if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error != 0)
return (error);
}
/* check that ds is marked inconsistent */
if (!DS_IS_INCONSISTENT(ds)) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
/* check that there is resuming data, and that the toguid matches */
if (!dsl_dataset_is_zapified(ds)) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
if (error != 0 || drrb->drr_toguid != val) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
/*
* Check if the receive is still running. If so, it will be owned.
* Note that nothing else can own the dataset (e.g. after the receive
* fails) because it will be marked inconsistent.
*/
if (dsl_dataset_has_owner(ds)) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EBUSY));
}
/* There should not be any snapshots of this fs yet. */
if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
/*
* Note: resume point will be checked when we process the first WRITE
* record.
*/
/* check that the origin matches */
val = 0;
(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
if (drrb->drr_fromguid != val) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
dsl_dataset_rele(ds, FTAG);
return (0);
}
static void
dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
{
dmu_recv_begin_arg_t *drba = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
const char *tofs = drba->drba_cookie->drc_tofs;
dsl_dataset_t *ds;
uint64_t dsobj;
char recvname[ZFS_MAXNAMELEN];
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
drba->drba_cookie->drc_newfs = B_TRUE;
}
/* clear the inconsistent flag so that we can own it */
ASSERT(DS_IS_INCONSISTENT(ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
dsobj = ds->ds_object;
dsl_dataset_rele(ds, FTAG);
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
drba->drba_cookie->drc_ds = ds;
spa_history_log_internal_ds(ds, "resume receive", tx, "");
}
/*
* NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
boolean_t force, char *origin, dmu_recv_cookie_t *drc)
dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
{
dmu_recv_begin_arg_t drba = { 0 };
dmu_replay_record_t *drr;
bzero(drc, sizeof (dmu_recv_cookie_t));
drc->drc_drrb = drrb;
drc->drc_drr_begin = drr_begin;
drc->drc_drrb = &drr_begin->drr_u.drr_begin;
drc->drc_tosnap = tosnap;
drc->drc_tofs = tofs;
drc->drc_force = force;
drc->drc_resumable = resumable;
drc->drc_cred = CRED();
if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
return (SET_ERROR(EINVAL));
drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
drr->drr_type = DRR_BEGIN;
drr->drr_u.drr_begin = *drc->drc_drrb;
if (drc->drc_byteswap) {
fletcher_4_incremental_byteswap(drr,
fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
byteswap_record(drr_begin);
} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
fletcher_4_incremental_native(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
} else {
fletcher_4_incremental_native(drr,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
}
kmem_free(drr, sizeof (dmu_replay_record_t));
if (drc->drc_byteswap) {
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
drrb->drr_type = BSWAP_32(drrb->drr_type);
drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
return (SET_ERROR(EINVAL));
}
drba.drba_origin = origin;
drba.drba_cookie = drc;
drba.drba_cred = CRED();
return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
&drba, 5, ZFS_SPACE_CHECK_NORMAL));
if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
DMU_BACKUP_FEATURE_RESUMING) {
return (dsl_sync_task(tofs,
dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
&drba, 5, ZFS_SPACE_CHECK_NORMAL));
} else {
return (dsl_sync_task(tofs,
dmu_recv_begin_check, dmu_recv_begin_sync,
&drba, 5, ZFS_SPACE_CHECK_NORMAL));
}
}
struct receive_record_arg {
@ -1469,6 +1698,7 @@ struct receive_record_arg {
*/
arc_buf_t *write_buf;
int payload_size;
uint64_t bytes_read; /* bytes read from stream when record created */
boolean_t eos_marker; /* Marks the end of the stream */
bqueue_node_t node;
};
@ -1477,6 +1707,7 @@ struct receive_writer_arg {
objset_t *os;
boolean_t byteswap;
bqueue_t q;
/*
* These three args are used to signal to the main thread that we're
* done.
@ -1484,15 +1715,20 @@ struct receive_writer_arg {
kmutex_t mutex;
kcondvar_t cv;
boolean_t done;
int err;
/* A map from guid to dataset to help handle dedup'd streams. */
avl_tree_t *guid_to_ds_map;
boolean_t resumable;
uint64_t last_object, last_offset;
uint64_t bytes_read; /* bytes read when current record created */
};
struct receive_arg {
objset_t *os;
vnode_t *vp; /* The vnode to read the stream from */
uint64_t voff; /* The current offset in the stream */
uint64_t bytes_read;
/*
* A record that has had its payload read in, but hasn't yet been handed
* off to the worker thread.
@ -1564,14 +1800,21 @@ receive_read(struct receive_arg *ra, int len, void *buf)
ra->voff, UIO_SYSSPACE, FAPPEND,
RLIM64_INFINITY, CRED(), &resid);
if (resid == len - done)
ra->err = SET_ERROR(EINVAL);
if (resid == len - done) {
/*
* Note: ECKSUM indicates that the receive
* was interrupted and can potentially be resumed.
*/
ra->err = SET_ERROR(ECKSUM);
}
ra->voff += len - done - resid;
done = len - resid;
if (ra->err != 0)
return (ra->err);
}
ra->bytes_read += len;
ASSERT3U(done, ==, len);
return (0);
}
@ -1675,6 +1918,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
}
}
static void
save_resume_state(struct receive_writer_arg *rwa,
uint64_t object, uint64_t offset, dmu_tx_t *tx)
{
int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
if (!rwa->resumable)
return;
/*
* We use ds_resume_bytes[] != 0 to indicate that we need to
* update this on disk, so it must not be 0.
*/
ASSERT(rwa->bytes_read != 0);
/*
* We only resume from write records, which have a valid
* (non-meta-dnode) object number.
*/
ASSERT(object != 0);
/*
* For resuming to work correctly, we must receive records in order,
* sorted by object,offset. This is checked by the callers, but
* assert it here for good measure.
*/
ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
ASSERT3U(rwa->bytes_read, >=,
rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
}
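save_resume_state() records its per-txg values through the usual ring of TXG_SIZE slots indexed by txg & TXG_MASK, so the open, quiescing, and syncing txgs never clobber each other. A standalone sketch of that indexing scheme, assuming TXG_SIZE is 4 as elsewhere in ZFS; the struct and function names below are invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define	TXG_SIZE	4			/* assumed; sys/txg.h in ZFS */
#define	TXG_MASK	(TXG_SIZE - 1)

/* Illustrative stand-in for the ds_resume_* arrays on dsl_dataset_t. */
struct resume_state {
	uint64_t object[TXG_SIZE];
	uint64_t offset[TXG_SIZE];
	uint64_t bytes[TXG_SIZE];
};

static void
record_resume(struct resume_state *rs, uint64_t txg,
    uint64_t object, uint64_t offset, uint64_t bytes)
{
	int slot = txg & TXG_MASK;	/* one slot per in-flight txg */

	rs->object[slot] = object;
	rs->offset[slot] = offset;
	rs->bytes[slot] = bytes;
}

int
main(void)
{
	struct resume_state rs = { { 0 } };

	record_resume(&rs, 1234, 7, 1048576, 2097152);
	(void) printf("txg 1234 uses slot %d\n", (int)(1234 & TXG_MASK));
	return (0);
}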
noinline static int
receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
void *data)
@ -1773,6 +2053,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_buf_rele(db, FTAG);
}
dmu_tx_commit(tx);
return (0);
}
@ -1820,6 +2101,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
/*
* For resuming to work, records must be in increasing order
* by (object, offset).
*/
if (drrw->drr_object < rwa->last_object ||
(drrw->drr_object == rwa->last_object &&
drrw->drr_offset < rwa->last_offset)) {
return (SET_ERROR(EINVAL));
}
rwa->last_object = drrw->drr_object;
rwa->last_offset = drrw->drr_offset;
if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
@ -1842,8 +2135,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
return (SET_ERROR(EINVAL));
dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
/*
* Note: If the receive fails, we want the resume stream to start
* with the same record that we last successfully received (as opposed
* to the next record), so that we can verify that we are
* resuming from the correct location.
*/
save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
dmu_tx_commit(tx);
dmu_buf_rele(bonus, FTAG);
return (0);
}
@ -1902,43 +2204,48 @@ receive_write_byref(struct receive_writer_arg *rwa,
dmu_write(rwa->os, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
dmu_buf_rele(dbp, FTAG);
/* See comment in receive_write. */
save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
dmu_tx_commit(tx);
return (0);
}
static int
receive_write_embedded(struct receive_writer_arg *rwa,
struct drr_write_embedded *drrwnp, void *data)
struct drr_write_embedded *drrwe, void *data)
{
dmu_tx_t *tx;
int err;
if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
return (EINVAL);
if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
return (EINVAL);
if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
return (EINVAL);
if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwnp->drr_object,
drrwnp->drr_offset, drrwnp->drr_length);
dmu_tx_hold_write(tx, drrwe->drr_object,
drrwe->drr_offset, drrwe->drr_length);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
dmu_tx_abort(tx);
return (err);
}
dmu_write_embedded(rwa->os, drrwnp->drr_object,
drrwnp->drr_offset, data, drrwnp->drr_etype,
drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
dmu_write_embedded(rwa->os, drrwe->drr_object,
drrwe->drr_offset, data, drrwe->drr_etype,
drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
/* See comment in receive_write. */
save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
dmu_tx_commit(tx);
return (0);
}
@ -2012,10 +2319,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
char name[MAXNAMELEN];
dsl_dataset_name(drc->drc_ds, name);
dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
(void) dsl_destroy_head(name);
if (drc->drc_resumable) {
/* wait for our resume state to be written to disk */
txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
} else {
char name[MAXNAMELEN];
dsl_dataset_name(drc->drc_ds, name);
dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
(void) dsl_destroy_head(name);
}
}
static void
@ -2044,12 +2357,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
if (len != 0) {
ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
ra->rrd->payload = buf;
ra->rrd->payload_size = len;
err = receive_read(ra, len, ra->rrd->payload);
err = receive_read(ra, len, buf);
if (err != 0)
return (err);
receive_cksum(ra, len, ra->rrd->payload);
receive_cksum(ra, len, buf);
/* note: rrd is NULL when reading the begin record's payload */
if (ra->rrd != NULL) {
ra->rrd->payload = buf;
ra->rrd->payload_size = len;
ra->rrd->bytes_read = ra->bytes_read;
}
}
ra->prev_cksum = ra->cksum;
@ -2057,6 +2375,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
err = receive_read(ra, sizeof (ra->next_rrd->header),
&ra->next_rrd->header);
ra->next_rrd->bytes_read = ra->bytes_read;
if (err != 0) {
kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
ra->next_rrd = NULL;
@ -2236,7 +2555,7 @@ receive_read_record(struct receive_arg *ra)
{
struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
return (SET_ERROR(EINVAL));
return (SET_ERROR(ECKSUM));
return (0);
}
case DRR_SPILL:
@ -2263,6 +2582,10 @@ receive_process_record(struct receive_writer_arg *rwa,
{
int err;
/* Processing in order, therefore bytes_read should be increasing. */
ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
rwa->bytes_read = rrd->bytes_read;
switch (rrd->header.drr_type) {
case DRR_OBJECT:
{
@ -2357,6 +2680,33 @@ receive_writer_thread(void *arg)
mutex_exit(&rwa->mutex);
}
static int
resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
{
uint64_t val;
objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
uint64_t dsobj = dmu_objset_id(ra->os);
uint64_t resume_obj, resume_off;
if (nvlist_lookup_uint64(begin_nvl,
"resume_object", &resume_obj) != 0 ||
nvlist_lookup_uint64(begin_nvl,
"resume_offset", &resume_off) != 0) {
return (SET_ERROR(EINVAL));
}
VERIFY0(zap_lookup(mos, dsobj,
DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
if (resume_obj != val)
return (SET_ERROR(EINVAL));
VERIFY0(zap_lookup(mos, dsobj,
DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
if (resume_off != val)
return (SET_ERROR(EINVAL));
return (0);
}
/*
* Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a
@ -2378,6 +2728,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
struct receive_writer_arg *rwa;
int featureflags;
struct receive_ign_obj_node *n;
uint32_t payloadlen;
void *payload;
nvlist_t *begin_nvl = NULL;
ra = kmem_zalloc(sizeof (*ra), KM_SLEEP);
rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
@ -2386,6 +2739,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
ra->cksum = drc->drc_cksum;
ra->vp = vp;
ra->voff = *voffp;
if (dsl_dataset_is_zapified(drc->drc_ds)) {
(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
sizeof (ra->bytes_read), 1, &ra->bytes_read);
}
list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node),
offsetof(struct receive_ign_obj_node, node));
@ -2438,9 +2798,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
drc->drc_guid_to_ds_map = rwa->guid_to_ds_map;
}
err = receive_read_payload_and_next_header(ra, 0, NULL);
if (err)
payloadlen = drc->drc_drr_begin->drr_payloadlen;
payload = NULL;
if (payloadlen != 0)
payload = kmem_alloc(payloadlen, KM_SLEEP);
err = receive_read_payload_and_next_header(ra, payloadlen, payload);
if (err != 0) {
if (payloadlen != 0)
kmem_free(payload, payloadlen);
goto out;
}
if (payloadlen != 0) {
err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
kmem_free(payload, payloadlen);
if (err != 0)
goto out;
}
if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
err = resume_check(ra, begin_nvl);
if (err != 0)
goto out;
}
(void) bqueue_init(&rwa->q, zfs_recv_queue_length,
offsetof(struct receive_record_arg, node));
@ -2448,6 +2828,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
rwa->os = ra->os;
rwa->byteswap = drc->drc_byteswap;
rwa->resumable = drc->drc_resumable;
(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
TS_RUN, minclsyspri);
@ -2461,7 +2842,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
* We can leave this loop in 3 ways: First, if rwa->err is
* non-zero. In that case, the writer thread will free the rrd we just
* pushed. Second, if we're interrupted; in that case, either it's the
* first loop and ra->rrd was never allocated, or it's later, and ra.rrd
* first loop and ra->rrd was never allocated, or it's later and ra->rrd
* has been handed off to the writer thread who will free it. Finally,
* if receive_read_record fails or we're at the end of the stream, then
* we free ra->rrd and exit.
@ -2506,13 +2887,15 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
err = rwa->err;
out:
nvlist_free(begin_nvl);
if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
zfs_onexit_fd_rele(cleanup_fd);
if (err != 0) {
/*
* destroy what we created, so we don't leave it in the
* inconsistent restoring state.
* Clean up references. If the receive is not resumable,
* destroy what we created, so we don't leave it in
* the inconsistent state.
*/
dmu_recv_cleanup_ds(drc);
}
@ -2674,6 +3057,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
if (dsl_dataset_has_resume_receive_state(ds)) {
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_FROMGUID, tx);
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_OBJECT, tx);
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_OFFSET, tx);
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_BYTES, tx);
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TOGUID, tx);
(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TONAME, tx);
}
}
drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);

View File

@ -47,6 +47,7 @@ typedef struct prefetch_data {
int pd_flags;
boolean_t pd_cancel;
boolean_t pd_exited;
zbookmark_phys_t pd_resume;
} prefetch_data_t;
typedef struct traverse_data {
@ -323,30 +324,29 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
uint32_t flags = ARC_FLAG_WAIT;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
dnode_phys_t *cdnp;
dnode_phys_t *child_dnp;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
goto post;
cdnp = buf->b_data;
child_dnp = buf->b_data;
for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
prefetch_dnode_metadata(td, &child_dnp[i],
zb->zb_objset, zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
err = traverse_dnode(td, &child_dnp[i],
zb->zb_objset, zb->zb_blkid * epb + i);
if (err != 0)
break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
dnode_phys_t *mdnp, *gdnp, *udnp;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
@ -354,11 +354,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
goto post;
osp = buf->b_data;
mdnp = &osp->os_meta_dnode;
gdnp = &osp->os_groupused_dnode;
udnp = &osp->os_userused_dnode;
prefetch_dnode_metadata(td, mdnp, zb->zb_objset,
prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
DMU_META_DNODE_OBJECT);
/*
* See the block comment above for the goal of this variable.
@ -370,21 +366,21 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
td->td_realloc_possible = B_FALSE;
if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
prefetch_dnode_metadata(td, gdnp, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
prefetch_dnode_metadata(td, udnp, zb->zb_objset,
DMU_USERUSED_OBJECT);
prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
zb->zb_objset, DMU_GROUPUSED_OBJECT);
prefetch_dnode_metadata(td, &osp->os_userused_dnode,
zb->zb_objset, DMU_USERUSED_OBJECT);
}
err = traverse_dnode(td, mdnp, zb->zb_objset,
err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
err = traverse_dnode(td, gdnp, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
err = traverse_dnode(td, &osp->os_groupused_dnode,
zb->zb_objset, DMU_GROUPUSED_OBJECT);
}
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
err = traverse_dnode(td, udnp, zb->zb_objset,
DMU_USERUSED_OBJECT);
err = traverse_dnode(td, &osp->os_userused_dnode,
zb->zb_objset, DMU_USERUSED_OBJECT);
}
}
@ -416,9 +412,15 @@ post:
* Set the bookmark to the first level-0 block that we need
* to visit. This way, the resuming code does not need to
* deal with resuming from indirect blocks.
*
* Note, if zb_level <= 0, dnp may be NULL, so we don't want
* to dereference it.
*/
td->td_resume->zb_blkid = zb->zb_blkid <<
(zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
td->td_resume->zb_blkid = zb->zb_blkid;
if (zb->zb_level > 0) {
td->td_resume->zb_blkid <<= zb->zb_level *
(dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
}
td->td_paused = B_TRUE;
}
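The shift above converts an indirect-block bookmark into the first level-0 block it covers; each level contributes a factor of 1 << (dn_indblkshift - SPA_BLKPTRSHIFT) block pointers. A worked example under assumed constants (128 KiB indirect blocks and 128-byte block pointers, hence 1024 pointers per indirect block; not taken from this diff):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed example values: dn_indblkshift = 17, SPA_BLKPTRSHIFT = 7. */
	int indblkshift = 17, blkptrshift = 7;
	int epbs = indblkshift - blkptrshift;	/* log2(blkptrs per indirect) = 10 */

	uint64_t level = 2, blkid = 3;
	uint64_t level0_blkid = blkid << (level * epbs);

	/* Level-2 block 3 covers level-0 blocks starting at 3 * 1024 * 1024. */
	(void) printf("first level-0 blkid = %llu\n",
	    (unsigned long long)level0_blkid);
	return (0);
}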
@ -450,6 +452,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
int j, err = 0;
zbookmark_phys_t czb;
if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
object < td->td_resume->zb_object)
return (0);
if (td->td_flags & TRAVERSE_PRE) {
SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
ZB_DNODE_BLKID);
@ -527,6 +533,7 @@ traverse_prefetch_thread(void *arg)
td.td_func = traverse_prefetcher;
td.td_arg = td_main->td_pfd;
td.td_pfd = NULL;
td.td_resume = &td_main->td_pfd->pd_resume;
SET_BOOKMARK(&czb, td.td_objset,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
@ -556,12 +563,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
ASSERT(ds == NULL || objset == ds->ds_object);
ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
/*
* The data prefetching mechanism (the prefetch thread) is incompatible
* with resuming from a bookmark.
*/
ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
@ -586,6 +587,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
}
pd->pd_flags = flags;
if (resume != NULL)
pd->pd_resume = *resume;
mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
@ -638,11 +641,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
* in syncing context).
*/
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
zbookmark_phys_t *resume,
int flags, blkptr_cb_t func, void *arg)
{
return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
&dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
&dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
int flags, blkptr_cb_t func, void *arg)
{
return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}
int
@ -675,7 +686,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
/* visit each dataset */
for (obj = 1; err == 0;
err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);

View File

@ -52,6 +52,9 @@
#include <sys/dsl_userhold.h>
#include <sys/dsl_bookmark.h>
#include <sys/policy.h>
#include <sys/dmu_send.h>
#include <sys/zio_compress.h>
#include <zfs_fletcher.h>
/*
* The SPA supports block sizes up to 16MB. However, very large blocks
@ -704,6 +707,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
{
boolean_t gotit = FALSE;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
mutex_enter(&ds->ds_lock);
if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
ds->ds_owner = tag;
@ -714,6 +718,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
return (gotit);
}
boolean_t
dsl_dataset_has_owner(dsl_dataset_t *ds)
{
boolean_t rv;
mutex_enter(&ds->ds_lock);
rv = (ds->ds_owner != NULL);
mutex_exit(&ds->ds_lock);
return (rv);
}
static void
dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
{
@ -1615,6 +1629,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
dmu_buf_will_dirty(ds->ds_dbuf, tx);
dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
&ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
&ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
&ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
}
dmu_objset_sync(ds->ds_objset, zio, tx);
for (f = 0; f < SPA_FEATURES; f++) {
@ -1670,6 +1699,78 @@ fail:
nvlist_free(propval);
}
static void
get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
if (dsl_dataset_has_resume_receive_state(ds)) {
char *str;
void *packed;
uint8_t *compressed;
uint64_t val;
nvlist_t *token_nv = fnvlist_alloc();
size_t packed_size, compressed_size;
zio_cksum_t cksum;
char *propval;
char buf[MAXNAMELEN];
int i;
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "fromguid", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "object", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "offset", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "bytes", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
fnvlist_add_uint64(token_nv, "toguid", val);
}
if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
fnvlist_add_string(token_nv, "toname", buf);
}
if (zap_contains(dp->dp_meta_objset, ds->ds_object,
DS_FIELD_RESUME_EMBEDOK) == 0) {
fnvlist_add_boolean(token_nv, "embedok");
}
packed = fnvlist_pack(token_nv, &packed_size);
fnvlist_free(token_nv);
compressed = kmem_alloc(packed_size, KM_SLEEP);
compressed_size = gzip_compress(packed, compressed,
packed_size, packed_size, 6);
fletcher_4_native(compressed, compressed_size, &cksum);
str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
for (i = 0; i < compressed_size; i++) {
(void) sprintf(str + i * 2, "%02x", compressed[i]);
}
str[compressed_size * 2] = '\0';
propval = kmem_asprintf("%u-%llx-%llx-%s",
ZFS_SEND_RESUME_TOKEN_VERSION,
(longlong_t)cksum.zc_word[0],
(longlong_t)packed_size, str);
dsl_prop_nvlist_add_string(nv,
ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
kmem_free(packed, packed_size);
kmem_free(str, compressed_size * 2 + 1);
kmem_free(compressed, packed_size);
strfree(propval);
}
}
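The receive_resume_token value assembled above has the form "<version>-<checksum word>-<packed size>-<hex of compressed packed nvlist>". A hedged userland sketch of decoding such a token back into its nvlist, assuming libnvpair and zlib are available; this is only an illustration, not the libzfs implementation, and error handling is trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libnvpair.h>
#include <zlib.h>

/* Decode "version-cksum-size-hexdata" into an nvlist (checksum ignored here). */
static nvlist_t *
decode_resume_token(const char *token)
{
	unsigned int version;
	unsigned long long cksum_word, packed_size;
	int consumed = 0;
	nvlist_t *nvl = NULL;

	if (sscanf(token, "%u-%llx-%llx-%n",
	    &version, &cksum_word, &packed_size, &consumed) < 3)
		return (NULL);

	const char *hex = token + consumed;
	size_t clen = strlen(hex) / 2;
	unsigned char *compressed = malloc(clen);
	unsigned char *packed = malloc(packed_size);
	if (compressed == NULL || packed == NULL)
		goto out;

	for (size_t i = 0; i < clen; i++) {
		unsigned int byte;
		(void) sscanf(hex + 2 * i, "%2x", &byte);
		compressed[i] = (unsigned char)byte;
	}

	/* The payload was compressed with zlib (gzip_compress above). */
	uLongf dlen = (uLongf)packed_size;
	if (uncompress(packed, &dlen, compressed, clen) == Z_OK)
		(void) nvlist_unpack((char *)packed, dlen, &nvl, 0);
out:
	free(compressed);
	free(packed);
	return (nvl);	/* caller frees with nvlist_free() */
}

int
main(int argc, char **argv)
{
	nvlist_t *nvl;

	if (argc != 2)
		return (1);
	if ((nvl = decode_resume_token(argv[1])) != NULL) {
		dump_nvlist(nvl, 4);	/* libnvpair debug printer */
		nvlist_free(nvl);
	}
	return (0);
}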
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
@ -1743,6 +1844,29 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
}
}
if (!dsl_dataset_is_snapshot(ds)) {
dsl_dataset_t *recv_ds;
char recvname[ZFS_MAXNAMELEN];
/*
* A failed "newfs" (e.g. full) resumable receive leaves
* the stats set on this dataset. Check here for the prop.
*/
get_receive_resume_stats(ds, nv);
/*
* A failed incremental resumable receive leaves the
* stats set on our child named "%recv". Check the child
* for the prop.
*/
dsl_dataset_name(ds, recvname);
(void) strcat(recvname, "/");
(void) strcat(recvname, recv_clone_name);
if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
get_receive_resume_stats(recv_ds, nv);
dsl_dataset_rele(recv_ds, FTAG);
}
}
}
void
@ -1970,7 +2094,8 @@ dsl_dataset_rename_snapshot(const char *fsname,
* only one long hold on the dataset. We're not allowed to change anything here
* so we don't permanently release the long hold or regular hold here. We want
* to do this only when syncing to avoid the dataset unexpectedly going away
* when we release the long hold.
* when we release the long hold. Allow a long hold to exist for volumes; this
* may occur while the minor is being asynchronously registered with the kernel.
*/
static int
dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
@ -1985,7 +2110,7 @@ dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
dsl_dataset_long_rele(ds, owner);
}
held = dsl_dataset_long_held(ds);
held = (dsl_dataset_long_held(ds) && (ds->ds_owner != zvol_tag));
if (owner != NULL)
dsl_dataset_long_hold(ds, owner);
@ -3391,6 +3516,23 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
}
boolean_t
dsl_dataset_is_zapified(dsl_dataset_t *ds)
{
dmu_object_info_t doi;
dmu_object_info_from_db(ds->ds_dbuf, &doi);
return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}
boolean_t
dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
{
return (dsl_dataset_is_zapified(ds) &&
zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
}
#if defined(_KERNEL) && defined(HAVE_SPL)
#if defined(_LP64)
module_param(zfs_max_recordsize, int, 0644);

View File

@ -978,9 +978,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg)
objset_t *os;
if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
/*
* If the dataset is inconsistent because a resumable receive
* has failed, then do not destroy it.
*/
if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
need_destroy = B_FALSE;
dmu_objset_rele(os, FTAG);
if (inconsistent)
if (need_destroy)
(void) dsl_destroy_head(dsname);
}
return (0);

View File

@ -4005,6 +4005,7 @@ static boolean_t zfs_ioc_recv_inject_err;
* zc_guid force flag
* zc_cleanup_fd cleanup-on-exit file descriptor
* zc_action_handle handle for this guid/ds mapping (or zero on first call)
* zc_resumable if data is incomplete, assume the sender will resume
*
* outputs:
* zc_cookie number of bytes read
@ -4051,13 +4052,13 @@ zfs_ioc_recv(zfs_cmd_t *zc)
return (SET_ERROR(EBADF));
}
VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
errors = fnvlist_alloc();
if (zc->zc_string[0])
origin = zc->zc_string;
error = dmu_recv_begin(tofs, tosnap,
&zc->zc_begin_record, force, origin, &drc);
&zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
if (error != 0)
goto out;
@ -5182,6 +5183,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
* indicates that blocks > 128KB are permitted
* (optional) "embedok" -> (value ignored)
* presence indicates DRR_WRITE_EMBEDDED records are permitted
* (optional) "resume_object" and "resume_offset" -> (uint64)
* if present, resume the send stream from the specified object and offset.
* }
*
* outnvl is unused
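For reference, the innvl described above is an ordinary packed nvlist. A hedged userland sketch of assembling one with the optional resume fields, assuming libnvpair; it only builds and frees the list and does not issue the ioctl:

#include <stdint.h>
#include <libnvpair.h>

/* Build an innvl shaped like the one zfs_ioc_send_new() documents above. */
static nvlist_t *
build_send_args(int fd, uint64_t resumeobj, uint64_t resumeoff)
{
	nvlist_t *args = fnvlist_alloc();

	fnvlist_add_int32(args, "fd", fd);
	fnvlist_add_boolean(args, "largeblockok");	/* presence == permitted */
	fnvlist_add_boolean(args, "embedok");
	if (resumeobj != 0 || resumeoff != 0) {
		fnvlist_add_uint64(args, "resume_object", resumeobj);
		fnvlist_add_uint64(args, "resume_offset", resumeoff);
	}
	return (args);	/* caller frees with fnvlist_free() */
}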
@ -5197,6 +5200,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
file_t *fp;
boolean_t largeblockok;
boolean_t embedok;
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
@ -5207,12 +5212,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
largeblockok = nvlist_exists(innvl, "largeblockok");
embedok = nvlist_exists(innvl, "embedok");
(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
if ((fp = getf(fd)) == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
error = dmu_send(snapname, fromname, embedok, largeblockok,
fd, fp->f_vnode, &off);
error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
resumeobj, resumeoff, fp->f_vnode, &off);
if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
fp->f_offset = off;

View File

@ -61,7 +61,7 @@ unsigned long zvol_max_discard_blocks = 16384;
static kmutex_t zvol_state_lock;
static list_t zvol_state_list;
static char *zvol_tag = "zvol_tag";
void *zvol_tag = "zvol_tag";
/*
* The in-core state of each volume.

View File

@ -146,14 +146,13 @@ tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos',
tests = []
# DISABLED:
# zfs_receive_003_pos - needs investigation
# zfs_receive_010_pos - needs investigation
# zfs_receive_011_pos - needs investigation
# zfs_receive_012_pos - needs investigation
# zfs_receive_004_neg - Fails for OpenZFS on illumos
# zfs_receive_011_pos - Requires port of OpenZFS 6562
# zfs_receive_012_pos - Requires port of OpenZFS 6562
[tests/functional/cli_root/zfs_receive]
tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_005_neg',
'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos',
'zfs_receive_009_neg']
tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
'zfs_receive_005_neg', 'zfs_receive_006_pos',
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg']
# DISABLED:
# zfs_rename_002_pos - needs investigation
@ -175,11 +174,10 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
[tests/functional/cli_root/zfs_rollback]
tests = ['zfs_rollback_003_neg', 'zfs_rollback_004_neg']
# DISABLED:
# zfs_send_007_pos - needs investigation
[tests/functional/cli_root/zfs_send]
tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos']
'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos',
'zfs_send_007_pos']
# DISABLED:
# mountpoint_003_pos - needs investigation
@ -207,10 +205,11 @@ tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
# DISABLED:
# zfs_snapshot_008_neg - nested pools
# zfs_snapshot_009_pos - Fails for OpenZFS on illumos
[tests/functional/cli_root/zfs_snapshot]
tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg',
'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg',
'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_009_pos']
'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg']
# DISABLED:
# zfs_unmount_005_pos - needs investigation
@ -565,12 +564,17 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
#[tests/functional/rootpool]
#tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_neg']
# DISABLED: Hangs on I/O for unclear reason.
#[tests/functional/rsend]
#tests = ['rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
# 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos',
# 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
# 'rsend_013_pos']
# DISABLED:
# rsend_008_pos - Fails for OpenZFS on illumos
# rsend_009_pos - Fails for OpenZFS on illumos
# rsend_020_pos - ASSERTs in dump_record()
[tests/functional/rsend]
tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos',
'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
'rsend_013_pos', 'rsend_014_pos',
'rsend_019_pos',
'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
[tests/functional/scrub_mirror]
tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
@ -586,17 +590,17 @@ tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
'slog_009_neg', 'slog_010_neg', 'slog_011_neg']
# DISABLED:
# clone_001_pos - nested pools
# rollback_003_pos - Hangs in unmount and spins.
# snapshot_013_pos - Hangs on I/O for unclear reason.
# snapshot_016_pos - .zfs mv/rmdir/mkdir disabled by default.
#[tests/functional/snapshot]
#tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
# 'snapshot_001_pos', 'snapshot_002_pos',
# 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
# 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
# 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
# 'snapshot_012_pos', 'snapshot_014_pos',
# 'snapshot_015_pos', 'snapshot_017_pos']
# snapshot_016_pos - Problem with automount
[tests/functional/snapshot]
tests = ['rollback_001_pos', 'rollback_002_pos',
'snapshot_001_pos', 'snapshot_002_pos',
'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos',
'snapshot_015_pos', 'snapshot_017_pos']
[tests/functional/snapused]
tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos',
'snapused_004_pos', 'snapused_005_pos']

View File

@ -172,7 +172,7 @@ crtfile(char *pname)
exit(errno);
}
if (fsetxattr(fd, "xattr", pbuf, 1024, 0) < 0) {
if (fsetxattr(fd, "user.xattr", pbuf, 1024, 0) < 0) {
(void) fprintf(stderr, "fsetxattr(fd, \"xattr\", pbuf, "
"1024, 0) failed.\n[%d]: %s.\n", errno, strerror(errno));
exit(errno);

View File

@ -85,6 +85,7 @@ export SHARE="@SHARE@"
export SHUF="@SHUF@"
export SLEEP="@SLEEP@"
export SORT="@SORT@"
export STAT="@STAT@"
export STRINGS="@STRINGS@"
export SU="@SU@"
export SUM="@SUM@"
@ -97,8 +98,8 @@ export TAIL="@TAIL@"
export TAR="@TAR@"
export TOUCH="@TOUCH@"
export TR="@TR@"
export TRUE="@TRUE@"
export TRUNCATE="@TRUNCATE@"
export TRUE="@TRUE@"
export UDEVADM="@UDEVADM@"
export UFSDUMP="@UFSDUMP@"
export UFSRESTORE="@UFSRESTORE@"

View File

@ -10,4 +10,7 @@ dist_pkgdata_SCRIPTS = \
zfs_receive_006_pos.ksh \
zfs_receive_007_neg.ksh \
zfs_receive_008_pos.ksh \
zfs_receive_009_neg.ksh
zfs_receive_009_neg.ksh \
zfs_receive_010_pos.ksh \
zfs_receive_011_pos.ksh \
zfs_receive_012_pos.ksh

View File

@ -61,7 +61,7 @@ test_pool ()
first_object=$(ls -i $mntpnt | awk '{print $1}')
log_must $ZFS snapshot $POOL/fs@a
while true; do
log_must $FIND $mntpnt -delete
log_must $FIND $mntpnt/* -delete
sync
log_must $MKFILES "$mntpnt/" 4000
FILE=$(ls -i $mntpnt | awk \

View File

@ -74,9 +74,11 @@ function setup_test_model
if is_global_zone ; then
log_must $ZFS create -V 16M $pool/vol
log_must $ZFS create -V 16M $pool/$FS/vol
block_device_wait
log_must $ZFS snapshot $pool/$FS/vol@vsnap
log_must $ZFS clone $pool/$FS/vol@vsnap $pool/$FS/vclone
block_device_wait
fi
log_must snapshot_tree $pool/$FS/fs1/fs2@fsnap
@ -199,10 +201,10 @@ function cmp_ds_prop
typeset dtst1=$1
typeset dtst2=$2
for item in "type" "origin" "volblocksize" "aclinherit" "aclmode" \
for item in "type" "origin" "volblocksize" "aclinherit" "acltype" \
"atime" "canmount" "checksum" "compression" "copies" "devices" \
"dnodesize" "exec" "quota" "readonly" "recordsize" "reservation" \
"setuid" "sharenfs" "snapdir" "version" "volsize" "xattr" "zoned" \
"setuid" "snapdir" "version" "volsize" "xattr" "zoned" \
"mountpoint";
do
$ZFS get -H -o property,value,source $item $dtst1 >> \
@ -393,7 +395,7 @@ function mk_files
for ((i=0; i<$nfiles; i=i+1)); do
$DD if=/dev/urandom \
of=/$fs/file-$maxsize-$((i+$file_id_offset)) \
bs=$(($RANDOM * $RANDOM % $maxsize)) \
bs=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) \
count=1 >/dev/null 2>&1 || log_fail \
"Failed to create /$fs/file-$maxsize-$((i+$file_id_offset))"
done
@ -438,7 +440,7 @@ function mess_file
# write the same value that's already there.
#
log_must eval "$DD if=/dev/urandom of=$file conv=notrunc " \
"bs=1 count=2 oseek=$offset >/dev/null 2>&1"
"bs=1 count=2 seek=$offset >/dev/null 2>&1"
else
log_must $TRUNCATE -s $offset $file
fi
@ -523,20 +525,20 @@ function test_fs_setup
mk_files 100 1048576 0 $sendfs &
mk_files 10 10485760 0 $sendfs &
mk_files 1 104857600 0 $sendfs &
log_must $WAIT
wait
log_must $ZFS snapshot $sendfs@a
rm_files 200 256 0 $sendfs &
rm_files 200 131072 0 $sendfs &
rm_files 20 1048576 0 $sendfs &
rm_files 2 10485760 0 $sendfs &
log_must $WAIT
wait
mk_files 400 256 0 $sendfs &
mk_files 400 131072 0 $sendfs &
mk_files 40 1048576 0 $sendfs &
mk_files 4 10485760 0 $sendfs &
log_must $WAIT
wait
log_must $ZFS snapshot $sendfs@b
log_must eval "$ZFS send -v $sendfs@a >/$sendpool/initial.zsend"

View File

@ -110,9 +110,6 @@ function cleanup
log_must $ZFS inherit $prop $POOL2
done
#if is_shared $POOL; then
# log_must $ZFS set sharenfs=off $POOL
#fi
log_must setup_test_model $POOL
if [[ -d $TESTDIR ]]; then
@ -131,7 +128,7 @@ for fs in "$POOL" "$POOL/pclone" "$POOL/$FS" "$POOL/$FS/fs1" \
"$POOL/$FS/fs1/fs2" "$POOL/$FS/fs1/fclone" ; do
rand_set_prop $fs aclinherit "discard" "noallow" "secure" "passthrough"
rand_set_prop $fs checksum "on" "off" "fletcher2" "fletcher4" "sha256"
rand_set_prop $fs aclmode "discard" "groupmask" "passthrough"
rand_set_prop $fs acltype "off" "noacl" "posixacl"
rand_set_prop $fs atime "on" "off"
rand_set_prop $fs checksum "on" "off" "fletcher2" "fletcher4" "sha256"
rand_set_prop $fs compression "on" "off" "lzjb" "gzip" \
@ -161,7 +158,8 @@ done
# Verify inherited property can be received
rand_set_prop $POOL sharenfs "on" "off" "rw"
rand_set_prop $POOL redundant_metadata "all" "most"
rand_set_prop $POOL sync "standard" "always" "disabled"
#
# Duplicate POOL2 for testing

View File

@ -46,6 +46,7 @@ log_must cleanup_pool $POOL2
log_must eval "$ZFS send -R $POOL/$FS@final > $BACKDIR/fs-final-R"
log_must eval "$ZFS receive -d $POOL2 < $BACKDIR/fs-final-R"
block_device_wait
log_must eval "$ZPOOL export $POOL"
log_must eval "$ZPOOL import $POOL"

View File

View File

View File

View File

View File

View File

@ -50,7 +50,7 @@ export FILE_WRITE=${TESTSDIR}/zfs-tests/cmd/file_write/file_write
export LARGEST_FILE=${TESTSDIR}/zfs-tests/cmd/largest_file/largest_file
export MKBUSY=${TESTSDIR}/zfs-tests/cmd/mkbusy/mkbusy
export MKFILE=${TESTSDIR}/zfs-tests/cmd/mkfile/mkfile
export MKFILES=${TESTSDIR}/zfs-tests/cmd/mkfile/mkfiles
export MKFILES=${TESTSDIR}/zfs-tests/cmd/mkfiles/mkfiles
export MKTREE=${TESTSDIR}/zfs-tests/cmd/mktree/mktree
export MMAP_EXEC=${TESTSDIR}/zfs-tests/cmd/mmap_exec/mmap_exec
export MMAPWRITE=${TESTSDIR}/zfs-tests/cmd/mmapwrite/mmapwrite