OpenZFS 8115 - parallel zfs mount

Porting Notes: * Use thread pools (tpool) API instead of introducing taskq interfaces to libzfs. * Use pthread_mutex for locks as mutex_t isn't available. * Ignore alternative libshare initialization since OpenZFS-7955 is not present on zfsonlinux. Authored by: Sebastien Roy <seb@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com> Reviewed by: Brad Lewis <brad.lewis@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Paul Dagnelie <pcd@delphix.com> Reviewed by: Prashanth Sreenivasa <pks@delphix.com> Authored by: Brian Behlendorf <behlendorf1@llnl.gov> Approved by: Matt Ahrens <mahrens@delphix.com> Ported-by: Don Brady <don.brady@delphix.com> OpenZFS-issue: https://www.illumos.org/issues/8115 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/a3f0e2b569 Closes #8092
2018-11-05 08:40:05 -07:00 · 2018-11-05 08:40:05 -07:00 · a10d50f999
parent af2e8411da
commit a10d50f999
10 changed files with 718 additions and 141 deletions
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@ -6059,7 +6059,12 @@ zfs_do_holds(int argc, char **argv)
 #define	CHECK_SPINNER 30
 #define	SPINNER_TIME 3		/* seconds */
-#define	MOUNT_TIME 5		/* seconds */
+#define	MOUNT_TIME 1		/* seconds */
 /*
 * State handed to get_one_dataset() while walking the datasets: the
 * verbosity flag that gates the progress spinner, and the handle array
 * that each accepted dataset is appended to.
 */
 typedef struct get_all_state {
 	boolean_t	ga_verbose;	/* update the progress spinner */
 	get_all_cb_t	*ga_cbp;	/* target of libzfs_add_handle() */
 } get_all_state_t;
 static int
 get_one_dataset(zfs_handle_t *zhp, void *data)
@ -6068,10 +6073,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
 	static int spinval = 0;
 	static int spincheck = 0;
 	static time_t last_spin_time = (time_t)0;
-	get_all_cb_t *cbp = data;
+	get_all_state_t *state = data;
 	zfs_type_t type = zfs_get_type(zhp);
-	if (cbp->cb_verbose) {
+	if (state->ga_verbose) {
 		if (--spincheck < 0) {
 			time_t now = time(NULL);
 			if (last_spin_time + SPINNER_TIME < now) {
@ -6097,25 +6102,23 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
 		zfs_close(zhp);
 		return (0);
 	}
-	libzfs_add_handle(cbp, zhp);
+	libzfs_add_handle(state->ga_cbp, zhp);
-	assert(cbp->cb_used <= cbp->cb_alloc);
+	assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
 	return (0);
 }
 static void
-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
 {
-	get_all_cb_t cb = { 0 };
+	get_all_state_t state = {
-	cb.cb_verbose = verbose;
+	    .ga_verbose = verbose,
-	cb.cb_getone = get_one_dataset;
+	    .ga_cbp = cbp
 	};
 	if (verbose)
 		set_progress_header(gettext("Reading ZFS config"));
-	(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
+	(void) zfs_iter_root(g_zfs, get_one_dataset, &state);
 	*dslist = cb.cb_handles;
 	*count = cb.cb_used;
 	if (verbose)
 		finish_progress(gettext("done."));
@ -6126,8 +6129,19 @@ get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
 * similar, we have a common function with an extra parameter to determine which
 * mode we are using.
 */
-#define	OP_SHARE	0x1
+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
-#define	OP_MOUNT	0x2
+
 /*
 * State shared by every share_mount_one_cb() invocation of a single
 * "share/mount all" run. The first group of fields is only read by the
 * callbacks; the fields after sm_lock are updated by them under that lock.
 */
 typedef struct share_mount_state {
 	share_mount_op_t	sm_op; /* OP_SHARE or OP_MOUNT */
 	boolean_t	sm_verbose; /* report progress after each filesystem */
 	int	sm_flags; /* passed through to share_mount_one() */
 	char	*sm_options; /* passed through to share_mount_one() */
 	char	*sm_proto; /* only valid for OP_SHARE */
 	pthread_mutex_t	sm_lock; /* protects the remaining fields */
 	uint_t	sm_total; /* number of filesystems to process */
 	uint_t	sm_done; /* number of filesystems processed */
 	int	sm_status; /* -1 if any of the share/mount operations failed */
 } share_mount_state_t;
 /*
 * Share or mount a dataset.
@ -6385,6 +6399,29 @@ report_mount_progress(int current, int total)
 		update_progress(info);
 }
 /*
 * Per-filesystem callback handed to zfs_foreach_mountpoint(): performs the
 * requested share or mount, then records the outcome and advances the
 * progress meter while holding sm_lock.
 */
 static int
 share_mount_one_cb(zfs_handle_t *zhp, void *arg)
 {
 	share_mount_state_t *state = arg;
 	int err = share_mount_one(zhp, state->sm_op, state->sm_flags,
 	    state->sm_proto, B_FALSE, state->sm_options);
 	/* The bookkeeping fields are shared; only touch them under the lock. */
 	pthread_mutex_lock(&state->sm_lock);
 	state->sm_done++;
 	if (err != 0)
 		state->sm_status = err;
 	if (state->sm_verbose)
 		report_mount_progress(state->sm_done, state->sm_total);
 	pthread_mutex_unlock(&state->sm_lock);
 	return (err);
 }
 static void
 append_options(char *mntopts, char *newopts)
 {
@ -6459,8 +6496,6 @@ share_mount(int op, int argc, char **argv)
 	/* check number of arguments */
 	if (do_all) {
 		zfs_handle_t **dslist = NULL;
 		size_t i, count = 0;
 		char *protocol = NULL;
 		if (op == OP_SHARE && argc > 0) {
@ -6481,27 +6516,35 @@ share_mount(int op, int argc, char **argv)
 		}
 		start_progress_timer();
-		get_all_datasets(&dslist, &count, verbose);
+		get_all_cb_t cb = { 0 };
 		get_all_datasets(&cb, verbose);
-		if (count == 0) {
+		if (cb.cb_used == 0) {
 			if (options != NULL)
 				free(options);
 			return (0);
 		}
-		qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
+		share_mount_state_t share_mount_state = { 0 };
 		share_mount_state.sm_op = op;
 		share_mount_state.sm_verbose = verbose;
 		share_mount_state.sm_flags = flags;
 		share_mount_state.sm_options = options;
 		share_mount_state.sm_proto = protocol;
 		share_mount_state.sm_total = cb.cb_used;
 		pthread_mutex_init(&share_mount_state.sm_lock, NULL);
-		for (i = 0; i < count; i++) {
+		/*
-			if (verbose)
+		 * libshare isn't mt-safe, so only do the operation in parallel
-				report_mount_progress(i, count);
+		 * if we're mounting.
 		 */
 		zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
 		    share_mount_one_cb, &share_mount_state, op == OP_MOUNT);
 		ret = share_mount_state.sm_status;
-			if (share_mount_one(dslist[i], op, flags, protocol,
+		for (int i = 0; i < cb.cb_used; i++)
-			    B_FALSE, options) != 0)
+			zfs_close(cb.cb_handles[i]);
-				ret = 1;
+		free(cb.cb_handles);
 			zfs_close(dslist[i]);
 		}
 		free(dslist);
 	} else if (argc == 0) {
 		struct mnttab entry;
--- a/include/libzfs.h
+++ b/include/libzfs.h
@ -573,12 +573,11 @@ typedef struct get_all_cb {
 	zfs_handle_t	**cb_handles;
 	size_t		cb_alloc;
 	size_t		cb_used;
 	boolean_t	cb_verbose;
 	int		(*cb_getone)(zfs_handle_t *, void *);
 } get_all_cb_t;
 void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t,
    zfs_iter_f, void *, boolean_t);
 void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
 int libzfs_dataset_cmp(const void *, const void *);
 /*
 * Functions to create and destroy datasets.
--- a/include/libzfs_impl.h
+++ b/include/libzfs_impl.h
@ -21,7 +21,7 @@
 /*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2018 Datto Inc.
 */
@ -60,6 +60,13 @@ struct libzfs_handle {
 	void *libzfs_sharehdl; /* libshare handle */
 	uint_t libzfs_shareflags;
 	boolean_t libzfs_mnttab_enable;
 	/*
 	 * We need a lock to handle the case where parallel mount
 	 * threads are populating the mnttab cache simultaneously. The
 	 * lock only protects the integrity of the avl tree, and does
 	 * not protect the contents of the mnttab entries themselves.
 	 */
 	pthread_mutex_t libzfs_mnttab_cache_lock;
 	avl_tree_t libzfs_mnttab_cache;
 	int libzfs_pool_iter;
 	char libzfs_chassis_id[256];
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@ -791,6 +791,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
 void
 libzfs_mnttab_init(libzfs_handle_t *hdl)
 {
 	pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
 	assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
 	avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
 	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
@ -849,6 +850,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
 		free(mtn);
 	}
 	avl_destroy(&hdl->libzfs_mnttab_cache);
 	(void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
 }
 void
@ -863,7 +865,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
 {
 	mnttab_node_t find;
 	mnttab_node_t *mtn;
-	int error;
+	int ret = ENOENT;
 	if (!hdl->libzfs_mnttab_enable) {
 		struct mnttab srch = { 0 };
@ -883,17 +885,24 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
 			return (ENOENT);
 	}
-	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
+	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
-		if ((error = libzfs_mnttab_update(hdl)) != 0)
+	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
 		int error;
 		if ((error = libzfs_mnttab_update(hdl)) != 0) {
 			pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 			return (error);
 		}
 	}
 	find.mtn_mt.mnt_special = (char *)fsname;
 	mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
 	if (mtn) {
 		*entry = mtn->mtn_mt;
-		return (0);
+		ret = 0;
 	}
-	return (ENOENT);
+	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 	return (ret);
 }
 void
@ -902,14 +911,23 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
 {
 	mnttab_node_t *mtn;
-	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
+	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
-		return;
+	if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) {
 		mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
 		mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
 		mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
 		mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
 		mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
 		/*
 		 * Another thread may have already added this entry
 		 * via libzfs_mnttab_update. If so we should skip it.
 		 */
 		if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL)
 			free(mtn);
 		else
 			avl_add(&hdl->libzfs_mnttab_cache, mtn);
 	}
 	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 }
 void
@ -918,6 +936,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
 	mnttab_node_t find;
 	mnttab_node_t *ret;
 	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
 	find.mtn_mt.mnt_special = (char *)fsname;
 	if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
 	    != NULL) {
@ -928,6 +947,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
 		free(ret->mtn_mt.mnt_mntopts);
 		free(ret);
 	}
 	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 }
 int
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@ -22,7 +22,7 @@
 /*
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
 * Copyright 2017 RackTop Systems.
 * Copyright (c) 2018 Datto Inc.
@ -84,11 +84,15 @@
 #include <libzfs.h>
 #include "libzfs_impl.h"
 #include <thread_pool.h>
 #include <libshare.h>
 #include <sys/systeminfo.h>
 #define	MAXISALEN	257	/* based on sysinfo(2) man page */
 static int mount_tp_nthr = 512;	/* tpool threads for multi-threaded mounting */
 static void zfs_mount_task(void *);
 static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
    zfs_share_proto_t);
@ -1146,25 +1150,32 @@ remove_mountpoint(zfs_handle_t *zhp)
 	}
 }
 /*
 * Add the given zfs handle to the cb_handles array, dynamically reallocating
 * the array if it is out of space.
 */
 void
 libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
 {
 	if (cbp->cb_alloc == cbp->cb_used) {
 		size_t newsz;
-		void *ptr;
+		zfs_handle_t **newhandles;
-		newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
+		newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
-		ptr = zfs_realloc(zhp->zfs_hdl,
+		newhandles = zfs_realloc(zhp->zfs_hdl,
-		    cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
+		    cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
-		    newsz * sizeof (void *));
+		    newsz * sizeof (zfs_handle_t *));
-		cbp->cb_handles = ptr;
+		cbp->cb_handles = newhandles;
 		cbp->cb_alloc = newsz;
 	}
 	cbp->cb_handles[cbp->cb_used++] = zhp;
 }
 /*
 * Recursive helper function used during file system enumeration
 */
 static int
-mount_cb(zfs_handle_t *zhp, void *data)
+zfs_iter_cb(zfs_handle_t *zhp, void *data)
 {
 	get_all_cb_t *cbp = data;
@ -1196,112 +1207,351 @@ mount_cb(zfs_handle_t *zhp, void *data)
 	}
 	libzfs_add_handle(cbp, zhp);
-	if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+	if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
 		zfs_close(zhp);
 		return (-1);
 	}
 	return (0);
 }
 /*
 * Sort comparator that compares two mountpoint paths. We sort these paths so
 * that subdirectories immediately follow their parents. This means that we
 * effectively treat the '/' character as the lowest value non-nul char. An
 * example sorted list using this comparator would look like:
 *
 * /foo
 * /foo/bar
 * /foo/bar/baz
 * /foo/baz
 * /foo.bar
 *
 * The mounting code depends on this ordering to deterministically iterate
 * over filesystems in order to spawn parallel mount tasks.
 */
 int
-libzfs_dataset_cmp(const void *a, const void *b)
+mountpoint_cmp(const void *arga, const void *argb)
 {
-	zfs_handle_t **za = (zfs_handle_t **)a;
+	zfs_handle_t *const *zap = arga;
-	zfs_handle_t **zb = (zfs_handle_t **)b;
+	zfs_handle_t *za = *zap;
 	zfs_handle_t *const *zbp = argb;
 	zfs_handle_t *zb = *zbp;
 	char mounta[MAXPATHLEN];
 	char mountb[MAXPATHLEN];
 	const char *a = mounta;
 	const char *b = mountb;
 	boolean_t gota, gotb;
-	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
+	gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
-		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
+	if (gota) {
 		verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
 		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
-	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
+	}
-		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
+	gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
 	if (gotb) {
 		verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
 		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
 	}
-	if (gota && gotb)
+	if (gota && gotb) {
-		return (strcmp(mounta, mountb));
+		while (*a != '\0' && (*a == *b)) {
 			a++;
 			b++;
 		}
 		if (*a == *b)
 			return (0);
 		if (*a == '\0')
 			return (-1);
 		if (*b == '\0')
 			return (1);
 		if (*a == '/')
 			return (-1);
 		if (*b == '/')
 			return (1);
 		return (*a < *b ? -1 : *a > *b);
 	}
 	if (gota)
 		return (-1);
 	if (gotb)
 		return (1);
-	return (strcmp(zfs_get_name(*za), zfs_get_name(*zb)));
+	/*
 	 * If neither filesystem has a mountpoint, revert to sorting by
 	 * dataset name.
 	 */
 	return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
 }
 /*
 * Return true if path2 is a child of path1.
 */
 static boolean_t
 libzfs_path_contains(const char *path1, const char *path2)
 {
 	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
 }
 /*
 * Scan forward from the entry after handles[idx] and return the index of
 * the first filesystem whose mountpoint is not a descendant of
 * handles[idx]'s mountpoint (num_handles if every remaining entry is a
 * descendant). Descendant paths start with the parent's path, and the scan
 * relies on the ordering enforced by mountpoint_cmp().
 */
 static int
 non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
 {
 	char ancestor[ZFS_MAXPROPLEN];
 	char candidate[ZFS_MAXPROPLEN];
 	int next = idx + 1;
 	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, ancestor,
 	    sizeof (ancestor), NULL, NULL, 0, B_FALSE) == 0);
 	while (next < num_handles) {
 		verify(zfs_prop_get(handles[next], ZFS_PROP_MOUNTPOINT,
 		    candidate, sizeof (candidate), NULL, NULL, 0,
 		    B_FALSE) == 0);
 		if (!libzfs_path_contains(ancestor, candidate))
 			return (next);
 		next++;
 	}
 	return (next);
 }
 /*
 * Argument bundle for one zfs_mount_task() invocation. It is allocated and
 * filled in by zfs_dispatch_mount() and handed to the task through the
 * thread pool.
 */
 typedef struct mnt_param {
 	libzfs_handle_t	*mnt_hdl;	/* used to allocate child task params */
 	tpool_t		*mnt_tp;	/* pool that child tasks are dispatched to */
 	zfs_handle_t	**mnt_zhps; /* filesystems to mount */
 	size_t		mnt_num_handles;	/* number of entries in mnt_zhps */
 	int		mnt_idx;	/* Index of selected entry to mount */
 	zfs_iter_f	mnt_func;	/* callback run on the selected entry */
 	void		*mnt_data;	/* opaque argument passed to mnt_func */
 } mnt_param_t;
 /*
 * Allocate and populate the parameter struct for the mount function, and
 * schedule mounting of the entry selected by idx on thread pool tp.
 *
 * If the task cannot be queued, it is run synchronously in the calling
 * thread instead, so the selected filesystem is still processed rather
 * than being silently skipped.
 */
 static void
 zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
     size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp)
 {
 	mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
 	mnt_param->mnt_hdl = hdl;
 	mnt_param->mnt_tp = tp;
 	mnt_param->mnt_zhps = handles;
 	mnt_param->mnt_num_handles = num_handles;
 	mnt_param->mnt_idx = idx;
 	mnt_param->mnt_func = func;
 	mnt_param->mnt_data = data;
 	if (tpool_dispatch(tp, zfs_mount_task, (void *)mnt_param) != 0) {
 		/* Could not queue the work; do it here rather than drop it. */
 		zfs_mount_task(mnt_param);
 	}
 }
 /*
 * This is the structure used to keep state of mounting or sharing operations
 * during a call to zpool_enable_datasets().
 */
 typedef struct mount_state {
 	/*
 	 * ms_mntstatus is set to -1 if any mount fails. While multiple threads
 	 * could update this variable concurrently, no synchronization is
 	 * needed as it's only ever set to -1.
 	 */
 	int		ms_mntstatus;
 	int		ms_mntflags;	/* flags argument for zfs_mount() */
 	const char	*ms_mntopts;	/* options argument for zfs_mount() */
 } mount_state_t;
 /*
 * zfs_foreach_mountpoint() callback: mount one filesystem with the options
 * and flags held in the mount_state_t passed as arg. A failed mount is
 * recorded in ms_mntstatus and reported to the caller as -1.
 */
 static int
 zfs_mount_one(zfs_handle_t *zhp, void *arg)
 {
 	mount_state_t *state = arg;
 	/*
 	 * Encrypted datasets whose keys are not loaded are not attempted;
 	 * they are skipped and treated as success.
 	 */
 	if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) ==
 	    ZFS_KEYSTATUS_UNAVAILABLE)
 		return (0);
 	if (zfs_mount(zhp, state->ms_mntopts, state->ms_mntflags) == 0)
 		return (0);
 	state->ms_mntstatus = -1;
 	return (-1);
 }
 /*
 * zfs_foreach_mountpoint() callback: share one filesystem. A failure is
 * recorded in the mount_state_t passed as arg and reported as -1.
 */
 static int
 zfs_share_one(zfs_handle_t *zhp, void *arg)
 {
 	mount_state_t *state = arg;
 	if (zfs_share(zhp) == 0)
 		return (0);
 	state->ms_mntstatus = -1;
 	return (-1);
 }
 /*
 * Thread pool function to mount one file system. On completion, it finds and
 * schedules its children to be mounted. This depends on the sorting done in
 * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
 * each descending from the previous) will have no parallelism since we always
 * have to wait for the parent to finish mounting before we can schedule
 * its children.
 *
 * arg is the mnt_param_t built by zfs_dispatch_mount(); this function owns
 * it and releases it on every path, including when the callback fails.
 */
 static void
 zfs_mount_task(void *arg)
 {
 	mnt_param_t *mp = arg;
 	int idx = mp->mnt_idx;
 	zfs_handle_t **handles = mp->mnt_zhps;
 	size_t num_handles = mp->mnt_num_handles;
 	char mountpoint[ZFS_MAXPROPLEN];
 	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
 	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
 	/*
 	 * If the callback fails, none of the descendants are dispatched,
 	 * but the parameter struct must still be freed.
 	 */
 	if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
 		goto out;
 	/*
 	 * We dispatch tasks to mount filesystems with mountpoints underneath
 	 * this one. We do this by dispatching the next filesystem with a
 	 * descendant mountpoint of the one we just mounted, then skip all of
 	 * its descendants, dispatch the next descendant mountpoint, and so on.
 	 * The non_descendant_idx() function skips over filesystems that are
 	 * descendants of the filesystem we just dispatched.
 	 */
 	for (int i = idx + 1; i < num_handles;
 	    i = non_descendant_idx(handles, num_handles, i)) {
 		char child[ZFS_MAXPROPLEN];
 		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
 		    child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
 		if (!libzfs_path_contains(mountpoint, child))
 			break; /* not a descendant, return */
 		zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
 		    mp->mnt_func, mp->mnt_data, mp->mnt_tp);
 	}
 out:
 	free(mp);
 }
 /*
 * Issue the func callback for each ZFS handle contained in the handles
 * array. This function is used to mount all datasets, and so this function
 * guarantees that filesystems for parent mountpoints are called before their
 * children. As such, before issuing any callbacks, we first sort the array
 * of handles by mountpoint.
 *
 * Callbacks are issued in one of two ways:
 *
 * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
 *    environment variable is set, then we issue callbacks sequentially. The
 *    same path is taken if the thread pool cannot be created. Every handle
 *    is visited and the callback's return value is ignored.
 *
 * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
 *    environment variable is not set, then we use a tpool to dispatch threads
 *    to mount filesystems in parallel. This function dispatches tasks to mount
 *    the filesystems at the top-level mountpoints, and these tasks in turn
 *    are responsible for recursively mounting filesystems in their children
 *    mountpoints.
 *
 * Note that the handles array is reordered in place by the sort.
 */
 void
 zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
     size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
 {
 	/*
 	 * The ZFS_SERIAL_MOUNT environment variable is an undocumented
 	 * variable that can be used as a convenience to do a/b comparison
 	 * of serial vs. parallel mounting.
 	 */
 	boolean_t serial_mount = !parallel ||
 	    (getenv("ZFS_SERIAL_MOUNT") != NULL);
 	tpool_t *tp = NULL;
 	/*
 	 * Sort the datasets by mountpoint. See mountpoint_cmp for details
 	 * of how these are sorted.
 	 */
 	qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
 	/*
 	 * Only create the thread pool when parallel mounting was requested.
 	 * A NULL pool (serial mode, or pool creation failed) selects the
 	 * sequential path below instead of dispatching to an invalid pool.
 	 */
 	if (!serial_mount)
 		tp = tpool_create(1, mount_tp_nthr, 0, NULL);
 	if (tp == NULL) {
 		for (int i = 0; i < num_handles; i++) {
 			func(handles[i], data);
 		}
 		return;
 	}
 	/*
 	 * There may be multiple "top level" mountpoints outside of the pool's
 	 * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
 	 * these.
 	 */
 	for (int i = 0; i < num_handles;
 	    i = non_descendant_idx(handles, num_handles, i)) {
 		zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
 		    tp);
 	}
 	tpool_wait(tp);	/* wait for all scheduled mounts to complete */
 	tpool_destroy(tp);
 }
 /*
 * Mount and share all datasets within the given pool.  This assumes that no
- * datasets within the pool are currently mounted.  Because users can create
+ * datasets within the pool are currently mounted.
 * complicated nested hierarchies of mountpoints, we first gather all the
 * datasets and mountpoints within the pool, and sort them by mountpoint.  Once
 * we have the list of all filesystems, we iterate over them in order and mount
 * and/or share each one.
 */
 #pragma weak zpool_mount_datasets = zpool_enable_datasets
 int
 zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 {
 	get_all_cb_t cb = { 0 };
-	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	mount_state_t ms = { 0 };
 	zfs_handle_t *zfsp;
-	int i, ret = -1;
+	int ret = 0;
 	int *good;
-	/*
+	if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
-	 * Gather all non-snap datasets within the pool.
+	    ZFS_TYPE_DATASET)) == NULL)
 	 */
 	if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
 		goto out;
 	/*
 	 * Gather all non-snapshot datasets within the pool. Start by adding
 	 * the root filesystem for this pool to the list, and then iterate
 	 * over all child filesystems.
 	 */
 	libzfs_add_handle(&cb, zfsp);
-	if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
+	if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
 		goto out;
 	/*
 	 * Sort the datasets by mountpoint.
 	 */
 	qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
 	    libzfs_dataset_cmp);
 	/*
 	 * And mount all the datasets, keeping track of which ones
 	 * succeeded or failed.
 	 */
 	if ((good = zfs_alloc(zhp->zpool_hdl,
 	    cb.cb_used * sizeof (int))) == NULL)
 		goto out;
 	ret = 0;
 	for (i = 0; i < cb.cb_used; i++) {
 	/*
-		 * don't attempt to mount encrypted datasets with
+	 * Mount all filesystems
 		 * unloaded keys
 	 */
-		if (zfs_prop_get_int(cb.cb_handles[i], ZFS_PROP_KEYSTATUS) ==
+	ms.ms_mntopts = mntopts;
-		    ZFS_KEYSTATUS_UNAVAILABLE)
+	ms.ms_mntflags = flags;
-			continue;
+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
-
+	    zfs_mount_one, &ms, B_TRUE);
-		if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
+	if (ms.ms_mntstatus != 0)
-			ret = -1;
+		ret = ms.ms_mntstatus;
 		else
 			good[i] = 1;
 	}
 	/*
-	 * Then share all the ones that need to be shared. This needs
+	 * Share all filesystems that need to be shared. This needs to be
-	 * to be a separate pass in order to avoid excessive reloading
+	 * a separate pass because libshare is not mt-safe, and so we need
-	 * of the configuration. Good should never be NULL since
+	 * to share serially.
 	 * zfs_alloc is supposed to exit if memory isn't available.
 	 */
-	for (i = 0; i < cb.cb_used; i++) {
+	ms.ms_mntstatus = 0;
-		if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
-			ret = -1;
+	    zfs_share_one, &ms, B_FALSE);
-	}
+	if (ms.ms_mntstatus != 0)
-
+		ret = ms.ms_mntstatus;
 	free(good);
 out:
-	for (i = 0; i < cb.cb_used; i++)
+	for (int i = 0; i < cb.cb_used; i++)
 		zfs_close(cb.cb_handles[i]);
 	free(cb.cb_handles);
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@ -181,7 +181,7 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
    'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg',
    'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg',
    'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
-    'zfs_multi_mount']
+    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints']
 tags = ['functional', 'cli_root', 'zfs_mount']
 [tests/functional/cli_root/zfs_program]
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
@ -14,8 +14,10 @@ dist_pkgdata_SCRIPTS = \
 	zfs_mount_010_neg.ksh \
 	zfs_mount_011_neg.ksh \
 	zfs_mount_012_neg.ksh \
 	zfs_mount_encrypted.ksh \
 	zfs_mount_all_001_pos.ksh \
 	zfs_mount_all_fail.ksh \
 	zfs_mount_all_mountpoints.ksh \
 	zfs_mount_encrypted.ksh \
 	zfs_mount_remount.ksh \
 	zfs_multi_mount.ksh
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
@ -25,7 +25,7 @@
 #
 #
-# Copyright (c) 2016 by Delphix. All rights reserved.
+# Copyright (c) 2017 by Delphix. All rights reserved.
 #
 . $STF_SUITE/include/libtest.shlib
@ -84,14 +84,12 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev
 	fi
 	case "$type" in
-		'ctr')	log_must zfs create $pool/$fs
+		'ctr')	log_must zfs create -o mountpoint=$mntpoint $pool/$fs
 			log_must zfs set mountpoint=$mntpoint $pool/$fs
 			;;
 		'vol')	log_must zfs create -V $VOLSIZE $pool/$fs
 			block_device_wait
 			;;
-		*)	log_must zfs create $pool/$fs
+		*)	log_must zfs create -o mountpoint=$mntpoint $pool/$fs
 			log_must zfs set mountpoint=$mntpoint $pool/$fs
 			;;
 	esac
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh
@ -0,0 +1,96 @@
 #!/bin/ksh -p
 #
 # CDDL HEADER START
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # CDDL HEADER END
 #
 #
 # Copyright (c) 2017 by Delphix. All rights reserved.
 #
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
 # DESCRIPTION:
 #       Verify that if 'zfs mount -a' fails to mount one filesystem,
 #       the command fails with a non-zero error code, but all other
 #       filesystems are mounted.
 #
 # STRATEGY:
 #       1. Create zfs filesystems
 #       2. Unmount a leaf filesystem
 #       3. Create a file in the above filesystem's mountpoint
 #       4. Verify that 'zfs mount -a' fails to mount the above
 #       5. Verify that all other filesystems were mounted
 #
 verify_runnable "both"
 typeset -a filesystems
 typeset path=${TEST_BASE_DIR%%/}/testroot$$/$TESTPOOL
 typeset fscount=10
 #
 # Create $fscount filesystems named 0 .. fscount-1 in $TESTPOOL, giving
 # filesystem <i> the mountpoint $path/<i>, then list the resulting datasets.
 #
 function setup_all
 {
 	# Create $fscount filesystems at the top level of $path
 	for ((i=0; i<$fscount; i++)); do
 		setup_filesystem "$DISKS" "$TESTPOOL" $i "$path/$i" ctr
 	done
 	zfs list -r $TESTPOOL
 	return 0
 }
 #
 # Run 'zfs $unmountall' with __ZFS_POOL_RESTRICT set to $TESTPOOL
 # (presumably this confines the operation to the test pool -- confirm),
 # then remove the testroot$$ directory tree if it exists.
 #
 function cleanup_all
 {
 	export __ZFS_POOL_RESTRICT="$TESTPOOL"
 	log_must zfs $unmountall
 	unset __ZFS_POOL_RESTRICT
 	[[ -d ${TEST_BASE_DIR%%/}/testroot$$ ]] && \
 		rm -rf ${TEST_BASE_DIR%%/}/testroot$$
 }
 log_onexit cleanup_all
 log_must setup_all
 #
 # Unmount all of the above so that we can create the stray file
 # in one of the mountpoint directories.
 #
 export __ZFS_POOL_RESTRICT="$TESTPOOL"
 log_must zfs $unmountall
 unset __ZFS_POOL_RESTRICT
 # All of our filesystems should be unmounted at this point
 for ((i=0; i<$fscount; i++)); do
 	log_mustnot mounted "$TESTPOOL/$i"
 done
 # Create a stray file in one filesystem's mountpoint
 touch $path/0/strayfile
 # Verify that zfs mount -a fails
 export __ZFS_POOL_RESTRICT="$TESTPOOL"
 log_mustnot zfs $mountall
 unset __ZFS_POOL_RESTRICT
 # All filesystems except for "0" should be mounted
 log_mustnot mounted "$TESTPOOL/0"
 for ((i=1; i<$fscount; i++)); do
 	log_must mounted "$TESTPOOL/$i"
 done
 log_pass "'zfs $mountall' failed as expected."
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh
@ -0,0 +1,162 @@
 #!/bin/ksh -p
 #
 # CDDL HEADER START
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # CDDL HEADER END
 #
 #
 # Copyright (c) 2017 by Delphix. All rights reserved.
 #
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
 # DESCRIPTION:
 #       Verify that 'zfs mount -a' succeeds given a set of filesystems
 #       whose mountpoints have a parent/child relationship which is
 #       counter to the filesystem parent/child relationship.
 #
 # STRATEGY:
 #       1. Create zfs filesystems within the given pool.
 #       2. Unmount all the filesystems.
 #       3. Verify that the 'zfs mount -a' command succeeds,
 #	   and all available ZFS filesystems are mounted.
 #	4. Verify that 'zfs mount' is identical with 'df -F zfs'
 #
 verify_runnable "both"
 typeset -a filesystems
 #
 # Build a chain of nested filesystems (0, 0/1, 0/1/2, ...), recording their
 # names in the global 'filesystems' array, unmount them, and then reassign
 # the mountpoints in reverse so that the deepest filesystem gets the
 # shallowest mountpoint and vice versa.
 #
 function setup_all
 {
 	typeset path=${TEST_BASE_DIR%%/}/testroot$$/$TESTPOOL
 	typeset fscount=10
 	#
 	# Generate an array of filesystem names that represent a deep
 	# hierarchy as such:
 	#
 	# 0
 	# 0/1
 	# 0/1/2
 	# 0/1/2/3
 	# 0/1/2/3/4
 	# ...
 	#
 	fs=0
 	for ((i=0; i<$fscount; i++)); do
 		if [[ $i -gt 0 ]]; then
 			fs=$fs/$i
 		fi
 		filesystems+=($fs)
 	done
 	# Create all of the above filesystems
 	for ((i=0; i<$fscount; i++)); do
 		fs=${filesystems[$i]}
 		setup_filesystem "$DISKS" "$TESTPOOL" "$fs" "$path/$i" ctr
 	done
 	zfs list -r $TESTPOOL
 	#
 	# Unmount all of the above so that we can setup our convoluted
 	# mount paths.
 	#
 	export __ZFS_POOL_RESTRICT="$TESTPOOL"
 	log_must zfs $unmountall
 	unset __ZFS_POOL_RESTRICT
 	#
 	# Configure the mount paths so that each mountpoint is contained
 	# in a child filesystem. We should end up with something like the
 	# following structure (modulo the number of filesystems):
 	#
 	# NAME                       MOUNTPOINT
 	# testpool                   /testpool
 	# testpool/0                 /testroot25416/testpool/0/1/2/3/4/5/6
 	# testpool/0/1               /testroot25416/testpool/0/1/2/3/4/5
 	# testpool/0/1/2             /testroot25416/testpool/0/1/2/3/4
 	# testpool/0/1/2/3           /testroot25416/testpool/0/1/2/3
 	# testpool/0/1/2/3/4         /testroot25416/testpool/0/1/2
 	# testpool/0/1/2/3/4/5       /testroot25416/testpool/0/1
 	# testpool/0/1/2/3/4/5/6     /testroot25416/testpool/0
 	#
 	# Filesystem number (fscount - i - 1) receives the i-th path.
 	for ((i=0; i<$fscount; i++)); do
 		fs=$TESTPOOL/${filesystems[$(($fscount - $i - 1))]}
 		mnt=$path/${filesystems[$i]}
 		zfs set mountpoint=$mnt $fs
 	done
 	zfs list -r $TESTPOOL
 	return 0
 }
 #
 # Run 'zfs $unmountall' with __ZFS_POOL_RESTRICT set to $TESTPOOL
 # (presumably this confines the operation to the test pool -- confirm),
 # call cleanup_filesystem for every entry recorded in 'filesystems', and
 # finally remove the testroot$$ directory tree if it exists.
 #
 function cleanup_all
 {
 	export __ZFS_POOL_RESTRICT="$TESTPOOL"
 	log_must zfs $unmountall
 	unset __ZFS_POOL_RESTRICT
 	for fs in ${filesystems[@]}; do
 		cleanup_filesystem "$TESTPOOL" "$fs"
 	done
 	[[ -d ${TEST_BASE_DIR%%/}/testroot$$ ]] && \
 		rm -rf ${TEST_BASE_DIR%%/}/testroot$$
 }
 #
 # This function takes a single true/false argument. If true it will verify that
 # all file systems are mounted. If false it will verify that they are not
 # mounted.
 #
 function verify_all
 {
 	# $1 is executed as a command, so it must be the word true or false.
 	if $1; then
 		logfunc=log_must
 	else
 		logfunc=log_mustnot
 	fi
 	# Apply the selected assertion to every filesystem created by setup_all.
 	for fs in ${filesystems[@]}; do
 		$logfunc mounted "$TESTPOOL/$fs"
 	done
 	return 0
 }
 log_onexit cleanup_all
 log_must setup_all
 export __ZFS_POOL_RESTRICT="$TESTPOOL"
 log_must zfs $unmountall
 unset __ZFS_POOL_RESTRICT
 verify_all false
 export __ZFS_POOL_RESTRICT="$TESTPOOL"
 log_must zfs $mountall
 unset __ZFS_POOL_RESTRICT
 verify_all true
 log_note "Verify that 'zfs $mountcmd' will display " \
 	"all ZFS filesystems currently mounted."
 verify_mount_display
 log_pass "'zfs $mountall' succeeds as root, " \
 	"and all available ZFS filesystems are mounted."