From 097150e1b787fe3c6ac4fa882f8b040e6d37288a Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jul 2010 13:48:28 -0700 Subject: [PATCH 1/5] Minor device creation/removal (User) With the update to onnv_141 how minor devices were created and removed for ZVOL was substantially changed. The updated system is much more tightly integrated with Solaris's /dev/ filesystem. This is great for Solaris but bad for Linux. This patch keeps as much of those changes as possible which as useful for Linux. But it also reverts back to use the old system of explicit minor node creation via a decicated ioctl(). This worked well in zfs-0.4.9 and it should continue to work well. --- lib/libzfs/include/libzfs_impl.h | 2 + lib/libzfs/libzfs_dataset.c | 321 +++++++++++++++++++++++++++++-- lib/libzfs/libzfs_pool.c | 2 +- lib/libzfs/libzfs_sendrecv.c | 12 +- 4 files changed, 318 insertions(+), 19 deletions(-) diff --git a/lib/libzfs/include/libzfs_impl.h b/lib/libzfs/include/libzfs_impl.h index 89c48c1c03..98e81cacf3 100644 --- a/lib/libzfs/include/libzfs_impl.h +++ b/lib/libzfs/include/libzfs_impl.h @@ -186,6 +186,8 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); +int zvol_create_link(libzfs_handle_t *, const char *); +int zvol_remove_link(libzfs_handle_t *, const char *); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); void namespace_clear(libzfs_handle_t *); diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index fdd9ad4939..064645a2a1 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -56,6 +56,7 @@ #include "libzfs_impl.h" #include "zfs_deleg.h" +static int zvol_create_link_common(libzfs_handle_t *, const char *, int); static int userquota_propname_decode(const char *propname, boolean_t zoned, zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); @@ -2845,6 +2846,18 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, /* create the dataset */ ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); + if (ret == 0 && type == ZFS_TYPE_VOLUME) { + ret = zvol_create_link(hdl, path); + if (ret) { + (void) zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, + "Volume successfully created, but device links " + "were not created")); + zcmd_free_nvlists(&zc); + return (-1); + } + } + zcmd_free_nvlists(&zc); /* check for failure */ @@ -2907,6 +2920,9 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t defer) (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (ZFS_IS_VOLUME(zhp)) { + if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) + return (-1); + zc.zc_objset_type = DMU_OST_ZVOL; } else { zc.zc_objset_type = DMU_OST_ZFS; @@ -2949,9 +2965,17 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) zfs_close(szhp); } + if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + (void) zvol_remove_link(zhp->zfs_hdl, name); + /* + * NB: this is simply a best-effort. We don't want to + * return an error, because then we wouldn't visit all + * the volumes. + */ + } + dd->closezhp = B_TRUE; - if (!dd->gotone) - rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); if (closezhp) zfs_close(zhp); return (rv); @@ -3086,11 +3110,70 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } + } else if (ZFS_IS_VOLUME(zhp)) { + ret = zvol_create_link(zhp->zfs_hdl, target); } return (ret); } +typedef struct promote_data { + char cb_mountpoint[MAXPATHLEN]; + const char *cb_target; + const char *cb_errbuf; + uint64_t cb_pivot_txg; +} promote_data_t; + +static int +promote_snap_cb(zfs_handle_t *zhp, void *data) +{ + promote_data_t *pd = data; + zfs_handle_t *szhp; + char snapname[MAXPATHLEN]; + int rv = 0; + + /* We don't care about snapshots after the pivot point */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) { + zfs_close(zhp); + return (0); + } + + /* Remove the device link if it's a zvol. */ + if (ZFS_IS_VOLUME(zhp)) + (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name); + + /* Check for conflicting names */ + (void) strlcpy(snapname, pd->cb_target, sizeof (snapname)); + (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname)); + szhp = make_dataset_handle(zhp->zfs_hdl, snapname); + if (szhp != NULL) { + zfs_close(szhp); + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "snapshot name '%s' from origin \n" + "conflicts with '%s' from target"), + zhp->zfs_name, snapname); + rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf); + } + zfs_close(zhp); + return (rv); +} + +static int +promote_snap_done_cb(zfs_handle_t *zhp, void *data) +{ + promote_data_t *pd = data; + + /* We don't care about snapshots after the pivot point */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) { + /* Create the device link if it's a zvol. */ + if (ZFS_IS_VOLUME(zhp)) + (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); + } + + zfs_close(zhp); + return (0); +} + /* * Promotes the given clone fs to be the clone parent. */ @@ -3100,7 +3183,10 @@ zfs_promote(zfs_handle_t *zhp) libzfs_handle_t *hdl = zhp->zfs_hdl; zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; char parent[MAXPATHLEN]; + char *cp; int ret; + zfs_handle_t *pzhp; + promote_data_t pd; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -3118,7 +3204,29 @@ zfs_promote(zfs_handle_t *zhp) "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } + cp = strchr(parent, '@'); + *cp = '\0'; + /* Walk the snapshots we will be moving */ + pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); + if (pzhp == NULL) + return (-1); + pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG); + zfs_close(pzhp); + pd.cb_target = zhp->zfs_name; + pd.cb_errbuf = errbuf; + pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET); + if (pzhp == NULL) + return (-1); + (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint, + sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE); + ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd); + if (ret != 0) { + zfs_close(pzhp); + return (-1); + } + + /* issue the ioctl */ (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); @@ -3127,9 +3235,16 @@ zfs_promote(zfs_handle_t *zhp) if (ret != 0) { int save_errno = errno; + (void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd); + zfs_close(pzhp); + switch (save_errno) { case EEXIST: - /* There is a conflicting snapshot name. */ + /* + * There is a conflicting snapshot name. We + * should have caught this above, but they could + * have renamed something in the mean time. + */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "conflicting snapshot '%s' from parent '%s'"), zc.zc_string, parent); @@ -3138,7 +3253,44 @@ zfs_promote(zfs_handle_t *zhp) default: return (zfs_standard_error(hdl, save_errno, errbuf)); } + } else { + (void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd); } + + zfs_close(pzhp); + return (ret); +} + +struct createdata { + const char *cd_snapname; + int cd_ifexists; +}; + +static int +zfs_create_link_cb(zfs_handle_t *zhp, void *arg) +{ + struct createdata *cd = arg; + int ret; + + if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + char name[MAXPATHLEN]; + + (void) strlcpy(name, zhp->zfs_name, sizeof (name)); + (void) strlcat(name, "@", sizeof (name)); + (void) strlcat(name, cd->cd_snapname, sizeof (name)); + (void) zvol_create_link_common(zhp->zfs_hdl, name, + cd->cd_ifexists); + /* + * NB: this is simply a best-effort. We don't want to + * return an error, because then we wouldn't visit all + * the volumes. + */ + } + + ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd); + + zfs_close(zhp); + return (ret); } @@ -3202,11 +3354,31 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, * if it was recursive, the one that actually failed will be in * zc.zc_name. */ - if (ret != 0) { + if (ret != 0) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); - (void) zfs_standard_error(hdl, errno, errbuf); + + if (ret == 0 && recursive) { + struct createdata cd; + + cd.cd_snapname = delim + 1; + cd.cd_ifexists = B_FALSE; + (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd); } + if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) { + ret = zvol_create_link(zhp->zfs_hdl, path); + if (ret != 0) { + (void) zfs_standard_error(hdl, errno, + dgettext(TEXT_DOMAIN, + "Volume successfully snapshotted, but device links " + "were not created")); + zfs_close(zhp); + return (-1); + } + } + + if (ret != 0) + (void) zfs_standard_error(hdl, errno, errbuf); zfs_close(zhp); @@ -3309,6 +3481,8 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) + return (-1); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); @@ -3346,6 +3520,10 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { + if ((err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name))) { + zfs_close(zhp); + return (err); + } if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) @@ -3494,6 +3672,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } if (recursive) { + struct destroydata dd; parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { @@ -3508,6 +3687,15 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) goto error; } + dd.snapname = delim + 1; + dd.gotone = B_FALSE; + dd.closezhp = B_TRUE; + + /* We remove any zvol links prior to renaming them */ + ret = zfs_iter_filesystems(zhrp, zfs_check_snap_cb, &dd); + if (ret) { + goto error; + } } else { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) return (-1); @@ -3556,10 +3744,27 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. */ - if (!recursive) + if (recursive) { + struct createdata cd; + + /* only create links for datasets that had existed */ + cd.cd_snapname = delim + 1; + cd.cd_ifexists = B_TRUE; + (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb, + &cd); + } else { (void) changelist_postfix(cl); + } } else { - if (!recursive) { + if (recursive) { + struct createdata cd; + + /* only create links for datasets that had existed */ + cd.cd_snapname = strchr(target, '@') + 1; + cd.cd_ifexists = B_TRUE; + ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb, + &cd); + } else { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } @@ -3578,21 +3783,105 @@ error: return (ret); } +/* + * Given a zvol dataset, issue the ioctl to create the appropriate minor node, + * and wait briefly for udev to create the /dev link. + */ +int +zvol_create_link(libzfs_handle_t *hdl, const char *dataset) +{ + return (zvol_create_link_common(hdl, dataset, B_FALSE)); +} + +static int +zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + char path[MAXPATHLEN]; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + + /* + * Issue the appropriate ioctl. + */ + if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) { + switch (errno) { + case EEXIST: + /* + * Silently ignore the case where the link already + * exists. This allows 'zfs volinit' to be run multiple + * times without errors. + */ + return (0); + + case ENOENT: + /* + * Dataset does not exist in the kernel. If we + * don't care (see zfs_rename), then ignore the + * error quietly. + */ + if (ifexists) { + return (0); + } + + /* FALLTHROUGH */ + + default: + return (zfs_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot create device links " + "for '%s'"), dataset)); + } + } + + /* + * Wait up to 10 seconds for udev to create the device. + */ + (void) snprintf(path, sizeof (path), "%s/%s", ZVOL_DIR, dataset); + error = zpool_label_disk_wait(path, 10000); + if (error) + (void) printf(gettext("%s may not be immediately " + "available\n"), path); + + return (0); +} + +/* + * Remove a minor node for the given zvol and the associated /dev links. + */ +int +zvol_remove_link(libzfs_handle_t *hdl, const char *dataset) +{ + zfs_cmd_t zc = { "\0", "\0", "\0", "\0", 0 }; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) { + switch (errno) { + case ENXIO: + /* + * Silently ignore the case where the link no longer + * exists, so that 'zfs volfini' can be run multiple + * times without errors. + */ + return (0); + + default: + return (zfs_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot remove device " + "links for '%s'"), dataset)); + } + } + + return (0); +} + nvlist_t * zfs_get_user_props(zfs_handle_t *zhp) { return (zhp->zfs_user_props); } -nvlist_t * -zfs_get_recvd_props(zfs_handle_t *zhp) -{ - if (zhp->zfs_recvd_props == NULL) - if (get_recvd_props_ioctl(zhp) != 0) - return (NULL); - return (zhp->zfs_recvd_props); -} - /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 04a5cbad7a..b85ac7bf25 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3577,7 +3577,7 @@ zpool_label_disk_check(char *path) (void) close(fd); return EIDRM; } - + efi_free(vtoc); (void) close(fd); return 0; diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 10aafb8c25..efec76afd2 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2559,6 +2559,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (-1); } } + if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME && + zvol_remove_link(hdl, zhp->zfs_name) != 0) { + zfs_close(zhp); + zcmd_free_nvlists(&zc); + return (-1); + } zfs_close(zhp); } else { /* @@ -2752,7 +2758,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * children of the target filesystem if we did a replication * receive (indicated by stream_avl being non-NULL). */ -#ifdef HAVE_ZPL cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { zfs_handle_t *h; @@ -2763,6 +2768,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; + err = zvol_create_link(hdl, h->zfs_name); + if (err == 0 && ioctl_err == 0) + err = zvol_create_link(hdl, + zc.zc_value); } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, @@ -2775,7 +2784,6 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } *cp = '@'; } -#endif /* HAVE_ZPL */ if (clp) { err |= changelist_postfix(clp); From f162433deb029769f76db8cc216cad9b4f7356fb Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jul 2010 13:53:44 -0700 Subject: [PATCH 2/5] Minor device creation/removal (Kernel) With the update to onnv_141 how minor devices were created and removed for ZVOL was substantially changed. The updated system is much more tightly integrated with Solaris's /dev/ filesystem. This is great for Solaris but bad for Linux. On the kernel side the ZFS_IOC_{CREATE,REMOVE}_MINOR ioctl entry points have been re-added. They now call directly in to the ZVOL to create the needed minor node and add the sysfs entried for udev. Also as part of this change I've decided it would really be best if all the zvols were in a /dev/zvol directory like on Solaris. Organizationally this makes sense and on the code side it allows us to know a block device is a zvol simply by where it is located in /dev/. Unless Solaris there still is to ./dsk or ./rdsk as part of the path. --- module/zcommon/include/sys/fs/zfs.h | 20 ++++--- module/zfs/include/sys/zvol.h | 2 +- module/zfs/vdev.c | 7 --- module/zfs/zfs_ioctl.c | 62 +++++++++++++--------- module/zfs/zvol.c | 82 +++++++++++++++-------------- 5 files changed, 94 insertions(+), 79 deletions(-) diff --git a/module/zcommon/include/sys/fs/zfs.h b/module/zcommon/include/sys/fs/zfs.h index 243a0dada0..2f3c48b6e8 100644 --- a/module/zcommon/include/sys/fs/zfs.h +++ b/module/zcommon/include/sys/fs/zfs.h @@ -686,13 +686,17 @@ typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; -#define ZVOL_DRIVER "zvol" -#define ZFS_DRIVER "zfs" -#define ZFS_DEV "/dev/zfs" -#define ZVOL_MAJOR 230 -#define ZVOL_MINOR_BITS 4 -#define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) -#define ZVOL_MINORS (1 << 4) +#define ZVOL_DRIVER "zvol" +#define ZFS_DRIVER "zfs" +#define ZFS_DEV "/dev/zfs" + +/* general zvol path */ +#define ZVOL_DIR "/dev/zvol" + +#define ZVOL_MAJOR 230 +#define ZVOL_MINOR_BITS 4 +#define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) +#define ZVOL_MINORS (1 << 4) #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 @@ -726,6 +730,8 @@ typedef enum zfs_ioc { ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, + ZFS_IOC_CREATE_MINOR, + ZFS_IOC_REMOVE_MINOR, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, diff --git a/module/zfs/include/sys/zvol.h b/module/zfs/include/sys/zvol.h index ed12c923de..c8b9d65071 100644 --- a/module/zfs/include/sys/zvol.h +++ b/module/zfs/include/sys/zvol.h @@ -42,7 +42,7 @@ extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); extern int zvol_create_minor(const char *); extern int zvol_create_minors(const char *); extern int zvol_remove_minor(const char *); -extern int zvol_remove_minors(const char *); +extern void zvol_remove_minors(const char *); extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 14de500fa7..e1018ccc6a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1072,12 +1072,6 @@ vdev_open_child(void *arg) boolean_t vdev_uses_zvols(vdev_t *vd) { -/* - * NOTE: Disabled because under Linux I've choosen not to put all the zvols - * in their own directory. This could be changed or this code can be updated - * to perhap run an ioctl() on the vdev path to determine if it is a zvol. - */ -#if 0 int c; if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, @@ -1086,7 +1080,6 @@ vdev_uses_zvols(vdev_t *vd) for (c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); -#endif return (B_FALSE); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 1befd58631..f29c5edaf4 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1112,8 +1112,9 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); - (void) zvol_remove_minors(zc->zc_name); error = spa_destroy(zc->zc_name); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } @@ -1144,7 +1145,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) error = spa_import(zc->zc_name, config, props); if (error == 0) - error = zvol_create_minors(zc->zc_name); + zvol_create_minors(zc->zc_name); if (zc->zc_nvlist_dst != 0) (void) put_nvlist(zc, config); @@ -1165,10 +1166,9 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); - error = zvol_remove_minors(zc->zc_name); + error = spa_export(zc->zc_name, NULL, force, hardforce); if (error == 0) - error = spa_export(zc->zc_name, NULL, force, hardforce); - + zvol_remove_minors(zc->zc_name); return (error); } @@ -2438,6 +2438,30 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ +static int +zfs_ioc_create_minor(struct file *filp, zfs_cmd_t *zc) +{ + return (zvol_create_minor(zc->zc_name)); +} + +/* + * inputs: + * zc_name name of volume + * + * outputs: none + */ +static int +zfs_ioc_remove_minor(struct file *filp, zfs_cmd_t *zc) +{ + return (zvol_remove_minor(zc->zc_name)); +} + /* * inputs: * zc_name name of filesystem @@ -2834,18 +2858,9 @@ zfs_ioc_create(zfs_cmd_t *zc) if (error == 0) { error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, nvprops, NULL); - if (error != 0) { + if (error != 0) (void) dmu_objset_destroy(zc->zc_name, B_FALSE); - goto out; - } - - if (type == DMU_OST_ZVOL) { - error = zvol_create_minor(zc->zc_name); - if (error != 0) - (void) dmu_objset_destroy(zc->zc_name, B_FALSE); - } } -out: nvlist_free(nvprops); return (error); } @@ -2968,7 +2983,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc) err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) - err = zvol_remove_minor(zc->zc_name); + (void) zvol_remove_minor(zc->zc_name); return (err); } @@ -3068,7 +3083,6 @@ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; - int err; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || @@ -3082,16 +3096,12 @@ zfs_ioc_rename(zfs_cmd_t *zc) */ if (!recursive && strchr(zc->zc_name, '@') != NULL && zc->zc_objset_type == DMU_OST_ZFS) { - err = zfs_unmount_snap(zc->zc_name, NULL); + int err = zfs_unmount_snap(zc->zc_name, NULL); if (err) return (err); } - if (zc->zc_objset_type == DMU_OST_ZVOL) { - err = zvol_remove_minor(zc->zc_name); - if (err) - return (err); - } - + if (zc->zc_objset_type == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } @@ -4315,6 +4325,10 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, B_TRUE }, { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_create_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_remove_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE, + B_FALSE }, { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, B_TRUE}, diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 2e16e60c23..fa94cd433c 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -57,6 +57,7 @@ static char *zvol_tag = "zvol_tag"; * The in-core state of each volume. */ typedef struct zvol_state { + char zv_name[DISK_NAME_LEN]; /* name */ uint64_t zv_volsize; /* advertised space */ uint64_t zv_volblocksize;/* volume block size */ objset_t *zv_objset; /* objset handle */ @@ -130,7 +131,7 @@ zvol_find_by_name(const char *name) ASSERT(MUTEX_HELD(&zvol_state_lock)); for (zv = list_head(&zvol_state_list); zv != NULL; zv = list_next(&zvol_state_list, zv)) { - if (!strncmp(zv->zv_disk->disk_name, name, DISK_NAME_LEN)) + if (!strncmp(zv->zv_name, name, DISK_NAME_LEN)) return zv; } @@ -757,11 +758,10 @@ zvol_first_open(zvol_state_t *zv) objset_t *os; uint64_t volsize; int error; - uint64_t readonly; + uint64_t ro; /* lie and say we're read-only */ - error = dmu_objset_own(zv->zv_disk->disk_name, - DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zvol_tag, &os); if (error) return (-error); @@ -782,9 +782,8 @@ zvol_first_open(zvol_state_t *zv) zv->zv_volsize = volsize; zv->zv_zilog = zil_open(os, zvol_get_data); - VERIFY(dsl_prop_get_integer(zv->zv_disk->disk_name, - "readonly", &readonly, NULL) == 0); - if (readonly || dmu_objset_is_snapshot(os)) { + VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL) == 0); + if (ro || dmu_objset_is_snapshot(os)) { set_disk_ro(zv->zv_disk, 1); zv->zv_flags |= ZVOL_RDONLY; } else { @@ -1031,6 +1030,7 @@ zvol_alloc(dev_t dev, const char *name) zv->zv_queue->queuedata = zv; zv->zv_dev = dev; zv->zv_open_count = 0; + strlcpy(zv->zv_name, name, DISK_NAME_LEN); mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, @@ -1043,7 +1043,7 @@ zvol_alloc(dev_t dev, const char *name) zv->zv_disk->fops = &zvol_ops; zv->zv_disk->private_data = zv; zv->zv_disk->queue = zv->zv_queue; - strlcpy(zv->zv_disk->disk_name, name, DISK_NAME_LEN); + snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "zvol/%s", name); return zv; @@ -1173,31 +1173,25 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, return zvol_create_minor(dsname); } -static int -zvol_remove_minors_cb(spa_t *spa, uint64_t dsobj, - const char *dsname, void *arg) -{ - if (strchr(dsname, '/') == NULL) - return 0; - - return zvol_remove_minor(dsname); -} - -static int -zvol_cr_minors_common(const char *pool, - int func(spa_t *, uint64_t, const char *, void *), void *arg) +/* + * Create minors for specified pool, if pool is NULL create minors + * for all available pools. + */ +int +zvol_create_minors(const char *pool) { spa_t *spa = NULL; int error = 0; if (pool) { - error = dmu_objset_find_spa(NULL, pool, func, arg, - DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb, + NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); } else { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - error = dmu_objset_find_spa(NULL, spa_name(spa), - func, arg, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + error = dmu_objset_find_spa(NULL, + spa_name(spa), zvol_create_minors_cb, NULL, + DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error) break; } @@ -1208,21 +1202,31 @@ zvol_cr_minors_common(const char *pool, } /* - * Create minors for specified pool, or if NULL create all minors. + * Remove minors for specified pool, if pool is NULL remove all minors. */ -int -zvol_create_minors(const char *pool) -{ - return zvol_cr_minors_common(pool, zvol_create_minors_cb, NULL); -} - -/* - * Remove minors for specified pool, or if NULL remove all minors. - */ -int +void zvol_remove_minors(const char *pool) { - return zvol_cr_minors_common(pool, zvol_remove_minors_cb, NULL); + zvol_state_t *zv, *zv_next; + char *str; + + str = kmem_zalloc(DISK_NAME_LEN, KM_SLEEP); + if (pool) { + (void) strncpy(str, pool, strlen(pool)); + (void) strcat(str, "/"); + } + + mutex_enter(&zvol_state_lock); + for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&zvol_state_list, zv); + + if (pool == NULL || !strncmp(str, zv->zv_name, strlen(str))) { + zvol_remove(zv); + zvol_free(zv); + } + } + mutex_exit(&zvol_state_lock); + kmem_free(str, DISK_NAME_LEN); } int @@ -1262,12 +1266,10 @@ zvol_init(void) void zvol_fini(void) { - (void) zvol_remove_minors(NULL); - + zvol_remove_minors(NULL); blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); - mutex_destroy(&zvol_state_lock); list_destroy(&zvol_state_list); } From 88b37fbe57acf1cb0a7a9358a9427c127a6b65ec Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jul 2010 09:27:40 -0700 Subject: [PATCH 3/5] Fix zvol_init() lock inversion During module load we could deadlock because the zvol_init() callpath took the spa_namespace_lock before the zvol_state_lock. The rest of the zvol code takes the locks in the opposite order. In particular, I observed the following deadlock cause by the lock inversion. I've fixed the ording by creating an unlocked version of zvol_create_minor and zvol_remove_minor. This allows me to take the zvol_state_lock before the spa_namespace_lock in zvol_cr_minors_common and simply call the unlocked version. --- module/zfs/zvol.c | 71 +++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index fa94cd433c..a25d700f93 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1071,13 +1071,8 @@ zvol_free(zvol_state_t *zv) kmem_free(zv, sizeof (zvol_state_t)); } -/* - * Create a block device minor node and setup the linkage between it - * and the specified volume. Once this function returns the block - * device is live and ready for use. - */ -int -zvol_create_minor(const char *name) +static int +__zvol_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; @@ -1085,7 +1080,7 @@ zvol_create_minor(const char *name) unsigned minor = 0; int error = 0; - mutex_enter(&zvol_state_lock); + ASSERT(MUTEX_HELD(&zvol_state_lock)); zv = zvol_find_by_name(name); if (zv) { @@ -1128,9 +1123,44 @@ zvol_create_minor(const char *name) out_dmu_objset_disown: dmu_objset_disown(os, zvol_tag); out: + return (error); +} + +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. + */ +int +zvol_create_minor(const char *name) +{ + int error; + + mutex_enter(&zvol_state_lock); + error = __zvol_create_minor(name); mutex_exit(&zvol_state_lock); - return (-error); + return (error); +} + +static int +__zvol_remove_minor(const char *name) +{ + zvol_state_t *zv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + zv = zvol_find_by_name(name); + if (zv == NULL) + return (ENXIO); + + if (zv->zv_open_count > 0) + return (EBUSY); + + zvol_remove(zv); + zvol_free(zv); + + return (0); } /* @@ -1139,25 +1169,10 @@ out: int zvol_remove_minor(const char *name) { - zvol_state_t *zv; - int error = 0; + int error; mutex_enter(&zvol_state_lock); - - zv = zvol_find_by_name(name); - if (zv == NULL) { - error = ENXIO; - goto out; - } - - if (zv->zv_open_count > 0) { - error = EBUSY; - goto out; - } - - zvol_remove(zv); - zvol_free(zv); -out: + error = __zvol_remove_minor(name); mutex_exit(&zvol_state_lock); return (error); @@ -1170,7 +1185,7 @@ zvol_create_minors_cb(spa_t *spa, uint64_t dsobj, if (strchr(dsname, '/') == NULL) return 0; - return zvol_create_minor(dsname); + return __zvol_create_minor(dsname); } /* @@ -1183,6 +1198,7 @@ zvol_create_minors(const char *pool) spa_t *spa = NULL; int error = 0; + mutex_enter(&zvol_state_lock); if (pool) { error = dmu_objset_find_spa(NULL, pool, zvol_create_minors_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); @@ -1197,6 +1213,7 @@ zvol_create_minors(const char *pool) } mutex_exit(&spa_namespace_lock); } + mutex_exit(&zvol_state_lock); return error; } From 7195d18e6a092dbf335547a195bfa9dbc8245f28 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jul 2010 10:01:24 -0700 Subject: [PATCH 4/5] Reduce stack using in ZVOL The dmu_object_info_t structures which are roughly 60 bytes have been moved from the stack to the heap. Every little bit helps and there's absolutely no reason these temporary working variables need to be on the stack. --- module/zfs/zvol.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index a25d700f93..e93d94897e 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -183,7 +183,7 @@ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; - dmu_object_info_t doi; + dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); @@ -191,14 +191,16 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) return (error); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); - - error = dmu_object_info(os, ZVOL_OBJ, &doi); + doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); + error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, - doi.doi_data_block_size); + doi->doi_data_block_size); } + kmem_free(doi, sizeof(dmu_object_info_t)); + return (error); } @@ -274,7 +276,7 @@ int zvol_set_volsize(const char *name, uint64_t volsize) { zvol_state_t *zv; - dmu_object_info_t doi; + dmu_object_info_t *doi; objset_t *os = NULL; uint64_t readonly; int error; @@ -287,26 +289,30 @@ zvol_set_volsize(const char *name, uint64_t volsize) goto out; } + doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); + error = dmu_objset_hold(name, FTAG, &os); if (error) - goto out; + goto out_doi; - if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || - (error = zvol_check_volsize(volsize,doi.doi_data_block_size)) != 0) - goto out; + if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) != 0 || + (error = zvol_check_volsize(volsize,doi->doi_data_block_size)) != 0) + goto out_doi; VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, NULL) == 0); if (readonly) { error = EROFS; - goto out; + goto out_doi; } if (get_disk_ro(zv->zv_disk) || (zv->zv_flags & ZVOL_RDONLY)) { error = EROFS; - goto out; + goto out_doi; } error = zvol_update_volsize(zv, volsize); +out_doi: + kmem_free(doi, sizeof(dmu_object_info_t)); out: if (os) dmu_objset_rele(os, FTAG); @@ -1076,7 +1082,7 @@ __zvol_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; - dmu_object_info_t doi; + dmu_object_info_t *doi; unsigned minor = 0; int error = 0; @@ -1088,11 +1094,13 @@ __zvol_create_minor(const char *name) goto out; } + doi = kmem_alloc(sizeof(dmu_object_info_t), KM_SLEEP); + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) - goto out; + goto out_doi; - error = dmu_object_info(os, ZVOL_OBJ, &doi); + error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; @@ -1109,7 +1117,7 @@ __zvol_create_minor(const char *name) if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volblocksize = doi.doi_data_block_size; + zv->zv_volblocksize = doi->doi_data_block_size; if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); @@ -1122,6 +1130,8 @@ __zvol_create_minor(const char *name) out_dmu_objset_disown: dmu_objset_disown(os, zvol_tag); +out_doi: + kmem_free(doi, sizeof(dmu_object_info_t)); out: return (error); } From dd9a55895abfe6c94a96b1a208cef505aa1007fb Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 2 Aug 2010 11:53:38 -0700 Subject: [PATCH 5/5] Revert zconfig.sh test 4 This test was accidentally readded to the linux-kernel-disk topic branch. It is being reverted so it can be reapplied with a few minor tweaks in the right place. --- scripts/zconfig.sh | 48 ---------------------------------------------- 1 file changed, 48 deletions(-) diff --git a/scripts/zconfig.sh b/scripts/zconfig.sh index 5123d25ad5..b74c57087a 100755 --- a/scripts/zconfig.sh +++ b/scripts/zconfig.sh @@ -159,52 +159,4 @@ EOF } zconfig_test3 -# zpool import/export check -zconfig_test4() { - POOL_NAME=test4 - ZVOL_NAME1=fish1 - ZVOL_NAME2=fish2 - FULL_NAME1=${POOL_NAME}/${ZVOL_NAME1} - FULL_NAME2=${POOL_NAME}/${ZVOL_NAME2} - TMP_CACHE=`mktemp -p /tmp zpool.cache.XXXXXXXX` - - echo -n "test 4 - zpool import/export: " - - # Create a pool and volume. - ${ZFS_SH} zfs="spa_config_path=${TMP_CACHE}" || fail 1 - ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 || fail 2 - ${ZFS} create -V 100M ${FULL_NAME1} || fail 3 - ${ZFS} create -V 100M ${FULL_NAME2} || fail 4 - - # Verify the devices were created - stat /dev/${FULL_NAME1} &>/dev/null || fail 5 - stat /dev/${FULL_NAME2} &>/dev/null || fail 6 - - # Export the pool - ${ZPOOL} export ${POOL_NAME} || fail 7 - - # Verify the devices were removed - stat /dev/${FULL_NAME1} &>/dev/null && fail 8 - stat /dev/${FULL_NAME2} &>/dev/null && fail 9 - - # Import the pool - ${ZPOOL} import ${POOL_NAME} || fail 10 - - # Verify the devices were created - stat /dev/${FULL_NAME1} &>/dev/null || fail 11 - stat /dev/${FULL_NAME2} &>/dev/null || fail 12 - - # Destroy the pool and consequently the devices - ${ZPOOL_CREATE_SH} -p ${POOL_NAME} -c lo-raidz2 -d || fail 15 - - # Verify the devices were removed - stat /dev/${FULL_NAME1} &>/dev/null && fail 16 - stat /dev/${FULL_NAME2} &>/dev/null && fail 17 - - ${ZFS_SH} -u || fail 18 - - pass -} -zconfig_test4 - exit 0