diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index cae20147b3..8e7d311231 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -1206,12 +1206,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) /* * Remove any previously existing symlink from a udev path to - * the device before labeling the disk. This makes - * zpool_label_disk_wait() truly wait for the new link to show - * up instead of returning if it finds an old link still in - * place. Otherwise there is a window between when udev - * deletes and recreates the link during which access attempts - * will fail with ENOENT. + * the device before labeling the disk. This ensures that + * only newly created links are used. Otherwise there is a + * window between when udev deletes and recreates the link + * during which access attempts will fail with ENOENT. */ strncpy(udevpath, path, MAXPATHLEN); (void) zfs_append_partition(udevpath, MAXPATHLEN); @@ -1235,6 +1233,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) * and then block until udev creates the new link. */ if (!is_exclusive || !is_spare(NULL, udevpath)) { + char *devnode = strrchr(devpath, '/') + 1; + ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { ret = lstat64(udevpath, &statbuf); @@ -1242,18 +1242,29 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) (void) unlink(udevpath); } - if (zpool_label_disk(g_zfs, zhp, - strrchr(devpath, '/') + 1) == -1) + /* + * When labeling a pool the raw device node name + * is provided as it appears under /dev/. + */ + if (zpool_label_disk(g_zfs, zhp, devnode) == -1) return (-1); + /* + * Wait for udev to signal the device is available + * by the provided path. + */ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); if (ret) { - (void) fprintf(stderr, gettext("cannot " - "resolve path '%s': %d\n"), udevpath, ret); - return (-1); + (void) fprintf(stderr, + gettext("missing link: %s was " + "partitioned but %s is missing\n"), + devnode, udevpath); + return (ret); } - (void) zero_label(udevpath); + ret = zero_label(udevpath); + if (ret) + return (ret); } /* diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 1d9ef76af9..e40f6f6947 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -123,6 +123,40 @@ get_devid(const char *path) return (ret); } +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when the provided path have been verified to exist and + * it has been allowed to settle. At this point the device the device can + * be accessed reliably. Depending on the complexity of the udev rules thisi + * process could take several seconds. + */ +int +zpool_label_disk_wait(char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} /* * Go through and fix up any path and/or devid information for the given vdev diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 3eee43e605..a194b8b57e 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -4094,29 +4094,6 @@ find_start_block(nvlist_t *config) return (MAXOFFSET_T); } -int -zpool_label_disk_wait(char *path, int timeout) -{ - struct stat64 statbuf; - int i; - - /* - * Wait timeout miliseconds for a newly created device to be available - * from the given path. There is a small window when a /dev/ device - * will exist and the udev link will not, so we must wait for the - * symlink. Depending on the udev rules this may take a few seconds. - */ - for (i = 0; i < timeout; i++) { - usleep(1000); - - errno = 0; - if ((stat64(path, &statbuf) == 0) && (errno == 0)) - return (0); - } - - return (ENOENT); -} - int zpool_label_disk_check(char *path) { @@ -4282,12 +4259,11 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) (void) close(fd); efi_free(vtoc); - /* Wait for the first expected partition to appear. */ - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); (void) zfs_append_partition(path, MAXPATHLEN); - rval = zpool_label_disk_wait(path, 3000); + /* Wait to udev to signal use the device has settled. */ + rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); if (rval) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " "detect device partitions on '%s': %d"), path, rval); diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index cdb8f78e27..9b51ecc1d9 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -244,12 +244,12 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, { struct block_device *bdev = ERR_PTR(-ENXIO); vdev_disk_t *vd; - int mode, block_size; + int count = 0, mode, block_size; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -264,7 +264,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); if (vd == NULL) - return (ENOMEM); + return (SET_ERROR(ENOMEM)); /* * Devices are always opened by the path provided at configuration @@ -279,16 +279,35 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. + * + * The specified paths may be briefly removed and recreated in + * response to udev events. This should be exceptionally unlikely + * because the zpool command makes every effort to verify these paths + * have already settled prior to reaching this point. Therefore, + * a ENOENT failure at this point is highly likely to be transient + * and it is reasonable to sleep and retry before giving up. In + * practice delays have been observed to be on the order of 100ms. */ mode = spa_mode(v->vdev_spa); if (v->vdev_wholedisk && v->vdev_expanding) bdev = vdev_disk_rrpart(v->vdev_path, mode, vd); - if (IS_ERR(bdev)) + + while (IS_ERR(bdev) && count < 50) { bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), zfs_vdev_holder); + if (unlikely(PTR_ERR(bdev) == -ENOENT)) { + msleep(10); + count++; + } else if (IS_ERR(bdev)) { + break; + } + } + if (IS_ERR(bdev)) { + dprintf("failed open v->vdev_path=%s, error=%d count=%d\n", + v->vdev_path, -PTR_ERR(bdev), count); kmem_free(vd, sizeof (vdev_disk_t)); - return (-PTR_ERR(bdev)); + return (SET_ERROR(-PTR_ERR(bdev))); } v->vdev_tsd = vd;