Cleanly integrate ZFS tools with libblkid.

The majority of this this patch concerns itself with doing a direct
replacement of Solaris's libdiskmgt library with libblkid+libefi.
You'll notice that this patch removes all libdiskmgt code instead of
ifdef'ing it out.  This was done to minimize any confusion when reading
the code because it seems unlikely we will ever port libdiskmgt to Linux.

Despite the replacement the behavior of the tools should have remained
the same with one exception.  For the moment, we are unable to check
the partitions of devices which have an MBR style partition table when
creating a filesystem.  If a non-efi partition sceme is detected on a
whole disk device we prompt the user to explicity use the force option.
It would not be a ton of work to make the tool aware of MBR style
partitions if this becomes a problem.

I've done basic sanity checking for various configurations and all
the issues I'm aware of have been addressed.  Even things like blkid
misidentifing a disk as ext3 when it is added to a zfs pool.  I'm
careful to always zero out the first 4k of any new zfs partition.  That
all said this is all new code and while it looks like it's working right
for me we should keep an eye on it for any strange behavior.
This commit is contained in:
Brian Behlendorf 2009-10-15 16:28:47 -07:00
parent 992be351d5
commit 48d1b0c5ab
4 changed files with 324 additions and 197 deletions

View File

@ -1,6 +1,9 @@
From: Brian Behlendorf <behlendorf1@llnl.gov>
Subject: [PATCH] linux user disk
Use Linux style /dev/disk names
This topic branch contains all the changes needed to integrate the user
side zfs tools with Linux style devices. Primarily this includes fixing
up the Solaris libefi library to be Linux friendly, and integrating with
the libblkid library which is provided by e2fsprogs.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>

View File

@ -1674,12 +1674,6 @@ zpool_do_import(int argc, char **argv)
usage(B_FALSE);
}
if (searchdirs == NULL) {
searchdirs = safe_malloc(sizeof (char *));
searchdirs[0] = DISK_ROOT;
nsearch = 1;
}
/* check argument count */
if (do_all) {
if (argc != 0) {
@ -1700,7 +1694,8 @@ zpool_do_import(int argc, char **argv)
if (argc == 0 && !priv_ineffect(PRIV_SYS_CONFIG)) {
(void) fprintf(stderr, gettext("cannot "
"discover pools: permission denied\n"));
free(searchdirs);
if (searchdirs != NULL)
free(searchdirs);
return (1);
}
}
@ -1747,7 +1742,8 @@ zpool_do_import(int argc, char **argv)
(void) fprintf(stderr, gettext("cannot import '%s': "
"no such pool available\n"), argv[0]);
}
free(searchdirs);
if (searchdirs != NULL)
free(searchdirs);
return (1);
}
@ -1842,7 +1838,8 @@ zpool_do_import(int argc, char **argv)
error:
nvlist_free(props);
nvlist_free(pools);
free(searchdirs);
if (searchdirs != NULL)
free(searchdirs);
return (err ? 1 : 0);
}

View File

@ -51,7 +51,7 @@
*
* 1. Construct the vdev specification. Performs syntax validation and
* makes sure each device is valid.
* 2. Check for devices in use. Using libdiskmgt, makes sure that no
* 2. Check for devices in use. Using libblkid to make sure that no
* devices are also in use. Some can be overridden using the 'force'
* flag, others cannot.
* 3. Check for replication errors if the 'force' flag is not specified.
@ -64,7 +64,6 @@
#include <devid.h>
#include <errno.h>
#include <fcntl.h>
#include <libdiskmgt.h>
#include <libintl.h>
#include <libnvpair.h>
#include <limits.h>
@ -75,6 +74,10 @@
#include <sys/stat.h>
#include <sys/vtoc.h>
#include <sys/mntent.h>
#include <uuid/uuid.h>
#ifdef HAVE_LIBBLKID
#include <blkid/blkid.h>
#endif
#include "zpool_util.h"
@ -108,168 +111,6 @@ vdev_error(const char *fmt, ...)
va_end(ap);
}
static void
libdiskmgt_error(int error)
{
/*
* ENXIO/ENODEV is a valid error message if the device doesn't live in
* /dev/dsk. Don't bother printing an error message in this case.
*/
if (error == ENXIO || error == ENODEV)
return;
(void) fprintf(stderr, gettext("warning: device in use checking "
"failed: %s\n"), strerror(error));
}
/*
* Validate a device, passing the bulk of the work off to libdiskmgt.
*/
static int
check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
{
char *msg;
int error = 0;
dm_who_type_t who;
if (force)
who = DM_WHO_ZPOOL_FORCE;
else if (isspare)
who = DM_WHO_ZPOOL_SPARE;
else
who = DM_WHO_ZPOOL;
if (dm_inuse((char *)path, &msg, who, &error) || error) {
if (error != 0) {
libdiskmgt_error(error);
return (0);
} else {
vdev_error("%s", msg);
free(msg);
return (-1);
}
}
/*
* If we're given a whole disk, ignore overlapping slices since we're
* about to label it anyway.
*/
error = 0;
if (!wholedisk && !force &&
(dm_isoverlapping((char *)path, &msg, &error) || error)) {
if (error == 0) {
/* dm_isoverlapping returned -1 */
vdev_error(gettext("%s overlaps with %s\n"), path, msg);
free(msg);
return (-1);
} else if (error != ENODEV) {
/* libdiskmgt's devcache only handles physical drives */
libdiskmgt_error(error);
return (0);
}
}
return (0);
}
/*
* Validate a whole disk. Iterate over all slices on the disk and make sure
* that none is in use by calling check_slice().
*/
static int
check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
{
dm_descriptor_t *drive, *media, *slice;
int err = 0;
int i;
int ret;
/*
* Get the drive associated with this disk. This should never fail,
* because we already have an alias handle open for the device.
*/
if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
&err)) == NULL || *drive == NULL) {
if (err)
libdiskmgt_error(err);
return (0);
}
if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
&err)) == NULL) {
dm_free_descriptors(drive);
if (err)
libdiskmgt_error(err);
return (0);
}
dm_free_descriptors(drive);
/*
* It is possible that the user has specified a removable media drive,
* and the media is not present.
*/
if (*media == NULL) {
dm_free_descriptors(media);
vdev_error(gettext("'%s' has no media in drive\n"), name);
return (-1);
}
if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
&err)) == NULL) {
dm_free_descriptors(media);
if (err)
libdiskmgt_error(err);
return (0);
}
dm_free_descriptors(media);
ret = 0;
/*
* Iterate over all slices and report any errors. We don't care about
* overlapping slices because we are using the whole disk.
*/
for (i = 0; slice[i] != NULL; i++) {
char *name = dm_get_name(slice[i], &err);
if (check_slice(name, force, B_TRUE, isspare) != 0)
ret = -1;
dm_free_name(name);
}
dm_free_descriptors(slice);
return (ret);
}
/*
* Validate a device.
*/
static int
check_device(const char *path, boolean_t force, boolean_t isspare)
{
dm_descriptor_t desc;
int err;
char *dev;
/*
* For whole disks, libdiskmgt does not include the leading dev path.
*/
dev = strrchr(path, '/');
assert(dev != NULL);
dev++;
if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
err = check_disk(path, desc, force, isspare);
dm_free_descriptor(desc);
return (err);
}
return (check_slice(path, force, B_FALSE, isspare));
}
/*
* Check that a file is valid. All we can do in this case is check that it's
* not in use by another pool, and not in use by swap.
@ -280,19 +121,9 @@ check_file(const char *file, boolean_t force, boolean_t isspare)
char *name;
int fd;
int ret = 0;
int err;
pool_state_t state;
boolean_t inuse;
if (dm_inuse_swap(file, &err)) {
if (err)
libdiskmgt_error(err);
else
vdev_error(gettext("%s is currently used by swap. "
"Please see swap(1M).\n"), file);
return (-1);
}
if ((fd = open(file, O_RDONLY)) < 0)
return (0);
@ -345,6 +176,173 @@ check_file(const char *file, boolean_t force, boolean_t isspare)
return (ret);
}
#ifdef HAVE_LIBBLKID
static void
check_error(int err)
{
(void) fprintf(stderr, gettext("warning: device in use checking "
"failed: %s\n"), strerror(err));
}
static int
check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
{
struct stat64 statbuf;
char *value;
int err;
if (stat64(path, &statbuf) != 0) {
vdev_error(gettext("cannot open %s: %s\n"),
path, strerror(errno));
return (-1);
}
/* No valid type detected device is safe to use */
value = blkid_get_tag_value(cache, "TYPE", path);
if (value == NULL)
return (0);
/*
* If libblkid detects a ZFS device, we check the device
* using check_file() to see if it's safe. The one safe
* case is a spare device shared between multiple pools.
*/
if (strcmp(value, "zfs") == 0) {
err = check_file(path, force, isspare);
} else {
if (force) {
err = 0;
} else {
err = -1;
vdev_error(gettext("%s contains a filesystem of "
"type '%s'\n"), path, value);
}
}
free(value);
return (err);
}
/*
* Validate a whole disk. Iterate over all slices on the disk and make sure
* that none is in use by calling check_slice().
*/
static int
check_disk(const char *path, blkid_cache cache, int force,
boolean_t isspare, boolean_t iswholedisk)
{
struct dk_gpt *vtoc;
char slice_path[MAXPATHLEN];
int err = 0;
int fd, i;
/* Check the device as given */
err = check_slice(path, cache, force, isspare);
if (err)
return (err);
/* Additional checking is only required for whole disks */
if (!iswholedisk)
return 0;
/*
* When the device is a whole disk try to read the efi partition
* label. If this is successful we safely check the all of the
* partitions. However, when it fails it may simply be because
* the disk is partitioned via the MBR. Since we currently can
* not easily decode the MBR return a failure and prompt to the
* user to use --force since we cannot check the partitions.
*/
if ((fd = open(path, O_RDONLY|O_NDELAY)) < 0) {
check_error(errno);
return -1;
}
if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
(void) close(fd);
if (force) {
return 0;
} else {
vdev_error(gettext(
"%s may contain a non-efi partition table "
"describing existing\nfilesystems. If you are "
"sure you want to use this device use the\n"
"force command line option.\n"), path);
return -1;
}
}
/*
* The primary efi partition label is damaged however the secondary
* label at the end of the device is intact. Rather than use this
* label we should play it safe and treat this as a non efi device.
*/
if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
vdev_error(gettext(
"%s contains a corrupt primary efi partition table. "
"If you are\nsure you want to use this device use "
"the force command line option.\n"), path);
efi_free(vtoc);
(void) close(fd);
return -1;
}
for (i = 0; i < vtoc->efi_nparts; i++) {
if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
continue;
sprintf(slice_path, "%s%d", path, i+1);
err = check_slice(slice_path, cache, force, isspare);
if (err)
break;
}
efi_free(vtoc);
(void) close(fd);
return (err);
}
static int
check_device(const char *path, boolean_t force,
boolean_t isspare, boolean_t iswholedisk)
{
static blkid_cache cache = NULL;
int err;
/*
* There is no easy way to add a correct blkid_put_cache() call,
* memory will be reclaimed when the command exits.
*/
if (cache == NULL) {
if ((err = blkid_get_cache(&cache, NULL)) != 0) {
check_error(err);
return -1;
}
if ((err = blkid_probe_all(cache)) != 0) {
blkid_put_cache(cache);
check_error(err);
return -1;
}
}
return check_disk(path, cache, force, isspare, iswholedisk);
}
#else /* HAVE_LIBBLKID */
static int
check_device(const char *path, boolean_t force,
boolean_t isspare, boolean_t iswholedisk)
{
return check_file(path, force, isspare);
}
#endif /* HAVE_LIBBLKID */
/*
* By "whole disk" we mean an entire physical disk (something we can
@ -874,6 +872,39 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
return (ret);
}
static int
zero_label(char *path)
{
const int size = 4096;
char buf[size];
int err, fd;
if ((fd = open(path, O_WRONLY)) < 0) {
(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
path, strerror(errno));
return (-1);
}
memset(buf, 0, size);
err = write(fd, buf, size);
(void) fdatasync(fd);
(void) close(fd);
if (err == -1) {
(void) fprintf(stderr, gettext("cannot zero first %d bytes "
"of '%s': %s\n"), size, path, strerror(errno));
return (-1);
}
if (err != size) {
(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
"of '%s'\n"), err, size, path);
return (-1);
}
return 0;
}
/*
* Go through and find any whole disks in the vdev specification, labelling them
* as appropriate. When constructing the vdev spec, we were unable to open this
@ -908,14 +939,20 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
return (0);
/*
* We have a disk device. Get the path to the device
* and see if it's a whole disk by appending the backup
* slice and stat()ing the device.
* We have a disk device. If this is a whole disk write
* out the efi partition table, otherwise write zero's to
* the first 4k of the partition. This is to ensure that
* libblkid will not misidentify the partition due to a
* magic value left by the previous filesystem.
*/
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk) != 0 || !wholedisk)
return (0);
verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk));
if (!wholedisk) {
ret = zero_label(path);
return (ret);
}
diskname = strrchr(path, '/');
assert(diskname != NULL);
@ -960,6 +997,9 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
(void) close(fd);
/* Just in case this partition already existed. */
(void) zero_label(buf);
return (0);
}
@ -1041,7 +1081,7 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
nvlist_t **child;
uint_t c, children;
char *type, *path;
int ret;
int ret = 0;
char buf[MAXPATHLEN];
uint64_t wholedisk;
@ -1050,16 +1090,17 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0) {
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk));
/*
* As a generic check, we look to see if this is a replace of a
* hot spare within the same pool. If so, we allow it
* regardless of what libdiskmgt or zpool_in_use() says.
* regardless of what libblkid or zpool_in_use() says.
*/
if (isreplacing) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk) == 0 && wholedisk)
if (wholedisk)
(void) snprintf(buf, sizeof (buf), "%ss0",
path);
else
@ -1069,7 +1110,7 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
}
if (strcmp(type, VDEV_TYPE_DISK) == 0)
ret = check_device(path, force, isspare);
ret = check_device(path, force, isspare, wholedisk);
if (strcmp(type, VDEV_TYPE_FILE) == 0)
ret = check_file(path, force, isspare);

View File

@ -52,6 +52,9 @@
#include <fcntl.h>
#include <sys/vdev_impl.h>
#ifdef HAVE_LIBBLKID
#include <blkid/blkid.h>
#endif
#include "libzfs.h"
#include "libzfs_impl.h"
@ -777,6 +780,77 @@ zpool_read_label(int fd, nvlist_t **config)
return (0);
}
#ifdef HAVE_LIBBLKID
/*
* Use libblkid to quickly search for zfs devices
*/
static int
zpool_find_import_blkid(libzfs_handle_t *hdl, pool_list_t *pools)
{
blkid_cache cache;
blkid_dev_iterate iter;
blkid_dev dev;
const char *devname;
nvlist_t *config;
int fd, err;
err = blkid_get_cache(&cache, NULL);
if (err != 0) {
(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
dgettext(TEXT_DOMAIN, "blkid_get_cache() %d"), err);
goto err_blkid1;
}
err = blkid_probe_all(cache);
if (err != 0) {
(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
dgettext(TEXT_DOMAIN, "blkid_probe_all() %d"), err);
goto err_blkid2;
}
iter = blkid_dev_iterate_begin(cache);
if (iter == NULL) {
(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
dgettext(TEXT_DOMAIN, "blkid_dev_iterate_begin()"));
goto err_blkid2;
}
err = blkid_dev_set_search(iter, "TYPE", "zfs");
if (err != 0) {
(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
dgettext(TEXT_DOMAIN, "blkid_dev_set_search() %d"), err);
goto err_blkid3;
}
while (blkid_dev_next(iter, &dev) == 0) {
devname = blkid_dev_devname(dev);
if ((fd = open64(devname, O_RDONLY)) < 0)
continue;
err = zpool_read_label(fd, &config);
(void) close(fd);
if (err != 0) {
(void) no_memory(hdl);
goto err_blkid3;
}
if (config != NULL) {
err = add_config(hdl, pools, devname, config);
if (err != 0)
goto err_blkid3;
}
}
err_blkid3:
blkid_dev_iterate_end(iter);
err_blkid2:
blkid_put_cache(cache);
err_blkid1:
return err;
}
#endif /* HAVE_LIBBLKID */
/*
* Given a list of directories to search, find all pools stored on disk. This
* includes partial pools which are not available to import. If no args are
@ -807,6 +881,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
verify(poolname == NULL || guid == 0);
if (argc == 0) {
#ifdef HAVE_LIBBLKID
/* Use libblkid to scan all device for their type */
if (zpool_find_import_blkid(hdl, &pools) == 0)
goto skip_scanning;
(void) zfs_error_fmt(hdl, EZFS_BADCACHE,
dgettext(TEXT_DOMAIN, "blkid failure falling back "
"to manual probing"));
#endif /* HAVE_LIBBLKID */
argc = 1;
argv = &default_dir;
}
@ -917,6 +1000,9 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv,
dirp = NULL;
}
#ifdef HAVE_LIBBLKID
skip_scanning:
#endif
ret = get_configs(hdl, &pools, active_ok);
error: