zfs/lib/libzfs/libzfs_pool.c

3807 lines
95 KiB
C
Raw Normal View History

2008-11-20 20:01:55 +00:00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2008-11-20 20:01:55 +00:00
*/
#include <ctype.h>
#include <errno.h>
#include <devid.h>
#include <fcntl.h>
#include <libintl.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <zone.h>
#include <sys/stat.h>
2008-11-20 20:01:55 +00:00
#include <sys/efi_partition.h>
#include <sys/vtoc.h>
#include <sys/zfs_ioctl.h>
2009-07-02 22:44:48 +00:00
#include <dlfcn.h>
2008-11-20 20:01:55 +00:00
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "libzfs_impl.h"
#include "zfs_comutil.h"
2008-11-20 20:01:55 +00:00
static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
#if defined(__i386) || defined(__amd64)
#define BOOTCMD "installgrub(1M)"
#else
#define BOOTCMD "installboot(1M)"
#endif
2008-11-20 20:01:55 +00:00
/*
* ====================================================================
* zpool property functions
* ====================================================================
*/
static int
zpool_get_all_props(zpool_handle_t *zhp)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
return (-1);
while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) {
if (errno == ENOMEM) {
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
} else {
zcmd_free_nvlists(&zc);
return (-1);
}
}
if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) {
zcmd_free_nvlists(&zc);
return (-1);
}
zcmd_free_nvlists(&zc);
return (0);
}
static int
zpool_props_refresh(zpool_handle_t *zhp)
{
nvlist_t *old_props;
old_props = zhp->zpool_props;
if (zpool_get_all_props(zhp) != 0)
return (-1);
nvlist_free(old_props);
return (0);
}
static char *
zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop,
zprop_source_t *src)
{
nvlist_t *nv, *nvl;
uint64_t ival;
char *value;
zprop_source_t source;
nvl = zhp->zpool_props;
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0);
source = ival;
verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
} else {
source = ZPROP_SRC_DEFAULT;
if ((value = (char *)zpool_prop_default_string(prop)) == NULL)
value = "-";
}
if (src)
*src = source;
return (value);
}
uint64_t
zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src)
{
nvlist_t *nv, *nvl;
uint64_t value;
zprop_source_t source;
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) {
/*
* zpool_get_all_props() has most likely failed because
* the pool is faulted, but if all we need is the top level
* vdev's guid then get it from the zhp config nvlist.
*/
if ((prop == ZPOOL_PROP_GUID) &&
(nvlist_lookup_nvlist(zhp->zpool_config,
ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) &&
(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value)
== 0)) {
return (value);
}
2008-11-20 20:01:55 +00:00
return (zpool_prop_default_numeric(prop));
}
2008-11-20 20:01:55 +00:00
nvl = zhp->zpool_props;
if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) {
verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0);
source = value;
verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0);
} else {
source = ZPROP_SRC_DEFAULT;
value = zpool_prop_default_numeric(prop);
}
if (src)
*src = source;
return (value);
}
/*
* Map VDEV STATE to printed strings.
*/
char *
zpool_state_to_name(vdev_state_t state, vdev_aux_t aux)
{
switch (state) {
default:
break;
2008-11-20 20:01:55 +00:00
case VDEV_STATE_CLOSED:
case VDEV_STATE_OFFLINE:
return (gettext("OFFLINE"));
case VDEV_STATE_REMOVED:
return (gettext("REMOVED"));
case VDEV_STATE_CANT_OPEN:
if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
2008-11-20 20:01:55 +00:00
return (gettext("FAULTED"));
else if (aux == VDEV_AUX_SPLIT_POOL)
return (gettext("SPLIT"));
2008-11-20 20:01:55 +00:00
else
return (gettext("UNAVAIL"));
case VDEV_STATE_FAULTED:
return (gettext("FAULTED"));
case VDEV_STATE_DEGRADED:
return (gettext("DEGRADED"));
case VDEV_STATE_HEALTHY:
return (gettext("ONLINE"));
}
return (gettext("UNKNOWN"));
}
/*
* Get a zpool property value for 'prop' and return the value in
* a pre-allocated buffer.
*/
int
zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
zprop_source_t *srctype)
{
uint64_t intval;
const char *strval;
zprop_source_t src = ZPROP_SRC_NONE;
nvlist_t *nvroot;
vdev_stat_t *vs;
uint_t vsc;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
2009-02-18 20:51:31 +00:00
switch (prop) {
case ZPOOL_PROP_NAME:
2008-11-20 20:01:55 +00:00
(void) strlcpy(buf, zpool_get_name(zhp), len);
2009-02-18 20:51:31 +00:00
break;
case ZPOOL_PROP_HEALTH:
2008-11-20 20:01:55 +00:00
(void) strlcpy(buf, "FAULTED", len);
2009-02-18 20:51:31 +00:00
break;
case ZPOOL_PROP_GUID:
intval = zpool_get_prop_int(zhp, prop, &src);
2009-02-18 23:15:56 +00:00
(void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
2009-02-18 20:51:31 +00:00
break;
case ZPOOL_PROP_ALTROOT:
case ZPOOL_PROP_CACHEFILE:
if (zhp->zpool_props != NULL ||
zpool_get_all_props(zhp) == 0) {
(void) strlcpy(buf,
zpool_get_prop_string(zhp, prop, &src),
len);
if (srctype != NULL)
*srctype = src;
return (0);
}
/* FALLTHROUGH */
default:
2008-11-20 20:01:55 +00:00
(void) strlcpy(buf, "-", len);
2009-02-18 20:51:31 +00:00
break;
}
if (srctype != NULL)
*srctype = src;
2008-11-20 20:01:55 +00:00
return (0);
}
if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) &&
prop != ZPOOL_PROP_NAME)
return (-1);
switch (zpool_prop_get_type(prop)) {
case PROP_TYPE_STRING:
(void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src),
len);
break;
case PROP_TYPE_NUMBER:
intval = zpool_get_prop_int(zhp, prop, &src);
switch (prop) {
case ZPOOL_PROP_SIZE:
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
2008-11-20 20:01:55 +00:00
(void) zfs_nicenum(intval, buf, len);
break;
case ZPOOL_PROP_CAPACITY:
(void) snprintf(buf, len, "%llu%%",
(u_longlong_t)intval);
break;
case ZPOOL_PROP_DEDUPRATIO:
(void) snprintf(buf, len, "%llu.%02llux",
(u_longlong_t)(intval / 100),
(u_longlong_t)(intval % 100));
break;
2008-11-20 20:01:55 +00:00
case ZPOOL_PROP_HEALTH:
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
verify(nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
== 0);
2008-11-20 20:01:55 +00:00
(void) strlcpy(buf, zpool_state_to_name(intval,
vs->vs_aux), len);
break;
default:
2008-12-04 22:41:11 +00:00
(void) snprintf(buf, len, "%llu", (u_longlong_t)intval);
2008-11-20 20:01:55 +00:00
}
break;
case PROP_TYPE_INDEX:
intval = zpool_get_prop_int(zhp, prop, &src);
if (zpool_prop_index_to_string(prop, intval, &strval)
!= 0)
return (-1);
(void) strlcpy(buf, strval, len);
break;
default:
abort();
}
if (srctype)
*srctype = src;
return (0);
}
/*
* Check if the bootfs name has the same pool name as it is set to.
* Assuming bootfs is a valid dataset name.
*/
static boolean_t
bootfs_name_valid(const char *pool, char *bootfs)
{
int len = strlen(pool);
if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT))
2008-11-20 20:01:55 +00:00
return (B_FALSE);
if (strncmp(pool, bootfs, len) == 0 &&
(bootfs[len] == '/' || bootfs[len] == '\0'))
return (B_TRUE);
return (B_FALSE);
}
/*
* Inspect the configuration to determine if any of the devices contain
* an EFI label.
*/
static boolean_t
pool_uses_efi(nvlist_t *config)
{
nvlist_t **child;
uint_t c, children;
if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return (read_efi_label(config, NULL) >= 0);
for (c = 0; c < children; c++) {
if (pool_uses_efi(child[c]))
return (B_TRUE);
}
return (B_FALSE);
}
static boolean_t
pool_is_bootable(zpool_handle_t *zhp)
{
char bootfs[ZPOOL_MAXNAMELEN];
return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-",
sizeof (bootfs)) != 0);
}
2008-11-20 20:01:55 +00:00
/*
* Given an nvlist of zpool properties to be set, validate that they are
* correct, and parse any numeric properties (index, boolean, etc) if they are
* specified as strings.
*/
static nvlist_t *
zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
2008-11-20 20:01:55 +00:00
nvlist_t *props, uint64_t version, boolean_t create_or_import, char *errbuf)
{
nvpair_t *elem;
nvlist_t *retprops;
zpool_prop_t prop;
char *strval;
uint64_t intval;
char *slash;
struct stat64 statbuf;
zpool_handle_t *zhp;
nvlist_t *nvroot;
2008-11-20 20:01:55 +00:00
if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
(void) no_memory(hdl);
return (NULL);
}
elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
const char *propname = nvpair_name(elem);
/*
* Make sure this property is valid and applies to this type.
*/
if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid property '%s'"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (zpool_prop_readonly(prop)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
"is readonly"), propname);
(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
goto error;
}
if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops,
&strval, &intval, errbuf) != 0)
goto error;
/*
* Perform additional checking for specific properties.
*/
switch (prop) {
default:
break;
2008-11-20 20:01:55 +00:00
case ZPOOL_PROP_VERSION:
if (intval < version || intval > SPA_VERSION) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' number %d is invalid."),
propname, intval);
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
goto error;
}
break;
case ZPOOL_PROP_BOOTFS:
if (create_or_import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' cannot be set at creation "
"or import time"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (version < SPA_VERSION_BOOTFS) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded to support "
"'%s' property"), propname);
(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
goto error;
}
/*
* bootfs property value has to be a dataset name and
* the dataset has to be in the same pool as it sets to.
*/
if (strval[0] != '\0' && !bootfs_name_valid(poolname,
strval)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' "
"is an invalid name"), strval);
(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
goto error;
}
if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"could not open pool '%s'"), poolname);
(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
goto error;
}
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
/*
* bootfs property cannot be set on a disk which has
* been EFI labeled.
*/
if (pool_uses_efi(nvroot)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' not supported on "
"EFI labeled devices"), propname);
(void) zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf);
zpool_close(zhp);
goto error;
}
zpool_close(zhp);
2008-11-20 20:01:55 +00:00
break;
case ZPOOL_PROP_ALTROOT:
if (!create_or_import) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' can only be set during pool "
"creation or import"), propname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
if (strval[0] != '/') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"bad alternate root '%s'"), strval);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
break;
case ZPOOL_PROP_CACHEFILE:
if (strval[0] == '\0')
break;
if (strcmp(strval, "none") == 0)
break;
if (strval[0] != '/') {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"property '%s' must be empty, an "
"absolute path, or 'none'"), propname);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
slash = strrchr(strval, '/');
if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
strcmp(slash, "/..") == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is not a valid file"), strval);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
*slash = '\0';
if (strval[0] != '\0' &&
(stat64(strval, &statbuf) != 0 ||
!S_ISDIR(statbuf.st_mode))) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"'%s' is not a valid directory"),
strval);
(void) zfs_error(hdl, EZFS_BADPATH, errbuf);
goto error;
}
*slash = '/';
break;
}
}
return (retprops);
error:
nvlist_free(retprops);
return (NULL);
}
/*
* Set zpool property : propname=propval.
*/
int
zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
int ret = -1;
char errbuf[1024];
nvlist_t *nvl = NULL;
nvlist_t *realprops;
uint64_t version;
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
zhp->zpool_name);
if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
return (no_memory(zhp->zpool_hdl));
if (nvlist_add_string(nvl, propname, propval) != 0) {
nvlist_free(nvl);
return (no_memory(zhp->zpool_hdl));
}
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
2008-11-20 20:01:55 +00:00
zhp->zpool_name, nvl, version, B_FALSE, errbuf)) == NULL) {
nvlist_free(nvl);
return (-1);
}
nvlist_free(nvl);
nvl = realprops;
/*
* Execute the corresponding ioctl() to set this property.
*/
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) {
nvlist_free(nvl);
return (-1);
}
ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc);
zcmd_free_nvlists(&zc);
nvlist_free(nvl);
if (ret)
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
else
(void) zpool_props_refresh(zhp);
return (ret);
}
int
zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp)
{
libzfs_handle_t *hdl = zhp->zpool_hdl;
zprop_list_t *entry;
char buf[ZFS_MAXPROPLEN];
if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
return (-1);
for (entry = *plp; entry != NULL; entry = entry->pl_next) {
if (entry->pl_fixed)
continue;
if (entry->pl_prop != ZPROP_INVAL &&
zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
NULL) == 0) {
if (strlen(buf) > entry->pl_width)
entry->pl_width = strlen(buf);
}
}
return (0);
}
2009-07-02 22:44:48 +00:00
/*
* Don't start the slice at the default block of 34; many storage
Changes required to integrate libefi in to Linux. The major change here is to fix up libefi to be linux aware. For the most part this wasn't too hard but there were a few major issues. First off I needed to handle the DKIOCGMEDIAINFO and DKIOCINFO ioctls. There is no direct equivilant for these ioctls under linux. To handle this I added wrapper functions which under Solaris simple call the ioctls. But under Linux dig around the system a little bit getting the needed info to fill in the requested structures. Secondly the efi_ioctl() call was adapted such that under linux it directly read or writes out the partition table. Under Solaris this work was handed off to the kernel via an ioctl. In the efi_write() case we also ensure we prompt the kernel via BLKRRPART to re-scan the new partition table. The libefi generated partition tables are correct but older versions of ~parted-1.8.1 can not read them without a small patch. The kernel and fdisk are able to read them just fine. Thirdly efi_alloc_and_init() which is used by zpool to determine if a device is a 'wholedisk' was updated to be linux aware. This check is performed by using the partition number for the device, which the partition number is 0 on linux it is a 'wholedisk'. However, certain device type such as the loopback and ram disks needed to be excluded because they do not support partitioning. Forthly the zpool command was made symlink aware so it can correctly resolve udev entries such as /dev/disk/by-*/*. This symlinks are fully expanded ensuring all block devices are recognized. When a when a 'wholedisk' block device is detected we now properly write out an efi label and place zfs in the first partition (0th slice). This partition is created 1MiB in to the disk to ensure it is aligned nicely with all high end block devices I'm aware of. This all works for me now but it did take quite a bit of work to get it all sorted out. It would not surprise me if certain special cases were missed so we should keep any eye of for any odd behavior.
2009-10-14 23:07:48 +00:00
* devices will use a stripe width of 128k, other vendors prefer a 1m
* alignment. It is best to play it safe and ensure a 1m alignment
* give 512b blocks. When the block size is larger by a power of 2
* we will still be 1m aligned.
2009-07-02 22:44:48 +00:00
*/
Changes required to integrate libefi in to Linux. The major change here is to fix up libefi to be linux aware. For the most part this wasn't too hard but there were a few major issues. First off I needed to handle the DKIOCGMEDIAINFO and DKIOCINFO ioctls. There is no direct equivilant for these ioctls under linux. To handle this I added wrapper functions which under Solaris simple call the ioctls. But under Linux dig around the system a little bit getting the needed info to fill in the requested structures. Secondly the efi_ioctl() call was adapted such that under linux it directly read or writes out the partition table. Under Solaris this work was handed off to the kernel via an ioctl. In the efi_write() case we also ensure we prompt the kernel via BLKRRPART to re-scan the new partition table. The libefi generated partition tables are correct but older versions of ~parted-1.8.1 can not read them without a small patch. The kernel and fdisk are able to read them just fine. Thirdly efi_alloc_and_init() which is used by zpool to determine if a device is a 'wholedisk' was updated to be linux aware. This check is performed by using the partition number for the device, which the partition number is 0 on linux it is a 'wholedisk'. However, certain device type such as the loopback and ram disks needed to be excluded because they do not support partitioning. Forthly the zpool command was made symlink aware so it can correctly resolve udev entries such as /dev/disk/by-*/*. This symlinks are fully expanded ensuring all block devices are recognized. When a when a 'wholedisk' block device is detected we now properly write out an efi label and place zfs in the first partition (0th slice). This partition is created 1MiB in to the disk to ensure it is aligned nicely with all high end block devices I'm aware of. This all works for me now but it did take quite a bit of work to get it all sorted out. It would not surprise me if certain special cases were missed so we should keep any eye of for any odd behavior.
2009-10-14 23:07:48 +00:00
#define NEW_START_BLOCK 2048
2009-07-02 22:44:48 +00:00
2008-11-20 20:01:55 +00:00
/*
* Validate the given pool name, optionally putting an extended error message in
* 'buf'.
*/
boolean_t
zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
{
namecheck_err_t why;
char what;
int ret;
ret = pool_namecheck(pool, &why, &what);
/*
* The rules for reserved pool names were extended at a later point.
* But we need to support users with existing pools that may now be
* invalid. So we only check for this expanded set of names during a
* create (or import), and only in userland.
*/
if (ret == 0 && !isopen &&
(strncmp(pool, "mirror", 6) == 0 ||
strncmp(pool, "raidz", 5) == 0 ||
strncmp(pool, "spare", 5) == 0 ||
strcmp(pool, "log") == 0)) {
if (hdl != NULL)
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "name is reserved"));
return (B_FALSE);
}
if (ret != 0) {
if (hdl != NULL) {
switch (why) {
case NAME_ERR_TOOLONG:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "name is too long"));
break;
case NAME_ERR_INVALCHAR:
zfs_error_aux(hdl,
dgettext(TEXT_DOMAIN, "invalid character "
"'%c' in pool name"), what);
break;
case NAME_ERR_NOLETTER:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"name must begin with a letter"));
break;
case NAME_ERR_RESERVED:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"name is reserved"));
break;
case NAME_ERR_DISKLIKE:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool name is reserved"));
break;
case NAME_ERR_LEADING_SLASH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"leading slash in name"));
break;
case NAME_ERR_EMPTY_COMPONENT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"empty component in name"));
break;
case NAME_ERR_TRAILING_SLASH:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"trailing slash in name"));
break;
case NAME_ERR_MULTIPLE_AT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"multiple '@' delimiters in name"));
break;
case NAME_ERR_NO_AT:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"permission set is missing '@'"));
break;
2008-11-20 20:01:55 +00:00
}
}
return (B_FALSE);
}
return (B_TRUE);
}
/*
* Open a handle to the given pool, even if the pool is currently in the FAULTED
* state.
*/
zpool_handle_t *
zpool_open_canfail(libzfs_handle_t *hdl, const char *pool)
{
zpool_handle_t *zhp;
boolean_t missing;
/*
* Make sure the pool name is valid.
*/
if (!zpool_name_valid(hdl, B_TRUE, pool)) {
(void) zfs_error_fmt(hdl, EZFS_INVALIDNAME,
dgettext(TEXT_DOMAIN, "cannot open '%s'"),
pool);
return (NULL);
}
if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
return (NULL);
zhp->zpool_hdl = hdl;
(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
if (zpool_refresh_stats(zhp, &missing) != 0) {
zpool_close(zhp);
return (NULL);
}
if (missing) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool"));
(void) zfs_error_fmt(hdl, EZFS_NOENT,
dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool);
zpool_close(zhp);
return (NULL);
}
return (zhp);
}
/*
* Like the above, but silent on error. Used when iterating over pools (because
* the configuration cache may be out of date).
*/
int
zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret)
{
zpool_handle_t *zhp;
boolean_t missing;
if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL)
return (-1);
zhp->zpool_hdl = hdl;
(void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name));
if (zpool_refresh_stats(zhp, &missing) != 0) {
zpool_close(zhp);
return (-1);
}
if (missing) {
zpool_close(zhp);
*ret = NULL;
return (0);
}
*ret = zhp;
return (0);
}
/*
* Similar to zpool_open_canfail(), but refuses to open pools in the faulted
* state.
*/
zpool_handle_t *
zpool_open(libzfs_handle_t *hdl, const char *pool)
{
zpool_handle_t *zhp;
if ((zhp = zpool_open_canfail(hdl, pool)) == NULL)
return (NULL);
if (zhp->zpool_state == POOL_STATE_UNAVAIL) {
(void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name);
zpool_close(zhp);
return (NULL);
}
return (zhp);
}
/*
* Close the handle. Simply frees the memory associated with the handle.
*/
void
zpool_close(zpool_handle_t *zhp)
{
if (zhp->zpool_config)
nvlist_free(zhp->zpool_config);
if (zhp->zpool_old_config)
nvlist_free(zhp->zpool_old_config);
if (zhp->zpool_props)
nvlist_free(zhp->zpool_props);
free(zhp);
}
/*
* Return the name of the pool.
*/
const char *
zpool_get_name(zpool_handle_t *zhp)
{
return (zhp->zpool_name);
}
/*
* Return the state of the pool (ACTIVE or UNAVAILABLE)
*/
int
zpool_get_state(zpool_handle_t *zhp)
{
return (zhp->zpool_state);
}
/*
* Create the named pool, using the provided vdev list. It is assumed
* that the consumer has already validated the contents of the nvlist, so we
* don't have to worry about error semantics.
*/
int
zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
nvlist_t *props, nvlist_t *fsprops)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
nvlist_t *zc_fsprops = NULL;
nvlist_t *zc_props = NULL;
2008-11-20 20:01:55 +00:00
char msg[1024];
char *altroot;
int ret = -1;
2008-11-20 20:01:55 +00:00
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot create '%s'"), pool);
if (!zpool_name_valid(hdl, B_FALSE, pool))
return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
if (props) {
if ((zc_props = zpool_valid_proplist(hdl, pool, props,
SPA_VERSION_1, B_TRUE, msg)) == NULL) {
goto create_failed;
}
}
2008-11-20 20:01:55 +00:00
if (fsprops) {
uint64_t zoned;
char *zonestr;
zoned = ((nvlist_lookup_string(fsprops,
zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
strcmp(zonestr, "on") == 0);
if ((zc_fsprops = zfs_valid_proplist(hdl,
ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, msg)) == NULL) {
goto create_failed;
}
if (!zc_props &&
(nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) {
goto create_failed;
}
if (nvlist_add_nvlist(zc_props,
ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) {
goto create_failed;
}
2008-11-20 20:01:55 +00:00
}
if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
goto create_failed;
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) {
2008-11-20 20:01:55 +00:00
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(zc_fsprops);
2008-11-20 20:01:55 +00:00
switch (errno) {
case EBUSY:
/*
* This can happen if the user has specified the same
* device multiple times. We can't reliably detect this
* until we try to add it and see we already have a
* label. This can also happen under if the device is
* part of an active md or lvm device.
2008-11-20 20:01:55 +00:00
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more vdevs refer to the same device, or one of\n"
"the devices is part of an active md or lvm device"));
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_BADDEV, msg));
case EOVERFLOW:
/*
* This occurs when one of the devices is below
* SPA_MINDEVSIZE. Unfortunately, we can't detect which
* device was the problem device since there's no
* reliable way to determine device size from userland.
*/
{
char buf[64];
zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is less than the "
"minimum size (%s)"), buf);
}
return (zfs_error(hdl, EZFS_BADDEV, msg));
case ENOSPC:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is out of space"));
return (zfs_error(hdl, EZFS_BADDEV, msg));
case ENOTBLK:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cache device must be a disk or disk slice"));
return (zfs_error(hdl, EZFS_BADDEV, msg));
default:
return (zpool_standard_error(hdl, errno, msg));
}
}
/*
* If this is an alternate root pool, then we automatically set the
* mountpoint of the root dataset to be '/'.
*/
if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT),
&altroot) == 0) {
zfs_handle_t *zhp;
verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL);
verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
"/") == 0);
zfs_close(zhp);
}
create_failed:
2008-11-20 20:01:55 +00:00
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
nvlist_free(zc_fsprops);
return (ret);
2008-11-20 20:01:55 +00:00
}
/*
* Destroy the given pool. It is up to the caller to ensure that there are no
* datasets left in the pool.
*/
int
zpool_destroy(zpool_handle_t *zhp)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
zfs_handle_t *zfp = NULL;
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
if (zhp->zpool_state == POOL_STATE_ACTIVE &&
(zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
ZFS_TYPE_FILESYSTEM)) == NULL)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot destroy '%s'"), zhp->zpool_name);
if (errno == EROFS) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is read only"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
} else {
(void) zpool_standard_error(hdl, errno, msg);
}
if (zfp)
zfs_close(zfp);
return (-1);
}
if (zfp) {
remove_mountpoint(zfp);
zfs_close(zfp);
}
return (0);
}
/*
* Add the given vdevs to the pool. The caller must have already performed the
* necessary verification to ensure that the vdev specification is well-formed.
*/
int
zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
int ret;
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot add to '%s'"), zhp->zpool_name);
if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
SPA_VERSION_SPARES &&
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
"upgraded to add hot spares"));
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
if (pool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot,
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
uint64_t s;
for (s = 0; s < nspares; s++) {
char *path;
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
&path) == 0 && pool_uses_efi(spares[s])) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device '%s' contains an EFI label and "
"cannot be used on root pools."),
zpool_vdev_name(hdl, NULL, spares[s],
B_FALSE));
return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
}
}
}
2008-11-20 20:01:55 +00:00
if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
SPA_VERSION_L2CACHE &&
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
&l2cache, &nl2cache) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
"upgraded to add cache devices"));
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
switch (errno) {
case EBUSY:
/*
* This can happen if the user has specified the same
* device multiple times. We can't reliably detect this
* until we try to add it and see we already have a
* label.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more vdevs refer to the same device"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case EOVERFLOW:
/*
* This occurrs when one of the devices is below
* SPA_MINDEVSIZE. Unfortunately, we can't detect which
* device was the problem device since there's no
* reliable way to determine device size from userland.
*/
{
char buf[64];
zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf));
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device is less than the minimum "
"size (%s)"), buf);
}
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case ENOTSUP:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgraded to add these vdevs"));
(void) zfs_error(hdl, EZFS_BADVERSION, msg);
break;
case EDOM:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"root pool can not have multiple vdevs"
" or separate logs"));
(void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg);
break;
case ENOTBLK:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cache device must be a disk or disk slice"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
default:
(void) zpool_standard_error(hdl, errno, msg);
}
ret = -1;
} else {
ret = 0;
}
zcmd_free_nvlists(&zc);
return (ret);
}
/*
* Exports the pool from the system. The caller must ensure that there are no
* mounted datasets in the pool.
*/
int
2009-01-15 21:59:39 +00:00
zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
char msg[1024];
2008-11-20 20:01:55 +00:00
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot export '%s'"), zhp->zpool_name);
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = force;
2009-01-15 21:59:39 +00:00
zc.zc_guid = hardforce;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
switch (errno) {
case EXDEV:
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
"use '-f' to override the following errors:\n"
"'%s' has an active shared spare which could be"
" used by other pools once '%s' is exported."),
zhp->zpool_name, zhp->zpool_name);
return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE,
msg));
default:
return (zpool_standard_error_fmt(zhp->zpool_hdl, errno,
msg));
}
}
2008-11-20 20:01:55 +00:00
return (0);
}
2009-01-15 21:59:39 +00:00
int
zpool_export(zpool_handle_t *zhp, boolean_t force)
{
return (zpool_export_common(zhp, force, B_FALSE));
}
int
zpool_export_force(zpool_handle_t *zhp)
{
return (zpool_export_common(zhp, B_TRUE, B_TRUE));
}
static void
zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
nvlist_t *rbi)
{
uint64_t rewindto;
int64_t loss = -1;
struct tm t;
char timestr[128];
if (!hdl->libzfs_printerr || rbi == NULL)
return;
if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
return;
(void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss);
if (localtime_r((time_t *)&rewindto, &t) != NULL &&
strftime(timestr, 128, "%c", &t) != 0) {
if (dryrun) {
(void) printf(dgettext(TEXT_DOMAIN,
"Would be able to return %s "
"to its state as of %s.\n"),
name, timestr);
} else {
(void) printf(dgettext(TEXT_DOMAIN,
"Pool %s returned to its state as of %s.\n"),
name, timestr);
}
if (loss > 120) {
(void) printf(dgettext(TEXT_DOMAIN,
"%s approximately %lld "),
dryrun ? "Would discard" : "Discarded",
((longlong_t)loss + 30) / 60);
(void) printf(dgettext(TEXT_DOMAIN,
"minutes of transactions.\n"));
} else if (loss > 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"%s approximately %lld "),
dryrun ? "Would discard" : "Discarded",
(longlong_t)loss);
(void) printf(dgettext(TEXT_DOMAIN,
"seconds of transactions.\n"));
}
}
}
void
zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
nvlist_t *config)
{
int64_t loss = -1;
uint64_t edata = UINT64_MAX;
uint64_t rewindto;
struct tm t;
char timestr[128];
if (!hdl->libzfs_printerr)
return;
if (reason >= 0)
(void) printf(dgettext(TEXT_DOMAIN, "action: "));
else
(void) printf(dgettext(TEXT_DOMAIN, "\t"));
/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
if (nvlist_lookup_uint64(config,
ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
goto no_info;
(void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss);
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
&edata);
(void) printf(dgettext(TEXT_DOMAIN,
"Recovery is possible, but will result in some data loss.\n"));
if (localtime_r((time_t *)&rewindto, &t) != NULL &&
strftime(timestr, 128, "%c", &t) != 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"\tReturning the pool to its state as of %s\n"
"\tshould correct the problem. "),
timestr);
} else {
(void) printf(dgettext(TEXT_DOMAIN,
"\tReverting the pool to an earlier state "
"should correct the problem.\n\t"));
}
if (loss > 120) {
(void) printf(dgettext(TEXT_DOMAIN,
"Approximately %lld minutes of data\n"
"\tmust be discarded, irreversibly. "),
((longlong_t)loss + 30) / 60);
} else if (loss > 0) {
(void) printf(dgettext(TEXT_DOMAIN,
"Approximately %lld seconds of data\n"
"\tmust be discarded, irreversibly. "),
(longlong_t)loss);
}
if (edata != 0 && edata != UINT64_MAX) {
if (edata == 1) {
(void) printf(dgettext(TEXT_DOMAIN,
"After rewind, at least\n"
"\tone persistent user-data error will remain. "));
} else {
(void) printf(dgettext(TEXT_DOMAIN,
"After rewind, several\n"
"\tpersistent user-data errors will remain. "));
}
}
(void) printf(dgettext(TEXT_DOMAIN,
"Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "),
reason >= 0 ? "clear" : "import", name);
(void) printf(dgettext(TEXT_DOMAIN,
"A scrub of the pool\n"
"\tis strongly recommended after recovery.\n"));
return;
no_info:
(void) printf(dgettext(TEXT_DOMAIN,
"Destroy and re-create the pool from\n\ta backup source.\n"));
}
2008-11-20 20:01:55 +00:00
/*
* zpool_import() is a contracted interface. Should be kept the same
* if possible.
*
* Applications should use zpool_import_props() to import a pool with
* new properties value to be set.
*/
int
zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
char *altroot)
{
nvlist_t *props = NULL;
int ret;
if (altroot != NULL) {
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
return (zfs_error_fmt(hdl, EZFS_NOMEM,
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
newname));
}
if (nvlist_add_string(props,
2009-01-15 21:59:39 +00:00
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 ||
nvlist_add_string(props,
zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) {
2008-11-20 20:01:55 +00:00
nvlist_free(props);
return (zfs_error_fmt(hdl, EZFS_NOMEM,
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
newname));
}
}
ret = zpool_import_props(hdl, config, newname, props, B_FALSE);
2008-11-20 20:01:55 +00:00
if (props)
nvlist_free(props);
return (ret);
}
/*
* Import the given pool using the known configuration and a list of
* properties to be set. The configuration should have come from
* zpool_find_import(). The 'newname' parameters control whether the pool
* is imported with a different name.
*/
int
zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
nvlist_t *props, boolean_t importfaulted)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
zpool_rewind_policy_t policy;
nvlist_t *nvi = NULL;
2008-11-20 20:01:55 +00:00
char *thename;
char *origname;
uint64_t returned_size;
2008-11-20 20:01:55 +00:00
int ret;
char errbuf[1024];
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&origname) == 0);
(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
"cannot import pool '%s'"), origname);
if (newname != NULL) {
if (!zpool_name_valid(hdl, B_FALSE, newname))
return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
newname));
thename = (char *)newname;
} else {
thename = origname;
}
if (props) {
uint64_t version;
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
&version) == 0);
if ((props = zpool_valid_proplist(hdl, origname,
2008-11-20 20:01:55 +00:00
props, version, B_TRUE, errbuf)) == NULL) {
return (-1);
} else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
nvlist_free(props);
return (-1);
}
}
(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
&zc.zc_guid) == 0);
if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
nvlist_free(props);
return (-1);
}
returned_size = zc.zc_nvlist_conf_size + 512;
if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) {
nvlist_free(props);
return (-1);
}
2008-11-20 20:01:55 +00:00
zc.zc_cookie = (uint64_t)importfaulted;
2008-11-20 20:01:55 +00:00
ret = 0;
if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
char desc[1024];
(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
zpool_get_rewind_policy(config, &policy);
/*
* Dry-run failed, but we print out what success
* looks like if we found a best txg
*/
if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) {
zpool_rewind_exclaim(hdl, newname ? origname : thename,
B_TRUE, nvi);
nvlist_free(nvi);
return (-1);
}
2008-11-20 20:01:55 +00:00
if (newname == NULL)
(void) snprintf(desc, sizeof (desc),
dgettext(TEXT_DOMAIN, "cannot import '%s'"),
thename);
else
(void) snprintf(desc, sizeof (desc),
dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
origname, thename);
switch (errno) {
case ENOTSUP:
/*
* Unsupported version.
*/
(void) zfs_error(hdl, EZFS_BADVERSION, desc);
break;
case EINVAL:
(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
break;
case EROFS:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"one or more devices is read only"));
(void) zfs_error(hdl, EZFS_BADDEV, desc);
break;
2008-11-20 20:01:55 +00:00
default:
(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
2008-11-20 20:01:55 +00:00
(void) zpool_standard_error(hdl, errno, desc);
zpool_explain_recover(hdl,
newname ? origname : thename, -errno, nvi);
nvlist_free(nvi);
break;
2008-11-20 20:01:55 +00:00
}
ret = -1;
} else {
zpool_handle_t *zhp;
/*
* This should never fail, but play it safe anyway.
*/
if (zpool_open_silent(hdl, thename, &zhp) != 0)
2008-11-20 20:01:55 +00:00
ret = -1;
else if (zhp != NULL)
2008-11-20 20:01:55 +00:00
zpool_close(zhp);
(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
zpool_get_rewind_policy(config, &policy);
if (policy.zrp_request &
(ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
zpool_rewind_exclaim(hdl, newname ? origname : thename,
((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
nvi);
2008-11-20 20:01:55 +00:00
}
nvlist_free(nvi);
return (0);
2008-11-20 20:01:55 +00:00
}
zcmd_free_nvlists(&zc);
nvlist_free(props);
return (ret);
}
/*
* Scan the pool.
2008-11-20 20:01:55 +00:00
*/
int
zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_cookie = func;
2008-11-20 20:01:55 +00:00
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
(errno == ENOENT && func != POOL_SCAN_NONE))
2008-11-20 20:01:55 +00:00
return (0);
if (func == POOL_SCAN_SCRUB) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
} else if (func == POOL_SCAN_NONE) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
zc.zc_name);
} else {
assert(!"unexpected result");
}
2008-11-20 20:01:55 +00:00
if (errno == EBUSY) {
nvlist_t *nvroot;
pool_scan_stat_t *ps = NULL;
uint_t psc;
verify(nvlist_lookup_nvlist(zhp->zpool_config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
if (ps && ps->pss_func == POOL_SCAN_SCRUB)
return (zfs_error(hdl, EZFS_SCRUBBING, msg));
else
return (zfs_error(hdl, EZFS_RESILVERING, msg));
} else if (errno == ENOENT) {
return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
} else {
2008-11-20 20:01:55 +00:00
return (zpool_standard_error(hdl, errno, msg));
}
}
/*
* This provides a very minimal check whether a given string is likely a
* c#t#d# style string. Users of this are expected to do their own
* verification of the s# part.
*/
#define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1]))
/*
* More elaborate version for ones which may start with "/dev/dsk/"
* and the like.
*/
static int
ctd_check_path(char *str) {
/*
* If it starts with a slash, check the last component.
*/
if (str && str[0] == '/') {
char *tmp = strrchr(str, '/');
/*
* If it ends in "/old", check the second-to-last
* component of the string instead.
*/
if (tmp != str && strcmp(tmp, "/old") == 0) {
for (tmp--; *tmp != '/'; tmp--)
;
}
str = tmp + 1;
}
return (CTD_CHECK(str));
2008-11-20 20:01:55 +00:00
}
/*
2009-07-02 22:44:48 +00:00
* Find a vdev that matches the search criteria specified. We use the
* the nvpair name to determine how we should look for the device.
2008-11-20 20:01:55 +00:00
* 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL
* spare; but FALSE if its an INUSE spare.
*/
static nvlist_t *
2009-07-02 22:44:48 +00:00
vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
2008-11-20 20:01:55 +00:00
{
uint_t c, children;
nvlist_t **child;
nvlist_t *ret;
uint64_t is_log;
2009-07-02 22:44:48 +00:00
char *srchkey;
nvpair_t *pair = nvlist_next_nvpair(search, NULL);
2008-11-20 20:01:55 +00:00
2009-07-02 22:44:48 +00:00
/* Nothing to look for */
if (search == NULL || pair == NULL)
return (NULL);
/* Obtain the key we will use to search */
srchkey = nvpair_name(pair);
switch (nvpair_type(pair)) {
case DATA_TYPE_UINT64: {
uint64_t srchval, theguid, present;
verify(nvpair_value_uint64(pair, &srchval) == 0);
if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
&present) == 0) {
/*
* If the device has never been present since
* import, the only reliable way to match the
* vdev is by GUID.
*/
verify(nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_GUID, &theguid) == 0);
if (theguid == srchval)
return (nv);
}
}
break;
}
case DATA_TYPE_STRING: {
char *srchval, *val;
verify(nvpair_value_string(pair, &srchval) == 0);
if (nvlist_lookup_string(nv, srchkey, &val) != 0)
break;
2008-11-20 20:01:55 +00:00
/*
* Search for the requested value. Special cases:
*
* - ZPOOL_CONFIG_PATH for whole disk entries. These end in
* "s0" or "s0/old". The "s0" part is hidden from the user,
* but included in the string, so this matches around it.
* - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
*
* Otherwise, all other searches are simple string compares.
2008-11-20 20:01:55 +00:00
*/
if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
ctd_check_path(val)) {
2009-07-02 22:44:48 +00:00
uint64_t wholedisk = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
if (wholedisk) {
int slen = strlen(srchval);
int vlen = strlen(val);
if (slen != vlen - 2)
break;
/*
* make_leaf_vdev() should only set
* wholedisk for ZPOOL_CONFIG_PATHs which
* will include "/dev/dsk/", giving plenty of
* room for the indices used next.
*/
ASSERT(vlen >= 6);
2009-07-02 22:44:48 +00:00
/*
* strings identical except trailing "s0"
2009-07-02 22:44:48 +00:00
*/
if (strcmp(&val[vlen - 2], "s0") == 0 &&
strncmp(srchval, val, slen) == 0)
2009-07-02 22:44:48 +00:00
return (nv);
2009-07-02 22:44:48 +00:00
/*
* strings identical except trailing "s0/old"
2009-07-02 22:44:48 +00:00
*/
if (strcmp(&val[vlen - 6], "s0/old") == 0 &&
strcmp(&srchval[slen - 4], "/old") == 0 &&
strncmp(srchval, val, slen - 4) == 0)
2009-07-02 22:44:48 +00:00
return (nv);
2009-07-02 22:44:48 +00:00
break;
}
} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
char *type, *idx, *end, *p;
uint64_t id, vdev_id;
/*
* Determine our vdev type, keeping in mind
* that the srchval is composed of a type and
* vdev id pair (i.e. mirror-4).
*/
if ((type = strdup(srchval)) == NULL)
return (NULL);
if ((p = strrchr(type, '-')) == NULL) {
free(type);
2009-07-02 22:44:48 +00:00
break;
}
idx = p + 1;
*p = '\0';
/*
* If the types don't match then keep looking.
*/
if (strncmp(val, type, strlen(val)) != 0) {
free(type);
break;
}
verify(strncmp(type, VDEV_TYPE_RAIDZ,
strlen(VDEV_TYPE_RAIDZ)) == 0 ||
strncmp(type, VDEV_TYPE_MIRROR,
strlen(VDEV_TYPE_MIRROR)) == 0);
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&id) == 0);
errno = 0;
vdev_id = strtoull(idx, &end, 10);
free(type);
if (errno != 0)
return (NULL);
/*
* Now verify that we have the correct vdev id.
*/
if (vdev_id == id)
return (nv);
2008-11-20 20:01:55 +00:00
}
/*
2009-07-02 22:44:48 +00:00
* Common case
2008-11-20 20:01:55 +00:00
*/
2009-07-02 22:44:48 +00:00
if (strcmp(srchval, val) == 0)
2008-11-20 20:01:55 +00:00
return (nv);
2009-07-02 22:44:48 +00:00
break;
}
default:
break;
2008-11-20 20:01:55 +00:00
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
return (NULL);
for (c = 0; c < children; c++) {
2009-07-02 22:44:48 +00:00
if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
/*
* The 'is_log' value is only set for the toplevel
* vdev, not the leaf vdevs. So we always lookup the
* log device from the root of the vdev tree (where
* 'log' is non-NULL).
*/
if (log != NULL &&
nvlist_lookup_uint64(child[c],
ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
is_log) {
*log = B_TRUE;
}
2008-11-20 20:01:55 +00:00
return (ret);
}
}
2008-11-20 20:01:55 +00:00
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
2009-07-02 22:44:48 +00:00
if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
2008-11-20 20:01:55 +00:00
*avail_spare = B_TRUE;
return (ret);
}
}
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
2009-07-02 22:44:48 +00:00
if ((ret = vdev_to_nvlist_iter(child[c], search,
avail_spare, l2cache, NULL)) != NULL) {
2008-11-20 20:01:55 +00:00
*l2cache = B_TRUE;
return (ret);
}
}
}
return (NULL);
}
2009-07-02 22:44:48 +00:00
/*
* Given a physical path (minus the "/devices" prefix), find the
* associated vdev.
*/
nvlist_t *
zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
{
nvlist_t *search, *nvroot, *ret;
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0);
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
*avail_spare = B_FALSE;
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
nvlist_free(search);
return (ret);
}
/*
* Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
*/
boolean_t
zpool_vdev_is_interior(const char *name)
{
if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
return (B_TRUE);
return (B_FALSE);
}
2008-11-20 20:01:55 +00:00
nvlist_t *
zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
boolean_t *l2cache, boolean_t *log)
2008-11-20 20:01:55 +00:00
{
char buf[MAXPATHLEN];
char *end;
2009-07-02 22:44:48 +00:00
nvlist_t *nvroot, *search, *ret;
2008-11-20 20:01:55 +00:00
uint64_t guid;
2009-07-02 22:44:48 +00:00
verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2008-11-20 20:01:55 +00:00
guid = strtoull(path, &end, 10);
if (guid != 0 && *end == '\0') {
2009-07-02 22:44:48 +00:00
verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
} else if (zpool_vdev_is_interior(path)) {
verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
2008-11-20 20:01:55 +00:00
} else if (path[0] != '/') {
2008-12-05 19:25:15 +00:00
(void) snprintf(buf, sizeof (buf), "%s/%s", DISK_ROOT, path);
2009-07-02 22:44:48 +00:00
verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
2008-11-20 20:01:55 +00:00
} else {
2009-07-02 22:44:48 +00:00
verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
2008-11-20 20:01:55 +00:00
}
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
*avail_spare = B_FALSE;
*l2cache = B_FALSE;
if (log != NULL)
*log = B_FALSE;
2009-07-02 22:44:48 +00:00
ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
nvlist_free(search);
return (ret);
}
static int
vdev_online(nvlist_t *nv)
{
uint64_t ival;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
return (0);
return (1);
}
/*
2009-07-02 22:44:48 +00:00
* Helper function for zpool_get_physpaths().
*/
2009-07-02 22:44:48 +00:00
static int
vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size,
size_t *bytes_written)
{
size_t bytes_left, pos, rsz;
char *tmppath;
const char *format;
if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH,
&tmppath) != 0)
return (EZFS_NODEVICE);
pos = *bytes_written;
bytes_left = physpath_size - pos;
format = (pos == 0) ? "%s" : " %s";
rsz = snprintf(physpath + pos, bytes_left, format, tmppath);
*bytes_written += rsz;
if (rsz >= bytes_left) {
/* if physpath was not copied properly, clear it */
if (bytes_left != 0) {
physpath[pos] = 0;
}
return (EZFS_NOSPC);
}
return (0);
}
static int
vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size,
size_t *rsz, boolean_t is_spare)
{
char *type;
int ret;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
return (EZFS_INVALCONFIG);
if (strcmp(type, VDEV_TYPE_DISK) == 0) {
/*
* An active spare device has ZPOOL_CONFIG_IS_SPARE set.
* For a spare vdev, we only want to boot from the active
* spare device.
*/
if (is_spare) {
uint64_t spare = 0;
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
&spare);
if (!spare)
return (EZFS_INVALCONFIG);
}
if (vdev_online(nv)) {
if ((ret = vdev_get_one_physpath(nv, physpath,
phypath_size, rsz)) != 0)
return (ret);
}
} else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
(is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
nvlist_t **child;
uint_t count;
int i, ret;
if (nvlist_lookup_nvlist_array(nv,
ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
return (EZFS_INVALCONFIG);
for (i = 0; i < count; i++) {
ret = vdev_get_physpaths(child[i], physpath,
phypath_size, rsz, is_spare);
if (ret == EZFS_NOSPC)
return (ret);
}
}
return (EZFS_POOL_INVALARG);
}
/*
* Get phys_path for a root pool config.
* Return 0 on success; non-zero on failure.
*/
static int
zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size)
{
2009-07-02 22:44:48 +00:00
size_t rsz;
nvlist_t *vdev_root;
nvlist_t **child;
uint_t count;
2009-07-02 22:44:48 +00:00
char *type;
2009-07-02 22:44:48 +00:00
rsz = 0;
2009-07-02 22:44:48 +00:00
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&vdev_root) != 0)
return (EZFS_INVALCONFIG);
2009-07-02 22:44:48 +00:00
if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 ||
nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN,
&child, &count) != 0)
2009-07-02 22:44:48 +00:00
return (EZFS_INVALCONFIG);
2009-07-02 22:44:48 +00:00
/*
* root pool can not have EFI labeled disks and can only have
* a single top-level vdev.
*/
if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 ||
pool_uses_efi(vdev_root))
return (EZFS_POOL_INVALARG);
2009-07-02 22:44:48 +00:00
(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
B_FALSE);
/* No online devices */
if (rsz == 0)
return (EZFS_NODEVICE);
return (0);
2008-11-20 20:01:55 +00:00
}
2009-07-02 22:44:48 +00:00
/*
* Get phys_path for a root pool
* Return 0 on success; non-zero on failure.
*/
int
zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
{
return (zpool_get_config_physpath(zhp->zpool_config, physpath,
phypath_size));
}
/*
* If the device has being dynamically expanded then we need to relabel
* the disk to use the new unallocated space.
*/
static int
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
zpool_relabel_disk(libzfs_handle_t *hdl, const char *path)
2009-07-02 22:44:48 +00:00
{
char errbuf[1024];
int fd, error;
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
2009-07-02 22:44:48 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
"relabel '%s': unable to open device"), path);
2009-07-02 22:44:48 +00:00
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
}
/*
* It's possible that we might encounter an error if the device
* does not have any unallocated space left. If so, we simply
* ignore that error and continue on.
*/
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
error = efi_use_whole_disk(fd);
2009-07-02 22:44:48 +00:00
(void) close(fd);
if (error && error != VT_ENOSPC) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
"relabel '%s': unable to read disk capacity"), path);
2009-07-02 22:44:48 +00:00
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
}
return (0);
}
2008-11-20 20:01:55 +00:00
/*
* Bring the specified vdev online. The 'flags' parameter is a set of the
* ZFS_ONLINE_* flags.
*/
int
zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
vdev_state_t *newstate)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
nvlist_t *tgt;
2009-07-02 22:44:48 +00:00
boolean_t avail_spare, l2cache, islog;
2008-11-20 20:01:55 +00:00
libzfs_handle_t *hdl = zhp->zpool_hdl;
2009-07-02 22:44:48 +00:00
if (flags & ZFS_ONLINE_EXPAND) {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot expand %s"), path);
} else {
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot online %s"), path);
}
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
2009-07-02 22:44:48 +00:00
&islog)) == NULL)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (avail_spare)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_ISSPARE, msg));
2009-07-02 22:44:48 +00:00
if (flags & ZFS_ONLINE_EXPAND ||
zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
char *pathname = NULL;
uint64_t wholedisk = 0;
(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
&pathname) == 0);
/*
* XXX - L2ARC 1.0 devices can't support expansion.
*/
if (l2cache) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot expand cache devices"));
return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg));
}
if (wholedisk) {
(void) zpool_relabel_disk(zhp->zpool_hdl, pathname);
}
}
2008-11-20 20:01:55 +00:00
zc.zc_cookie = VDEV_STATE_ONLINE;
zc.zc_obj = flags;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
if (errno == EINVAL) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
"from this pool into a new one. Use '%s' "
"instead"), "zpool detach");
return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg));
}
2008-11-20 20:01:55 +00:00
return (zpool_standard_error(hdl, errno, msg));
}
2008-11-20 20:01:55 +00:00
*newstate = zc.zc_cookie;
return (0);
}
/*
* Take the specified vdev offline
*/
int
zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
nvlist_t *tgt;
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
NULL)) == NULL)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (avail_spare)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_ISSPARE, msg));
zc.zc_cookie = VDEV_STATE_OFFLINE;
zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
switch (errno) {
case EBUSY:
/*
* There are no other replicas of this device.
*/
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
2009-07-02 22:44:48 +00:00
case EEXIST:
/*
* The log device has unplayed logs
*/
return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg));
2008-11-20 20:01:55 +00:00
default:
return (zpool_standard_error(hdl, errno, msg));
}
}
/*
* Mark the given vdev faulted.
*/
int
zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
2008-12-04 22:41:11 +00:00
dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid);
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_FAULTED;
zc.zc_obj = aux;
2008-11-20 20:01:55 +00:00
if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
switch (errno) {
case EBUSY:
/*
* There are no other replicas of this device.
*/
return (zfs_error(hdl, EZFS_NOREPLICAS, msg));
default:
return (zpool_standard_error(hdl, errno, msg));
}
}
/*
* Mark the given vdev degraded.
*/
int
zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
2008-12-04 22:41:11 +00:00
dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid);
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = VDEV_STATE_DEGRADED;
zc.zc_obj = aux;
2008-11-20 20:01:55 +00:00
if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Returns TRUE if the given nvlist is a vdev that was originally swapped in as
* a hot spare.
*/
static boolean_t
is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
{
nvlist_t **child;
uint_t c, children;
char *type;
if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child,
&children) == 0) {
verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
&type) == 0);
if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
children == 2 && child[which] == tgt)
return (B_TRUE);
for (c = 0; c < children; c++)
if (is_replacing_spare(child[c], tgt, which))
return (B_TRUE);
}
return (B_FALSE);
}
/*
* Attach new_disk (fully described by nvroot) to old_disk.
* If 'replacing' is specified, the new disk will replace the old one.
*/
int
zpool_vdev_attach(zpool_handle_t *zhp,
const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
int ret;
nvlist_t *tgt;
boolean_t avail_spare, l2cache, islog;
uint64_t val;
char *path, *newname;
2008-11-20 20:01:55 +00:00
nvlist_t **child;
uint_t children;
nvlist_t *config_root;
libzfs_handle_t *hdl = zhp->zpool_hdl;
boolean_t rootpool = pool_is_bootable(zhp);
2008-11-20 20:01:55 +00:00
if (replacing)
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot replace %s with %s"), old_disk, new_disk);
else
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot attach %s to %s"), new_disk, old_disk);
/*
* If this is a root pool, make sure that we're not attaching an
* EFI labeled device.
*/
if (rootpool && pool_uses_efi(nvroot)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"EFI labeled devices are not supported on root pools."));
return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
}
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
&islog)) == 0)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
if (l2cache)
return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
zc.zc_cookie = replacing;
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0 || children != 1) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"new device must be a single disk"));
return (zfs_error(hdl, EZFS_INVALCONFIG, msg));
}
verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0);
if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL)
return (-1);
2008-11-20 20:01:55 +00:00
/*
* If the target is a hot spare that has been swapped in, we can only
* replace it with another hot spare.
*/
if (replacing &&
nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
(zpool_find_vdev(zhp, newname, &avail_spare, &l2cache,
NULL) == NULL || !avail_spare) &&
is_replacing_spare(config_root, tgt, 1)) {
2008-11-20 20:01:55 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only be replaced by another hot spare"));
free(newname);
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_BADTARGET, msg));
}
/*
* If we are attempting to replace a spare, it canot be applied to an
* already spared device.
*/
if (replacing &&
nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
zpool_find_vdev(zhp, newname, &avail_spare,
&l2cache, NULL) != NULL && avail_spare &&
is_replacing_spare(config_root, tgt, 0)) {
2008-11-20 20:01:55 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device has already been replaced with a spare"));
free(newname);
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_BADTARGET, msg));
}
free(newname);
2008-11-20 20:01:55 +00:00
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ATTACH, &zc);
zcmd_free_nvlists(&zc);
if (ret == 0) {
if (rootpool) {
/*
* XXX - This should be removed once we can
* automatically install the bootblocks on the
* newly attached disk.
*/
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please "
"be sure to invoke %s to make '%s' bootable.\n"),
BOOTCMD, new_disk);
2009-07-02 22:44:48 +00:00
/*
* XXX need a better way to prevent user from
* booting up a half-baked vdev.
*/
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
"sure to wait until resilver is done "
"before rebooting.\n"));
}
2008-11-20 20:01:55 +00:00
return (0);
}
2008-11-20 20:01:55 +00:00
switch (errno) {
case ENOTSUP:
/*
* Can't attach to or replace this type of vdev.
*/
if (replacing) {
if (islog)
2008-11-20 20:01:55 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a log with a spare"));
else
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a replacing device"));
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only attach to mirrors and top-level "
"disks"));
}
(void) zfs_error(hdl, EZFS_BADTARGET, msg);
break;
case EINVAL:
/*
* The new device must be a single disk.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"new device must be a single disk"));
(void) zfs_error(hdl, EZFS_INVALCONFIG, msg);
break;
case EBUSY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"),
new_disk);
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case EOVERFLOW:
/*
* The new device is too small.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device is too small"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case EDOM:
/*
* The new device has a different alignment requirement.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"devices have different sector alignment"));
(void) zfs_error(hdl, EZFS_BADDEV, msg);
break;
case ENAMETOOLONG:
/*
* The resulting top-level vdev spec won't fit in the label.
*/
(void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg);
break;
default:
(void) zpool_standard_error(hdl, errno, msg);
}
return (-1);
}
/*
* Detach the specified device.
*/
int
zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
nvlist_t *tgt;
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
NULL)) == 0)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
if (l2cache)
return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
return (0);
switch (errno) {
case ENOTSUP:
/*
* Can't detach from this type of vdev.
*/
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
"applicable to mirror and replacing vdevs"));
(void) zfs_error(zhp->zpool_hdl, EZFS_BADTARGET, msg);
break;
case EBUSY:
/*
* There are no other replicas of this device.
*/
(void) zfs_error(hdl, EZFS_NOREPLICAS, msg);
break;
default:
(void) zpool_standard_error(hdl, errno, msg);
}
return (-1);
}
/*
* Find a mirror vdev in the source nvlist.
*
* The mchild array contains a list of disks in one of the top-level mirrors
* of the source pool. The schild array contains a list of disks that the
* user specified on the command line. We loop over the mchild array to
* see if any entry in the schild array matches.
*
* If a disk in the mchild array is found in the schild array, we return
* the index of that entry. Otherwise we return -1.
*/
static int
find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren,
nvlist_t **schild, uint_t schildren)
{
uint_t mc;
for (mc = 0; mc < mchildren; mc++) {
uint_t sc;
char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp,
mchild[mc], B_FALSE);
for (sc = 0; sc < schildren; sc++) {
char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp,
schild[sc], B_FALSE);
boolean_t result = (strcmp(mpath, spath) == 0);
free(spath);
if (result) {
free(mpath);
return (mc);
}
}
free(mpath);
}
return (-1);
}
/*
* Split a mirror pool. If newroot points to null, then a new nvlist
* is generated and it is the responsibility of the caller to free it.
*/
int
zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot,
nvlist_t *props, splitflags_t flags)
{
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL;
nvlist_t **varray = NULL, *zc_props = NULL;
uint_t c, children, newchildren, lastlog = 0, vcount, found = 0;
libzfs_handle_t *hdl = zhp->zpool_hdl;
uint64_t vers;
boolean_t freelist = B_FALSE, memory_err = B_TRUE;
int retval = 0;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name);
if (!zpool_name_valid(hdl, B_FALSE, newname))
return (zfs_error(hdl, EZFS_INVALIDNAME, msg));
if ((config = zpool_get_config(zhp, NULL)) == NULL) {
(void) fprintf(stderr, gettext("Internal error: unable to "
"retrieve pool configuration\n"));
return (-1);
}
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree)
== 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
if (props) {
if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
props, vers, B_TRUE, msg)) == NULL)
return (-1);
}
if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
&children) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Source pool is missing vdev tree"));
if (zc_props)
nvlist_free(zc_props);
return (-1);
}
varray = zfs_alloc(hdl, children * sizeof (nvlist_t *));
vcount = 0;
if (*newroot == NULL ||
nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN,
&newchild, &newchildren) != 0)
newchildren = 0;
for (c = 0; c < children; c++) {
uint64_t is_log = B_FALSE, is_hole = B_FALSE;
char *type;
nvlist_t **mchild, *vdev;
uint_t mchildren;
int entry;
/*
* Unlike cache & spares, slogs are stored in the
* ZPOOL_CONFIG_CHILDREN array. We filter them out here.
*/
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
&is_log);
(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
&is_hole);
if (is_log || is_hole) {
/*
* Create a hole vdev and put it in the config.
*/
if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0)
goto out;
if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_HOLE) != 0)
goto out;
if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE,
1) != 0)
goto out;
if (lastlog == 0)
lastlog = vcount;
varray[vcount++] = vdev;
continue;
}
lastlog = 0;
verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type)
== 0);
if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Source pool must be composed only of mirrors\n"));
retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
goto out;
}
verify(nvlist_lookup_nvlist_array(child[c],
ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
/* find or add an entry for this top-level vdev */
if (newchildren > 0 &&
(entry = find_vdev_entry(zhp, mchild, mchildren,
newchild, newchildren)) >= 0) {
/* We found a disk that the user specified. */
vdev = mchild[entry];
++found;
} else {
/* User didn't specify a disk for this vdev. */
vdev = mchild[mchildren - 1];
}
if (nvlist_dup(vdev, &varray[vcount++], 0) != 0)
goto out;
}
/* did we find every disk the user specified? */
if (found != newchildren) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must "
"include at most one disk from each mirror"));
retval = zfs_error(hdl, EZFS_INVALCONFIG, msg);
goto out;
}
/* Prepare the nvlist for populating. */
if (*newroot == NULL) {
if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0)
goto out;
freelist = B_TRUE;
if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_ROOT) != 0)
goto out;
} else {
verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0);
}
/* Add all the children we found */
if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray,
lastlog == 0 ? vcount : lastlog) != 0)
goto out;
/*
* If we're just doing a dry run, exit now with success.
*/
if (flags.dryrun) {
memory_err = B_FALSE;
freelist = B_FALSE;
goto out;
}
/* now build up the config list & call the ioctl */
if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0)
goto out;
if (nvlist_add_nvlist(newconfig,
ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 ||
nvlist_add_string(newconfig,
ZPOOL_CONFIG_POOL_NAME, newname) != 0 ||
nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0)
goto out;
/*
* The new pool is automatically part of the namespace unless we
* explicitly export it.
*/
if (!flags.import)
zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
(void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string));
if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0)
goto out;
if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0)
goto out;
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) {
retval = zpool_standard_error(hdl, errno, msg);
goto out;
}
freelist = B_FALSE;
memory_err = B_FALSE;
out:
if (varray != NULL) {
int v;
for (v = 0; v < vcount; v++)
nvlist_free(varray[v]);
free(varray);
}
zcmd_free_nvlists(&zc);
if (zc_props)
nvlist_free(zc_props);
if (newconfig)
nvlist_free(newconfig);
if (freelist) {
nvlist_free(*newroot);
*newroot = NULL;
}
if (retval != 0)
return (retval);
if (memory_err)
return (no_memory(hdl));
return (0);
}
2008-11-20 20:01:55 +00:00
/*
* Remove the given device. Currently, this is supported only for hot spares
* and level 2 cache devices.
*/
int
zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
nvlist_t *tgt;
boolean_t avail_spare, l2cache, islog;
2008-11-20 20:01:55 +00:00
libzfs_handle_t *hdl = zhp->zpool_hdl;
uint64_t version;
2008-11-20 20:01:55 +00:00
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
&islog)) == 0)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
/*
* XXX - this should just go away.
*/
if (!avail_spare && !l2cache && !islog) {
2008-11-20 20:01:55 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"only inactive hot spares, cache, top-level, "
"or log devices can be removed"));
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
}
version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
if (islog && version < SPA_VERSION_HOLES) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"pool must be upgrade to support log removal"));
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
2008-11-20 20:01:55 +00:00
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Clear the errors for the pool, or the particular device if specified.
*/
int
zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl)
2008-11-20 20:01:55 +00:00
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
nvlist_t *tgt;
zpool_rewind_policy_t policy;
2008-11-20 20:01:55 +00:00
boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
nvlist_t *nvi = NULL;
2008-11-20 20:01:55 +00:00
if (path)
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
path);
else
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot clear errors for %s"),
zhp->zpool_name);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (path) {
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
&l2cache, NULL)) == 0)
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_NODEVICE, msg));
/*
* Don't allow error clearing for hot spares. Do allow
* error clearing for l2cache devices.
*/
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
&zc.zc_guid) == 0);
}
zpool_get_rewind_policy(rewindnvl, &policy);
zc.zc_cookie = policy.zrp_request;
if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0)
return (-1);
if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0)
return (-1);
if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 ||
((policy.zrp_request & ZPOOL_TRY_REWIND) &&
errno != EPERM && errno != EACCES)) {
if (policy.zrp_request &
(ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
zpool_rewind_exclaim(hdl, zc.zc_name,
((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
nvi);
nvlist_free(nvi);
}
zcmd_free_nvlists(&zc);
2008-11-20 20:01:55 +00:00
return (0);
}
2008-11-20 20:01:55 +00:00
zcmd_free_nvlists(&zc);
2008-11-20 20:01:55 +00:00
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Similar to zpool_clear(), but takes a GUID (used by fmd).
*/
int
zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
char msg[1024];
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
2008-12-04 22:41:11 +00:00
(u_longlong_t)guid);
2008-11-20 20:01:55 +00:00
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_guid = guid;
zc.zc_cookie = ZPOOL_NO_REWIND;
2008-11-20 20:01:55 +00:00
if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
return (0);
return (zpool_standard_error(hdl, errno, msg));
}
/*
* Convert from a devid string to a path.
*/
static char *
devid_to_path(char *devid_str)
{
ddi_devid_t devid;
char *minor;
char *path;
devid_nmlist_t *list = NULL;
int ret;
if (devid_str_decode(devid_str, &devid, &minor) != 0)
return (NULL);
ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list);
devid_str_free(minor);
devid_free(devid);
if (ret != 0)
return (NULL);
if ((path = strdup(list[0].devname)) == NULL)
return (NULL);
devid_free_nmlist(list);
return (path);
}
/*
* Convert from a path to a devid string.
*/
static char *
path_to_devid(const char *path)
{
int fd;
ddi_devid_t devid;
char *minor, *ret;
if ((fd = open(path, O_RDONLY)) < 0)
return (NULL);
minor = NULL;
ret = NULL;
if (devid_get(fd, &devid) == 0) {
if (devid_get_minor_name(fd, &minor) == 0)
ret = devid_str_encode(devid, minor);
if (minor != NULL)
devid_str_free(minor);
devid_free(devid);
}
(void) close(fd);
return (ret);
}
/*
* Issue the necessary ioctl() to update the stored path value for the vdev. We
* ignore any failure here, since a common case is for an unprivileged user to
* type 'zpool status', and we'll display the correct information anyway.
*/
static void
set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
(void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
(void) strncpy(zc.zc_value, path, sizeof (zc.zc_value));
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&zc.zc_guid) == 0);
(void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc);
}
/*
* Given a vdev, return the name to display in iostat. If the vdev has a path,
* we use that, stripping off any leading "/dev/dsk/"; if not, we use the type.
* We also check if this is a whole disk, in which case we strip off the
* trailing 's0' slice name.
*
* This routine is also responsible for identifying when disks have been
* reconfigured in a new location. The kernel will have opened the device by
* devid, but the path will still refer to the old location. To catch this, we
* first do a path -> devid translation (which is fast for the common case). If
* the devid matches, we're done. If not, we do a reverse devid -> path
* translation and issue the appropriate ioctl() to update the path of the vdev.
* If 'zhp' is NULL, then this is an exported pool, and we don't need to do any
* of these checks.
*/
char *
zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
boolean_t verbose)
2008-11-20 20:01:55 +00:00
{
Always preserve the passed path at creation time so udev may be used After spending considerable time thinking about this I've come to the conclusion that on Linux systems we don't need Solaris style devid support. Instead was can simply use udev if we are careful, there are even some advantages. The Solaris style devid's are designed to provide a mechanism by which a device can be opened reliably regardless of it's location in the system. This is exactly what udev provides us on Linux, a flexible mechanism for consistently identifing the same devices regardless of probing order. We just need to be careful to always open the device by the path provided at creation time, this path must be stored in ZPOOL_CONFIG_PATH. This in fact has certain advantages. For example, if in your system you always want the zpool to be able to locate the disk regardless of physical location you can create the pool using /dev/disk/by-id/. This is perhaps what you'ld want on a desktop system where the exact location is not that important. It's more critical that all the disks can be found. However, in an enterprise setup there's a good chace that the physical location of each drive is important. You have like set things up such that your raid groups span multiple hosts adapters, such that you can lose an adapter without downtime. In this case you would want to use the /dev/disk/by-path/ path to ensure the path information is preserved and you always open the disks at the right physical locations. This would ensure your system never gets accidently misconfigured and still just works because the zpool was still able to locate the disk. Finally, if you want to get really fancy you can always create your own udev rules. This way you could implement whatever lookup sceme you wanted in user space for your drives. This would include nice cosmetic things like being able to control the device names in tools like zpool status, since the name as just based of the device names. I've yet to come up with a good reason to implement devid support on Linux since we have udev. But I've still just commented it out for now because somebody might come up with a really good I forgot.
2009-10-19 20:46:48 +00:00
char *path, *devid, *type;
2008-11-20 20:01:55 +00:00
uint64_t value;
char buf[64];
vdev_stat_t *vs;
uint_t vsc;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
&value) == 0) {
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
&value) == 0);
(void) snprintf(buf, sizeof (buf), "%llu",
(u_longlong_t)value);
path = buf;
} else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
/*
* If the device is dead (faulted, offline, etc) then don't
* bother opening it. Otherwise we may be forcing the user to
* open a misbehaving device, which can have undesirable
* effects.
*/
if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
2008-11-20 20:01:55 +00:00
(uint64_t **)&vs, &vsc) != 0 ||
vs->vs_state >= VDEV_STATE_DEGRADED) &&
zhp != NULL &&
nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
/*
* Determine if the current path is correct.
*/
char *newdevid = path_to_devid(path);
if (newdevid == NULL ||
strcmp(devid, newdevid) != 0) {
char *newpath;
if ((newpath = devid_to_path(devid)) != NULL) {
/*
* Update the path appropriately.
*/
set_path(zhp, nv, newpath);
if (nvlist_add_string(nv,
ZPOOL_CONFIG_PATH, newpath) == 0)
verify(nvlist_lookup_string(nv,
ZPOOL_CONFIG_PATH,
&path) == 0);
free(newpath);
}
}
if (newdevid)
devid_str_free(newdevid);
}
Always preserve the passed path at creation time so udev may be used After spending considerable time thinking about this I've come to the conclusion that on Linux systems we don't need Solaris style devid support. Instead was can simply use udev if we are careful, there are even some advantages. The Solaris style devid's are designed to provide a mechanism by which a device can be opened reliably regardless of it's location in the system. This is exactly what udev provides us on Linux, a flexible mechanism for consistently identifing the same devices regardless of probing order. We just need to be careful to always open the device by the path provided at creation time, this path must be stored in ZPOOL_CONFIG_PATH. This in fact has certain advantages. For example, if in your system you always want the zpool to be able to locate the disk regardless of physical location you can create the pool using /dev/disk/by-id/. This is perhaps what you'ld want on a desktop system where the exact location is not that important. It's more critical that all the disks can be found. However, in an enterprise setup there's a good chace that the physical location of each drive is important. You have like set things up such that your raid groups span multiple hosts adapters, such that you can lose an adapter without downtime. In this case you would want to use the /dev/disk/by-path/ path to ensure the path information is preserved and you always open the disks at the right physical locations. This would ensure your system never gets accidently misconfigured and still just works because the zpool was still able to locate the disk. Finally, if you want to get really fancy you can always create your own udev rules. This way you could implement whatever lookup sceme you wanted in user space for your drives. This would include nice cosmetic things like being able to control the device names in tools like zpool status, since the name as just based of the device names. I've yet to come up with a good reason to implement devid support on Linux since we have udev. But I've still just commented it out for now because somebody might come up with a really good I forgot.
2009-10-19 20:46:48 +00:00
/*
* For a block device only use the name.
*/
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
if (strcmp(type, VDEV_TYPE_DISK) == 0) {
path = strrchr(path, '/');
path++;
}
2008-11-20 20:01:55 +00:00
2008-12-05 19:25:15 +00:00
#if defined(__sun__) || defined(__sun)
/*
* The following code strips the slice from the device path.
*/
2008-11-20 20:01:55 +00:00
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
&value) == 0 && value) {
int pathlen = strlen(path);
2008-11-20 20:01:55 +00:00
char *tmp = zfs_strdup(hdl, path);
/*
* If it starts with c#, and ends with "s0", chop
* the "s0" off, or if it ends with "s0/old", remove
* the "s0" from the middle.
*/
if (CTD_CHECK(tmp)) {
if (strcmp(&tmp[pathlen - 2], "s0") == 0) {
tmp[pathlen - 2] = '\0';
} else if (pathlen > 6 &&
strcmp(&tmp[pathlen - 6], "s0/old") == 0) {
(void) strcpy(&tmp[pathlen - 6],
"/old");
}
}
2008-11-20 20:01:55 +00:00
return (tmp);
}
2008-12-05 19:25:15 +00:00
#endif
2008-11-20 20:01:55 +00:00
} else {
verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
/*
* If it's a raidz device, we need to stick in the parity level.
*/
if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&value) == 0);
(void) snprintf(buf, sizeof (buf), "%s%llu", path,
(u_longlong_t)value);
path = buf;
}
/*
* We identify each top-level vdev by using a <type-id>
* naming convention.
*/
if (verbose) {
uint64_t id;
verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&id) == 0);
(void) snprintf(buf, sizeof (buf), "%s-%llu", path,
(u_longlong_t)id);
path = buf;
}
2008-11-20 20:01:55 +00:00
}
return (zfs_strdup(hdl, path));
}
static int
zbookmark_compare(const void *a, const void *b)
{
return (memcmp(a, b, sizeof (zbookmark_t)));
}
/*
* Retrieve the persistent error log, uniquify the members, and return to the
* caller.
*/
int
zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
uint64_t count;
zbookmark_t *zb = NULL;
int i;
/*
* Retrieve the raw error list from the kernel. If the number of errors
* has increased, allocate more space and continue until we get the
* entire list.
*/
verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
&count) == 0);
if (count == 0)
return (0);
if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
count * sizeof (zbookmark_t))) == (uintptr_t)NULL)
return (-1);
zc.zc_nvlist_dst_size = count;
(void) strcpy(zc.zc_name, zhp->zpool_name);
for (;;) {
if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG,
&zc) != 0) {
free((void *)(uintptr_t)zc.zc_nvlist_dst);
if (errno == ENOMEM) {
count = zc.zc_nvlist_dst_size;
if ((zc.zc_nvlist_dst = (uintptr_t)
zfs_alloc(zhp->zpool_hdl, count *
sizeof (zbookmark_t))) == (uintptr_t)NULL)
return (-1);
} else {
return (-1);
}
} else {
break;
}
}
/*
* Sort the resulting bookmarks. This is a little confusing due to the
* implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
* to first, and 'zc_nvlist_dst_size' indicates the number of boomarks
* _not_ copied as part of the process. So we point the start of our
* array appropriate and decrement the total number of elements.
*/
zb = ((zbookmark_t *)(uintptr_t)zc.zc_nvlist_dst) +
zc.zc_nvlist_dst_size;
count -= zc.zc_nvlist_dst_size;
qsort(zb, count, sizeof (zbookmark_t), zbookmark_compare);
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
/*
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
*/
for (i = 0; i < count; i++) {
nvlist_t *nv;
/* ignoring zb_blkid and zb_level for now */
if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset &&
zb[i-1].zb_object == zb[i].zb_object)
continue;
if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0)
goto nomem;
if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET,
zb[i].zb_objset) != 0) {
nvlist_free(nv);
goto nomem;
}
if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT,
zb[i].zb_object) != 0) {
nvlist_free(nv);
goto nomem;
}
if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) {
nvlist_free(nv);
goto nomem;
}
nvlist_free(nv);
}
free((void *)(uintptr_t)zc.zc_nvlist_dst);
return (0);
nomem:
free((void *)(uintptr_t)zc.zc_nvlist_dst);
return (no_memory(zhp->zpool_hdl));
}
/*
* Upgrade a ZFS pool to the latest on-disk version.
*/
int
zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strcpy(zc.zc_name, zhp->zpool_name);
zc.zc_cookie = new_version;
if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0)
return (zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"),
zhp->zpool_name));
return (0);
}
void
zpool_set_history_str(const char *subcommand, int argc, char **argv,
char *history_str)
{
int i;
(void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN);
for (i = 1; i < argc; i++) {
if (strlen(history_str) + 1 + strlen(argv[i]) >
HIS_MAX_RECORD_LEN)
break;
(void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN);
(void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN);
}
}
/*
* Stage command history for logging.
*/
int
zpool_stage_history(libzfs_handle_t *hdl, const char *history_str)
{
if (history_str == NULL)
return (EINVAL);
if (strlen(history_str) > HIS_MAX_RECORD_LEN)
return (EINVAL);
if (hdl->libzfs_log_str != NULL)
free(hdl->libzfs_log_str);
if ((hdl->libzfs_log_str = strdup(history_str)) == NULL)
return (no_memory(hdl));
return (0);
}
/*
* Perform ioctl to get some command history of a pool.
*
* 'buf' is the buffer to fill up to 'len' bytes. 'off' is the
* logical offset of the history buffer to start reading from.
*
* Upon return, 'off' is the next logical offset to read from and
* 'len' is the actual amount of bytes read into 'buf'.
*/
static int
get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_history = (uint64_t)(uintptr_t)buf;
zc.zc_history_len = *len;
zc.zc_history_offset = *off;
if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) {
switch (errno) {
case EPERM:
return (zfs_error_fmt(hdl, EZFS_PERM,
dgettext(TEXT_DOMAIN,
"cannot show history for pool '%s'"),
zhp->zpool_name));
case ENOENT:
return (zfs_error_fmt(hdl, EZFS_NOHISTORY,
dgettext(TEXT_DOMAIN, "cannot get history for pool "
"'%s'"), zhp->zpool_name));
case ENOTSUP:
return (zfs_error_fmt(hdl, EZFS_BADVERSION,
dgettext(TEXT_DOMAIN, "cannot get history for pool "
"'%s', pool must be upgraded"), zhp->zpool_name));
default:
return (zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN,
"cannot get history for '%s'"), zhp->zpool_name));
}
}
*len = zc.zc_history_len;
*off = zc.zc_history_offset;
return (0);
}
/*
* Process the buffer of nvlists, unpacking and storing each nvlist record
* into 'records'. 'leftover' is set to the number of bytes that weren't
* processed as there wasn't a complete record.
*/
int
2008-11-20 20:01:55 +00:00
zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
nvlist_t ***records, uint_t *numrecords)
{
uint64_t reclen;
nvlist_t *nv;
int i;
while (bytes_read > sizeof (reclen)) {
/* get length of packed record (stored as little endian) */
for (i = 0, reclen = 0; i < sizeof (reclen); i++)
reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
if (bytes_read < sizeof (reclen) + reclen)
break;
/* unpack record */
if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
return (ENOMEM);
bytes_read -= sizeof (reclen) + reclen;
buf += sizeof (reclen) + reclen;
/* add record to nvlist array */
(*numrecords)++;
if (ISP2(*numrecords + 1)) {
*records = realloc(*records,
*numrecords * 2 * sizeof (nvlist_t *));
}
(*records)[*numrecords - 1] = nv;
}
*leftover = bytes_read;
return (0);
}
#define HIS_BUF_LEN (128*1024)
/*
* Retrieve the command history of a pool.
*/
int
zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp)
{
char buf[HIS_BUF_LEN];
uint64_t off = 0;
nvlist_t **records = NULL;
uint_t numrecords = 0;
int err, i;
do {
uint64_t bytes_read = sizeof (buf);
uint64_t leftover;
if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0)
break;
/* if nothing else was read in, we're at EOF, just return */
if (!bytes_read)
break;
if ((err = zpool_history_unpack(buf, bytes_read,
&leftover, &records, &numrecords)) != 0)
break;
off -= leftover;
/* CONSTCOND */
} while (1);
if (!err) {
verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
records, numrecords) == 0);
}
for (i = 0; i < numrecords; i++)
nvlist_free(records[i]);
free(records);
return (err);
}
Add linux-events topic branch for zevent handling. This topic branch leverages the Solaris style FMA call points in ZFS to create a user space visible event notification system under Linux. This new system is called zevent and it unifies all previous Solaris style ereports and sysevent notifications. Under this Linux specific scheme when a sysevent or ereport event occurs an nvlist describing the event is created which looks almost exactly like a Solaris ereport. These events are queued up in the kernel when they occur and conditionally logged to the console. It is then up to a user space application to consume the events and do whatever it likes with them. To make this possible the existing /dev/zfs ABI has been extended with two new ioctls which behave as follows. * ZFS_IOC_EVENTS_NEXT Get the next pending event. The kernel will keep track of the last event consumed by the file descriptor and provide the next one if available. If no new events are available the ioctl() will block waiting for the next event. This ioctl may also be called in a non-blocking mode by setting zc.zc_guid = ZEVENT_NONBLOCK. In the non-blocking case if no events are available ENOENT will be returned. It is possible that ESHUTDOWN will be returned if the ioctl() is called while module unloading is in progress. And finally ENOMEM may occur if the provided nvlist buffer is not large enough to contain the entire event. * ZFS_IOC_EVENTS_CLEAR Clear are events queued by the kernel. The kernel will keep a fairly large number of recent events queued, use this ioctl to clear the in kernel list. This will effect all user space processes consuming events. The zpool command has been extended to use this events ABI with the 'events' subcommand. You may run 'zpool events -v' to output a verbose log of all recent events. This is very similar to the Solaris 'fmdump -ev' command with the key difference being it also includes what would be considered sysevents under Solaris. You may also run in follow mode with the '-f' option. To clear the in kernel event queue use the '-c' option. $ sudo cmd/zpool/zpool events -fv TIME CLASS May 13 2010 16:31:15.777711000 ereport.fs.zfs.config.sync class = "ereport.fs.zfs.config.sync" ena = 0x40982b7897700001 detector = (embedded nvlist) version = 0x0 scheme = "zfs" pool = 0xed976600de75dfa6 (end detector) time = 0x4bec8bc3 0x2e5aed98 pool = "zpios" pool_guid = 0xed976600de75dfa6 pool_context = 0x0 While the 'zpool events' command is handy for interactive debugging it is not expected to be the primary consumer of zevents. This ABI was primarily added to facilitate the addition of a user space monitoring daemon. This daemon would consume all events posted by the kernel and based on the type of event perform an action. For most events simply forwarding them on to syslog is likely enough. But this interface also cleanly allows for more sophisticated actions to be taken such as generating an email for a failed drive
2010-05-14 19:40:44 +00:00
/*
* Retrieve the next event. If there is a new event available 'nvp' will
* contain a newly allocated nvlist and 'dropped' will be set to the number
* of missed events since the last call to this function. When 'nvp' is
* set to NULL it indicates no new events are available. In either case
* the function returns 0 and it is up to the caller to free 'nvp'. In
* the case of a fatal error the function will return a non-zero value.
* When the function is called in blocking mode it will not return until
* a new event is available.
*/
int
zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, int *dropped, int block)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
int error = 0;
*nvp = NULL;
*dropped = 0;
if (!block)
zc.zc_guid = ZEVENT_NONBLOCK;
if (zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE) != 0)
return (-1);
retry:
if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) {
switch (errno) {
case ESHUTDOWN:
error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
dgettext(TEXT_DOMAIN, "zfs shutdown"));
goto out;
case ENOENT:
/* Blocking error case should not occur */
if (block)
error = zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN, "cannot get event"));
goto out;
case ENOMEM:
if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
error = zfs_error_fmt(hdl, EZFS_NOMEM,
dgettext(TEXT_DOMAIN, "cannot get event"));
goto out;
} else {
goto retry;
}
default:
error = zpool_standard_error_fmt(hdl, errno,
dgettext(TEXT_DOMAIN, "cannot get event"));
goto out;
}
}
error = zcmd_read_dst_nvlist(hdl, &zc, nvp);
if (error != 0)
goto out;
*dropped = (int)zc.zc_cookie;
out:
zcmd_free_nvlists(&zc);
return (error);
}
/*
* Clear all events.
*/
int
zpool_events_clear(libzfs_handle_t *hdl, int *count)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
char msg[1024];
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot clear events"));
if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0)
return (zpool_standard_error_fmt(hdl, errno, msg));
if (count != NULL)
*count = (int)zc.zc_cookie; /* # of events cleared */
return (0);
}
2008-11-20 20:01:55 +00:00
void
zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj,
char *pathname, size_t len)
{
zfs_cmd_t zc = { "\0", "\0", "\0", 0 };
2008-11-20 20:01:55 +00:00
boolean_t mounted = B_FALSE;
char *mntpnt = NULL;
char dsname[MAXNAMELEN];
if (dsobj == 0) {
/* special case for the MOS */
2008-12-04 22:41:11 +00:00
(void) snprintf(pathname, len, "<metadata>:<0x%llx>", (longlong_t)obj);
2008-11-20 20:01:55 +00:00
return;
}
/* get the dataset's name */
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_obj = dsobj;
if (ioctl(zhp->zpool_hdl->libzfs_fd,
ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
/* just write out a path of two object numbers */
(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
2008-12-04 22:41:11 +00:00
(longlong_t)dsobj, (longlong_t)obj);
2008-11-20 20:01:55 +00:00
return;
}
(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
/* find out if the dataset is mounted */
mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt);
/* get the corrupted object's path */
(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
zc.zc_obj = obj;
if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH,
&zc) == 0) {
if (mounted) {
(void) snprintf(pathname, len, "%s%s", mntpnt,
zc.zc_value);
} else {
(void) snprintf(pathname, len, "%s:%s",
dsname, zc.zc_value);
}
} else {
2008-12-04 22:41:11 +00:00
(void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj);
2008-11-20 20:01:55 +00:00
}
free(mntpnt);
}
/*
* Read the EFI label from the config, if a label does not exist then
* pass back the error to the caller. If the caller has passed a non-NULL
* diskaddr argument then we set it to the starting address of the EFI
* partition.
*/
static int
read_efi_label(nvlist_t *config, diskaddr_t *sb)
{
char *path;
int fd;
char diskname[MAXPATHLEN];
int err = -1;
if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
return (err);
(void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT,
strrchr(path, '/'));
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
if ((fd = open(diskname, O_RDWR|O_DIRECT)) >= 0) {
struct dk_gpt *vtoc;
if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) {
if (sb != NULL)
*sb = vtoc->efi_parts[0].p_start;
efi_free(vtoc);
}
(void) close(fd);
}
return (err);
}
2008-11-20 20:01:55 +00:00
/*
* determine where a partition starts on a disk in the current
* configuration
*/
static diskaddr_t
find_start_block(nvlist_t *config)
{
nvlist_t **child;
uint_t c, children;
diskaddr_t sb = MAXOFFSET_T;
uint64_t wholedisk;
if (nvlist_lookup_nvlist_array(config,
ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) {
if (nvlist_lookup_uint64(config,
ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk) != 0 || !wholedisk) {
return (MAXOFFSET_T);
}
if (read_efi_label(config, &sb) < 0)
sb = MAXOFFSET_T;
2008-11-20 20:01:55 +00:00
return (sb);
}
for (c = 0; c < children; c++) {
sb = find_start_block(child[c]);
if (sb != MAXOFFSET_T) {
return (sb);
}
}
return (MAXOFFSET_T);
}
int
zpool_label_disk_wait(char *path, int timeout)
{
struct stat64 statbuf;
int i;
/*
* Wait timeout miliseconds for a newly created device to be available
* from the given path. There is a small window when a /dev/ device
* will exist and the udev link will not, so we must wait for the
* symlink. Depending on the udev rules this may take a few seconds.
*/
for (i = 0; i < timeout; i++) {
usleep(1000);
errno = 0;
if ((stat64(path, &statbuf) == 0) && (errno == 0))
return (0);
}
return (ENOENT);
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
}
int
zpool_label_disk_check(char *path)
{
struct dk_gpt *vtoc;
int fd, err;
if ((fd = open(path, O_RDWR|O_DIRECT)) < 0)
return errno;
if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) {
(void) close(fd);
return err;
}
if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
efi_free(vtoc);
(void) close(fd);
return EIDRM;
}
efi_free(vtoc);
(void) close(fd);
return 0;
}
2008-11-20 20:01:55 +00:00
/*
* Label an individual disk. The name provided is the short name,
* stripped of any leading /dev path.
*/
int
zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
{
char path[MAXPATHLEN];
struct dk_gpt *vtoc;
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
int rval, fd;
2008-11-20 20:01:55 +00:00
size_t resv = EFI_MIN_RESV_SIZE;
uint64_t slice_size;
diskaddr_t start_block;
char errbuf[1024];
2008-11-20 20:01:55 +00:00
/* prepare an error message just in case */
(void) snprintf(errbuf, sizeof (errbuf),
dgettext(TEXT_DOMAIN, "cannot label '%s'"), name);
if (zhp) {
nvlist_t *nvroot;
if (pool_is_bootable(zhp)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"EFI labeled devices are not supported on root "
"pools."));
return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf));
}
2008-11-20 20:01:55 +00:00
verify(nvlist_lookup_nvlist(zhp->zpool_config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
if (zhp->zpool_start_block == 0)
start_block = find_start_block(nvroot);
else
start_block = zhp->zpool_start_block;
zhp->zpool_start_block = start_block;
} else {
/* new pool */
start_block = NEW_START_BLOCK;
}
(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
BACKUP_SLICE);
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
2008-11-20 20:01:55 +00:00
/*
* This shouldn't happen. We've long since verified that this
* is a valid device.
*/
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"unable to open device '%s': %d"), path, errno);
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
}
if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) {
/*
* The only way this can fail is if we run out of memory, or we
* were unable to read the disk's capacity
*/
if (errno == ENOMEM)
(void) no_memory(hdl);
(void) close(fd);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"unable to read disk capacity"), name);
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
}
slice_size = vtoc->efi_last_u_lba + 1;
slice_size -= EFI_MIN_RESV_SIZE;
if (start_block == MAXOFFSET_T)
start_block = NEW_START_BLOCK;
slice_size -= start_block;
vtoc->efi_parts[0].p_start = start_block;
vtoc->efi_parts[0].p_size = slice_size;
/*
* Why we use V_USR: V_BACKUP confuses users, and is considered
* disposable by some EFI utilities (since EFI doesn't have a backup
* slice). V_UNASSIGNED is supposed to be used only for zero size
* partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT,
* etc. were all pretty specific. V_USR is as close to reality as we
* can get, in the absence of V_OTHER.
*/
vtoc->efi_parts[0].p_tag = V_USR;
(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
vtoc->efi_parts[8].p_start = slice_size + start_block;
vtoc->efi_parts[8].p_size = resv;
vtoc->efi_parts[8].p_tag = V_RESERVED;
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
if ((rval = efi_write(fd, vtoc)) != 0) {
2008-11-20 20:01:55 +00:00
/*
* Some block drivers (like pcata) may not support EFI
* GPT labels. Print out a helpful error message dir-
* ecting the user to manually label the disk and give
* a specific slice.
*/
(void) close(fd);
efi_free(vtoc);
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using "
"parted(8) and then provide a specific slice: %d"), rval);
2008-11-20 20:01:55 +00:00
return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
}
(void) close(fd);
efi_free(vtoc);
Changes required to integrate libefi in to Linux. The major change here is to fix up libefi to be linux aware. For the most part this wasn't too hard but there were a few major issues. First off I needed to handle the DKIOCGMEDIAINFO and DKIOCINFO ioctls. There is no direct equivilant for these ioctls under linux. To handle this I added wrapper functions which under Solaris simple call the ioctls. But under Linux dig around the system a little bit getting the needed info to fill in the requested structures. Secondly the efi_ioctl() call was adapted such that under linux it directly read or writes out the partition table. Under Solaris this work was handed off to the kernel via an ioctl. In the efi_write() case we also ensure we prompt the kernel via BLKRRPART to re-scan the new partition table. The libefi generated partition tables are correct but older versions of ~parted-1.8.1 can not read them without a small patch. The kernel and fdisk are able to read them just fine. Thirdly efi_alloc_and_init() which is used by zpool to determine if a device is a 'wholedisk' was updated to be linux aware. This check is performed by using the partition number for the device, which the partition number is 0 on linux it is a 'wholedisk'. However, certain device type such as the loopback and ram disks needed to be excluded because they do not support partitioning. Forthly the zpool command was made symlink aware so it can correctly resolve udev entries such as /dev/disk/by-*/*. This symlinks are fully expanded ensuring all block devices are recognized. When a when a 'wholedisk' block device is detected we now properly write out an efi label and place zfs in the first partition (0th slice). This partition is created 1MiB in to the disk to ensure it is aligned nicely with all high end block devices I'm aware of. This all works for me now but it did take quite a bit of work to get it all sorted out. It would not surprise me if certain special cases were missed so we should keep any eye of for any odd behavior.
2009-10-14 23:07:48 +00:00
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
/* Wait for the first expected slice to appear. */
(void) snprintf(path, sizeof (path), "%s/%s%s%s", DISK_ROOT, name,
isdigit(name[strlen(name)-1]) ? "p" : "", FIRST_SLICE);
Misc fixed based on testing with the dragon config. In check_disk() we should only check the entire device if it not a whole disk. It is a whole disk with an EFI label on it, it is possible that libblkid will misidentify the device as a filesystem. I had a case yesterday where 2 bytes in the EFI GUID happened we set to the right values such that libblkid decided there was a minux filesystem there. If it's a whole device we look for a EFI label. If we are able to read the backup EFI label from a device but the primary is corrupt. Then don't bother trying to stat the partitions in /dev/ the kernel will not create devices using the backup label when the primary is damaged. Add code to determine if we have a udev path instead of a normal device path. In this case use the -part# partition naming scheme instead of the /dev/disk# scheme. This is important because we always want to access devices using the full path provided at configuration time. Readded support for zpool_relabel_disk() now that we have the full libefi library in place we do have access to this functionality. Lots of additional paranoia to ensure EFI label are written correctly. These changes include: 1) Removing the O_NDELAY flag when opening a file descriptor for libefi. This flag should really only be used when you do not intend to do any file IO. Under Solaris only ioctl()'s were performed under linux we do perform reads and writes. 2) Use O_DIRECT to ensure any caching is bypassed while writing or reading the EFI labels. This change forces the use of sector aligned memory buffers which are allocated using posix_memalign(). 3) Add additional efi_debug error messages to efi_ioctl(). 4) While doing a fsync is good to ensure the EFI label is on disk we can, and should go one step futher by issuing the BLKFLSBUF ioctl(). This signals the kernel to instruct the drive to flush it's on-disk cache. 5) Because of some initial strangeness I observed in testing with some flakey drives be extra paranoid in zpool_label_disk(). After we've written the device without error, flushed the drive caches, correctly detected the new partitions created by the kernel. Then additionally read back the EFI label from user space to make sure it is intact and correct. I don't think we can ever be to careful here. NOTE: The was recently some concern expressed that writing EFI labels from user space on Linux was not the right way to do this. That instead two kernel ioctl()s should be used to create and remove partitions. After some investigation it's clear to me using those ioctl() would be a bad idea. The in fact don't actually write partition tables to the disk, they only create the partition devices in the kernel. So what you really want to do is write the label out from user space, then prompt the kernel to re-read the partition from disk to create the partitions. This is in fact exactly what newer version of parted do.
2009-10-23 18:57:59 +00:00
rval = zpool_label_disk_wait(path, 3000);
if (rval) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
"detect device partitions on '%s': %d"), path, rval);
return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
}
/* We can't be to paranoid. Read the label back and verify it. */
(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
rval = zpool_label_disk_check(path);
if (rval) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written "
"EFI label on '%s' is damaged. Ensure\nthis device "
"is not in in use, and is functioning properly: %d"),
path, rval);
return (zfs_error(hdl, EZFS_LABELFAILED, errbuf));
}
return 0;
2008-11-20 20:01:55 +00:00
}