Special failsafe feature

Special failsafe is a feature that allows your special allocation
class vdevs ('special' and 'dedup') to fail without losing any data.  It
works by automatically backing up all special data to the pool.  This
has the added benefit that you can safely create pools with non-matching
alloc class redundancy (like a mirrored pool with a single special
device).

This behavior is controlled via two properties:

1. feature@special_failsafe - This feature flag enables the special
   failsafe subsystem.  It prevents the backed-up pool from being
   imported read/write on an older version of ZFS that does not
   support special failsafe.

2. special_failsafe - This pool property is the main on/off switch
   to control special failsafe.  If you want to use special failsafe
   simply turn it on either at creation time or with `zpool set` prior
   to adding a special alloc class device.  After special devices have
   been added, you can either leave the property on or turn it
   off, but once it's off you can't turn it back on again.

Note that special failsafe may create a performance penalty over pure
alloc class writes due to the extra backup copy write to the pool.
Alloc class reads should not be affected as they always read from DVA 0
first (the copy of the data on the special device).  It can also inflate
disk usage on dRAID pools.

Closes: #15118

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
This commit is contained in:
Tony Hutter 2023-12-27 16:46:07 -08:00
parent 20c8bdd85e
commit 69e9faf47a
52 changed files with 2225 additions and 272 deletions

View File

@ -1164,6 +1164,23 @@ zpool_do_add(int argc, char **argv)
} }
} }
/*
* Special case:
*
* We need to know the special_failsafe pool property value to determine
* if the new vdev configuration has the correct redundancy requirements
* for special and dedup vdevs.
*
* Pass in the current value for special_failsafe to the proplist.
*/
char strval[ZFS_MAXPROPLEN];
if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
verify(add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
&props, B_TRUE) == 0);
}
/* pass off to make_root_vdev for processing */ /* pass off to make_root_vdev for processing */
nvroot = make_root_vdev(zhp, props, !check_inuse, nvroot = make_root_vdev(zhp, props, !check_inuse,
check_replication, B_FALSE, dryrun, argc, argv); check_replication, B_FALSE, dryrun, argc, argv);
@ -6940,6 +6957,23 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
} }
} }
/*
* Special case:
*
* We need to know the special_failsafe pool property value to determine
* if the new vdev configuration has the correct redundancy requirements
* for special and dedup vdevs.
*
* Pass in the current value for special_failsafe to the proplist.
*/
char strval[ZFS_MAXPROPLEN];
if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval,
ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) {
verify(add_prop_list(
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval,
&props, B_TRUE) == 0);
}
nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
argc, argv); argc, argv);
if (nvroot == NULL) { if (nvroot == NULL) {

View File

@ -85,6 +85,7 @@
*/ */
boolean_t error_seen; boolean_t error_seen;
boolean_t is_force; boolean_t is_force;
boolean_t is_alloc_class;
void void
vdev_error(const char *fmt, ...) vdev_error(const char *fmt, ...)
@ -94,8 +95,15 @@ vdev_error(const char *fmt, ...)
if (!error_seen) { if (!error_seen) {
(void) fprintf(stderr, gettext("invalid vdev specification\n")); (void) fprintf(stderr, gettext("invalid vdev specification\n"));
if (!is_force) if (!is_force)
(void) fprintf(stderr, gettext("use '-f' to override " if (is_alloc_class) {
"the following errors:\n")); (void) fprintf(stderr, gettext("Turn on the "
"special_failsafe pool property or use '-f'"
" to override the following errors:\n"));
is_alloc_class = B_FALSE;
} else {
(void) fprintf(stderr, gettext("use '-f' to "
"override the following errors:\n"));
}
else else
(void) fprintf(stderr, gettext("the following errors " (void) fprintf(stderr, gettext("the following errors "
"must be manually repaired:\n")); "must be manually repaired:\n"));
@ -442,6 +450,7 @@ typedef struct replication_level {
const char *zprl_type; const char *zprl_type;
uint64_t zprl_children; uint64_t zprl_children;
uint64_t zprl_parity; uint64_t zprl_parity;
boolean_t zprl_is_alloc_class;
} replication_level_t; } replication_level_t;
#define ZPOOL_FUZZ (16 * 1024 * 1024) #define ZPOOL_FUZZ (16 * 1024 * 1024)
@ -480,13 +489,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b)
return (B_FALSE); return (B_FALSE);
} }
/*
* Return true if 'props' contains:
*
* special_failsafe=on
*
* ... and feature@special_failsafe is NOT disabled.
*/
static boolean_t
is_special_failsafe_enabled_in_props(nvlist_t *props)
{
const char *str = NULL;
/*
 * If the feature flag was explicitly passed as disabled, special
 * failsafe cannot be in effect regardless of the pool property.
 */
if (nvlist_lookup_string(props, "feature@special_failsafe",
&str) == 0) {
if ((str != NULL) && strcmp(str, "disabled") == 0) {
return (B_FALSE);
}
}
/* Feature not disabled - check the special_failsafe pool property. */
if (nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE),
&str) == 0) {
if ((str != NULL) && strcmp(str, "on") == 0) {
return (B_TRUE); /* It is enabled */
}
}
/* Property absent or not "on" - treat as disabled. */
return (B_FALSE);
}
/* /*
* Given a list of toplevel vdevs, return the current replication level. If * Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then * the config is inconsistent, then NULL is returned. If 'fatal' is set, then
* an error message will be displayed for each self-inconsistent vdev. * an error message will be displayed for each self-inconsistent vdev.
*/ */
static replication_level_t * static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal) get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal)
{ {
nvlist_t **top; nvlist_t **top;
uint_t t, toplevels; uint_t t, toplevels;
@ -495,7 +534,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
nvlist_t *nv; nvlist_t *nv;
const char *type; const char *type;
replication_level_t lastrep = {0}; replication_level_t lastrep = {0};
replication_level_t rep; replication_level_t rep = {0};
replication_level_t *ret; replication_level_t *ret;
replication_level_t *raidz, *mirror; replication_level_t *raidz, *mirror;
boolean_t dontreport; boolean_t dontreport;
@ -507,6 +546,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
for (t = 0; t < toplevels; t++) { for (t = 0; t < toplevels; t++) {
uint64_t is_log = B_FALSE; uint64_t is_log = B_FALSE;
const char *str = NULL;
nv = top[t]; nv = top[t];
@ -528,12 +568,32 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
strcmp(type, VDEV_TYPE_INDIRECT) == 0) strcmp(type, VDEV_TYPE_INDIRECT) == 0)
continue; continue;
rep.zprl_type = type;
/*
* If special_failsafe=on then we know the special allocation
* class devices have at least one copy of their data on the
* pool so we can ignore their replication level.
*/
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&str);
if (str &&
((strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0) ||
(strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0))) {
rep.zprl_is_alloc_class = B_TRUE;
is_alloc_class = B_TRUE;
if (is_special_failsafe_enabled_in_props(props)) {
continue; /* We're backed up, skip redundancy */
}
} else {
is_alloc_class = B_FALSE;
}
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0) { &child, &children) != 0) {
/* /*
* This is a 'file' or 'disk' vdev. * This is a 'file' or 'disk' vdev.
*/ */
rep.zprl_type = type;
rep.zprl_children = 1; rep.zprl_children = 1;
rep.zprl_parity = 0; rep.zprl_parity = 0;
} else { } else {
@ -548,7 +608,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* We also check that the size of each vdev (if it can * We also check that the size of each vdev (if it can
* be determined) is the same. * be determined) is the same.
*/ */
rep.zprl_type = type;
rep.zprl_children = 0; rep.zprl_children = 0;
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
@ -808,7 +867,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
* report any difference between the two. * report any difference between the two.
*/ */
static int static int
check_replication(nvlist_t *config, nvlist_t *newroot) check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot)
{ {
nvlist_t **child; nvlist_t **child;
uint_t children; uint_t children;
@ -825,7 +884,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0); &nvroot) == 0);
if ((current = get_replication(nvroot, B_FALSE)) == NULL) if ((current = get_replication(props, nvroot, B_FALSE)) == NULL)
return (0); return (0);
} }
/* /*
@ -850,17 +909,31 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
* Get the replication level of the new vdev spec, reporting any * Get the replication level of the new vdev spec, reporting any
* inconsistencies found. * inconsistencies found.
*/ */
if ((new = get_replication(newroot, B_TRUE)) == NULL) { if ((new = get_replication(props, newroot, B_TRUE)) == NULL) {
free(current); free(current);
return (-1); return (-1);
} }
/* /*
* Check to see if the new vdev spec matches the replication level of * Check to see if the new vdev spec matches the replication level of
* the current pool. * the current pool.
*/ */
ret = 0; ret = 0;
if (current != NULL) { if (current != NULL) {
if (current->zprl_is_alloc_class || new->zprl_is_alloc_class)
is_alloc_class = B_TRUE;
else
is_alloc_class = B_FALSE;
/*
* Special case:
* If there were any redundancy problems with alloc class vdevs
* BUT the pool had special_failsafe on, then we're fine since
* all the alloc class data has a copy in the main pool.
*/
if (is_special_failsafe_enabled_in_props(props) &&
is_alloc_class)
goto out;
if (is_raidz_mirror(current, new, &raidz, &mirror) || if (is_raidz_mirror(current, new, &raidz, &mirror) ||
is_raidz_mirror(new, current, &raidz, &mirror)) { is_raidz_mirror(new, current, &raidz, &mirror)) {
if (raidz->zprl_parity != mirror->zprl_children - 1) { if (raidz->zprl_parity != mirror->zprl_children - 1) {
@ -899,7 +972,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot)
ret = -1; ret = -1;
} }
} }
out:
free(new); free(new);
if (current != NULL) if (current != NULL)
free(current); free(current);
@ -1888,7 +1961,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
* found. We include the existing pool spec, if any, as we need to * found. We include the existing pool spec, if any, as we need to
* catch changes against the existing replication level. * catch changes against the existing replication level.
*/ */
if (check_rep && check_replication(poolconfig, newroot) != 0) { if (check_rep && check_replication(props, poolconfig, newroot) != 0) {
nvlist_free(newroot); nvlist_free(newroot);
return (NULL); return (NULL);
} }

View File

@ -258,6 +258,7 @@ typedef enum {
ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_BCLONERATIO,
ZPOOL_PROP_SPECIAL_FAILSAFE,
ZPOOL_NUM_PROPS ZPOOL_NUM_PROPS
} zpool_prop_t; } zpool_prop_t;
@ -1610,6 +1611,7 @@ typedef enum {
ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_CRYPTO_NOTSUP,
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH, ZFS_ERR_ASHIFT_MISMATCH,
ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE,
} zfs_errno_t; } zfs_errno_t;
/* /*

View File

@ -1117,7 +1117,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern uint64_t spa_get_last_removal_txg(spa_t *spa);
extern boolean_t spa_trust_config(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing,
uint64_t missing_special);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern uint64_t spa_total_metaslabs(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa);

View File

@ -336,6 +336,13 @@ struct spa {
uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds; /* unopenable tvds on load */
uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */
/*
* Number of 'spa_missing_tvds' that are alloc class devices
* in the pool that has special_failsafe on, and are thus recoverable
* from errors.
*/
uint64_t spa_missing_recovered_tvds;
uint64_t spa_nonallocating_dspace; uint64_t spa_nonallocating_dspace;
spa_removing_phys_t spa_removing_phys; spa_removing_phys_t spa_removing_phys;
spa_vdev_removal_t *spa_vdev_removal; spa_vdev_removal_t *spa_vdev_removal;
@ -474,6 +481,9 @@ struct spa {
*/ */
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */ zfs_refcount_t spa_refcount; /* number of opens */
/* Backup special/dedup devices data to the pool */
boolean_t spa_special_failsafe;
}; };
extern char *spa_config_path; extern char *spa_config_path;

View File

@ -640,6 +640,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
void vdev_metaslab_group_create(vdev_t *vd); void vdev_metaslab_group_create(vdev_t *vd);
uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
extern boolean_t vdev_is_leaf(vdev_t *vd);
extern boolean_t vdev_is_special(vdev_t *vd);
extern boolean_t vdev_is_dedup(vdev_t *vd);
extern boolean_t vdev_is_alloc_class(vdev_t *vd);
extern boolean_t vdev_is_special_failsafe(vdev_t *vd);
/* /*
* Vdev ashift optimization tunables * Vdev ashift optimization tunables

View File

@ -82,6 +82,7 @@ typedef enum spa_feature {
SPA_FEATURE_AVZ_V2, SPA_FEATURE_AVZ_V2,
SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_REDACTION_LIST_SPILL,
SPA_FEATURE_RAIDZ_EXPANSION, SPA_FEATURE_RAIDZ_EXPANSION,
SPA_FEATURE_SPECIAL_FAILSAFE,
SPA_FEATURES SPA_FEATURES
} spa_feature_t; } spa_feature_t;

View File

@ -607,7 +607,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='spa_feature_table' size='2296' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='spa_feature_table' size='2352' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@ -2921,7 +2921,8 @@
<enumerator name='ZPOOL_PROP_BCLONEUSED' value='33'/> <enumerator name='ZPOOL_PROP_BCLONEUSED' value='33'/>
<enumerator name='ZPOOL_PROP_BCLONESAVED' value='34'/> <enumerator name='ZPOOL_PROP_BCLONESAVED' value='34'/>
<enumerator name='ZPOOL_PROP_BCLONERATIO' value='35'/> <enumerator name='ZPOOL_PROP_BCLONERATIO' value='35'/>
<enumerator name='ZPOOL_NUM_PROPS' value='36'/> <enumerator name='ZPOOL_PROP_SPECIAL_FAILSAFE' value='36'/>
<enumerator name='ZPOOL_NUM_PROPS' value='37'/>
</enum-decl> </enum-decl>
<typedef-decl name='zpool_prop_t' type-id='af1ba157' id='5d0c23fb'/> <typedef-decl name='zpool_prop_t' type-id='af1ba157' id='5d0c23fb'/>
<typedef-decl name='regoff_t' type-id='95e97e5e' id='54a2a2a8'/> <typedef-decl name='regoff_t' type-id='95e97e5e' id='54a2a2a8'/>
@ -5963,7 +5964,8 @@
<enumerator name='SPA_FEATURE_AVZ_V2' value='38'/> <enumerator name='SPA_FEATURE_AVZ_V2' value='38'/>
<enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/> <enumerator name='SPA_FEATURE_REDACTION_LIST_SPILL' value='39'/>
<enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/> <enumerator name='SPA_FEATURE_RAIDZ_EXPANSION' value='40'/>
<enumerator name='SPA_FEATURES' value='41'/> <enumerator name='SPA_FEATURE_SPECIAL_FAILSAFE' value='41'/>
<enumerator name='SPA_FEATURES' value='42'/>
</enum-decl> </enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/> <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<qualified-type-def type-id='22cce67b' const='yes' id='d2816df0'/> <qualified-type-def type-id='22cce67b' const='yes' id='d2816df0'/>
@ -9025,8 +9027,8 @@
</function-decl> </function-decl>
</abi-instr> </abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'> <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18368' id='b93e4d14'> <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='18816' id='b937914f'>
<subrange length='41' type-id='7359adad' id='cb834f44'/> <subrange length='42' type-id='7359adad' id='cb7c937f'/>
</array-type-def> </array-type-def>
<enum-decl name='zfeature_flags' id='6db816a4'> <enum-decl name='zfeature_flags' id='6db816a4'>
<underlying-type type-id='9cac1fee'/> <underlying-type type-id='9cac1fee'/>
@ -9103,7 +9105,7 @@
<pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/> <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/>
<qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/> <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/>
<pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/> <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/>
<var-decl name='spa_feature_table' type-id='b93e4d14' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/> <var-decl name='spa_feature_table' type-id='b937914f' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/>
<var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/> <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/>
<function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'> <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/> <parameter type-id='80f4b756'/>

View File

@ -774,6 +774,15 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_ASHIFT_MISMATCH: case ZFS_ERR_ASHIFT_MISMATCH:
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
break; break;
case ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Cannot set pool prop special_failsafe=on since "
"feature@special_failsafe is not set to 'enabled'.\n"
"This could be because the special_failsafe pool prop was "
"manually turned off while the special_failsafe feature "
"flag was active, or the feature flag was disabled."));
zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap);
break;
default: default:
zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_error_aux(hdl, "%s", zfs_strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);

View File

@ -1924,7 +1924,7 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,
/* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */ /* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */
static boolean_t static boolean_t
vdev_is_leaf(nvlist_t *nv) vdev_is_leaf_nv(nvlist_t *nv)
{ {
uint_t children = 0; uint_t children = 0;
nvlist_t **child; nvlist_t **child;
@ -1937,10 +1937,10 @@ vdev_is_leaf(nvlist_t *nv)
/* Return if a vdev is a leaf vdev and a real device (disk or file) */ /* Return if a vdev is a leaf vdev and a real device (disk or file) */
static boolean_t static boolean_t
vdev_is_real_leaf(nvlist_t *nv) vdev_is_real_leaf_nv(nvlist_t *nv)
{ {
const char *type = NULL; const char *type = NULL;
if (!vdev_is_leaf(nv)) if (!vdev_is_leaf_nv(nv))
return (B_FALSE); return (B_FALSE);
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type); (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);
@ -1973,7 +1973,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
/* The very first entry in the NV list is a special case */ /* The very first entry in the NV list is a special case */
if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) { if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {
if (real_leaves_only && !vdev_is_real_leaf(nv)) if (real_leaves_only && !vdev_is_real_leaf_nv(nv))
return (0); return (0);
*((nvlist_t **)last_nv) = nv; *((nvlist_t **)last_nv) = nv;
@ -1996,7 +1996,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
* we want. * we want.
*/ */
if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) { if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {
if (real_leaves_only && !vdev_is_real_leaf(nv)) if (real_leaves_only && !vdev_is_real_leaf_nv(nv))
return (0); return (0);
*((nvlist_t **)last_nv) = nv; *((nvlist_t **)last_nv) = nv;

View File

@ -322,6 +322,40 @@ With device removal, it can be returned to the
.Sy enabled .Sy enabled
state if all the dedicated allocation class vdevs are removed. state if all the dedicated allocation class vdevs are removed.
. .
.feature org.zfsonlinux special_failsafe yes allocation_classes
This feature allows the
.Sy special_failsafe
pool property to be used.
When the
.Sy special_failsafe
pool property is set to "on" all subsequent writes to allocation class vdevs
(like special and dedup vdevs) will also generate an additional copy of the data
to be written to the pool.
This allows alloc class vdev data to be "backed up" to the pool.
A fully backed-up allocation class vdev can fail without causing the pool to be
suspended, even if the alloc class device is not redundant.
.Pp
It is important to note the difference between the
.Sy special_failsafe
feature flag and a
.Sy special_failsafe
pool property since they appear similar.
The
.Sy special_failsafe
feature flag is a safeguard to prevent a pool that is using special_failsafe
from being imported read/write on an older version of ZFS that does not support
special_failsafe (and possibly compromising the integrity of the backup
guarantees).
The pool property is what actually allows you to turn on/off the backup copy
writes.
The
.Sy special_failsafe
feature will switch from "enabled" to "active" when allocation class devices
are added.
See the
.Sy special_failsafe
pool property for more details.
.
.feature com.delphix async_destroy yes .feature com.delphix async_destroy yes
Destroying a file system requires traversing all of its data in order to Destroying a file system requires traversing all of its data in order to
return its used space to the pool. return its used space to the pool.

View File

@ -181,14 +181,18 @@ section.
.It Sy dedup .It Sy dedup
A device solely dedicated for deduplication tables. A device solely dedicated for deduplication tables.
The redundancy of this device should match the redundancy of the other normal The redundancy of this device should match the redundancy of the other normal
devices in the pool. devices in the pool except if the
.Sy special_failsafe
pool property is enabled.
If more than one dedup device is specified, then If more than one dedup device is specified, then
allocations are load-balanced between those devices. allocations are load-balanced between those devices.
.It Sy special .It Sy special
A device dedicated solely for allocating various kinds of internal metadata, A device dedicated solely for allocating various kinds of internal metadata,
and optionally small file blocks. and optionally small file blocks.
The redundancy of this device should match the redundancy of the other normal The redundancy of this device should match the redundancy of the other normal
devices in the pool. devices in the pool except if the
.Sy special_failsafe
pool property is enabled.
If more than one special device is specified, then If more than one special device is specified, then
allocations are load-balanced between those devices. allocations are load-balanced between those devices.
.Pp .Pp

View File

@ -437,6 +437,34 @@ command, though this property can be used when a specific version is needed for
backwards compatibility. backwards compatibility.
Once feature flags are enabled on a pool this property will no longer have a Once feature flags are enabled on a pool this property will no longer have a
value. value.
.It Sy special_failsafe Ns = Ns Sy on Ns | Ns Sy off
Controls the special failsafe subsystem for special allocation
class vdevs.
When it's turned on, all writes to special allocation class vdevs
(like 'special' and 'dedup' vdevs) will also write an additional copy of the
data to the main pool.
This allows alloc class vdev data to be "backed up" to the pool.
When
.Sy special_failsafe
is turned on, alloc class vdevs can fail regardless of their redundancy level
without the pool losing data.
To use
.Sy special_failsafe
simply turn it on at zpool create time, or turn it on prior to adding
alloc class devices.
It's important to note that after alloc class vdevs are added to the pool with
.Sy special_failsafe
on, you can still turn
.Sy special_failsafe
off again, but once it's off you can't turn it back on.
.Sy special_failsafe
can be freely toggled on/off if alloc class devices haven't been added to the
pool, since the pool prop would have no effect.
The
.Sy feature@special_failsafe
feature flag must be enabled in order to use the
.Sy special_failsafe
pool property.
.El .El
. .
.Ss User Properties .Ss User Properties

View File

@ -753,6 +753,18 @@ zpool_feature_init(void)
"org.openzfs:raidz_expansion", "raidz_expansion", "org.openzfs:raidz_expansion", "raidz_expansion",
"Support for raidz expansion", "Support for raidz expansion",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
{
static const spa_feature_t special_failsafe_deps[] = {
SPA_FEATURE_ALLOCATION_CLASSES,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_SPECIAL_FAILSAFE,
"org.openzfs:special_failsafe", "special_failsafe",
"Save a copy of allocation class device data to main pool",
ZFEATURE_FLAG_MOS,
ZFEATURE_TYPE_BOOLEAN, special_failsafe_deps,
sfeatures);
}
zfs_mod_list_supported_free(sfeatures); zfs_mod_list_supported_free(sfeatures);
} }

View File

@ -153,6 +153,10 @@ zpool_prop_init(void)
zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0,
PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST",
boolean_table, sfeatures); boolean_table, sfeatures);
zprop_register_index(ZPOOL_PROP_SPECIAL_FAILSAFE,
"special_failsafe", 0, PROP_DEFAULT, ZFS_TYPE_POOL,
"on | off", "SPECIAL_FAILSAFE", boolean_table,
sfeatures);
/* default index properties */ /* default index properties */
zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",

View File

@ -5848,10 +5848,22 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
dva_t *dva = bp->blk_dva; dva_t *dva = bp->blk_dva;
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0; int error = 0;
boolean_t is_special_failsafe = B_FALSE;
if ((spa->spa_special_failsafe && ((mc == spa_special_class(spa)) ||
(mc == spa_dedup_class(spa))))) {
is_special_failsafe = B_TRUE;
}
ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
/*
 * Earlier layers of the code should set ndvas > 1 if the
 * alloc class vdev is being backed up.
 */
ASSERT(!(is_special_failsafe && ndvas == 1));
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
if (mc->mc_allocator[allocator].mca_rotor == NULL) { if (mc->mc_allocator[allocator].mca_rotor == NULL) {
@ -5866,7 +5878,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
ASSERT3P(zal, !=, NULL); ASSERT3P(zal, !=, NULL);
for (int d = 0; d < ndvas; d++) { for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, metaslab_class_t *_mc;
if (is_special_failsafe && (d == 1)) {
/*
* If we have the special_failsafe prop set, then make
* the 2nd copy of the data we are going to write go to
* the regular pool rather than yet another copy to the
* alloc class device. That way, if the special device
* is lost, there's still a backup in the pool.
*/
_mc = spa_normal_class(spa);
} else {
_mc = mc;
}
error = metaslab_alloc_dva(spa, _mc, psize, dva, d, hintdva,
txg, flags, zal, allocator); txg, flags, zal, allocator);
if (error != 0) { if (error != 0) {
for (d--; d >= 0; d--) { for (d--; d >= 0; d--) {

View File

@ -477,6 +477,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
DNODE_MIN_SIZE, ZPROP_SRC_NONE); DNODE_MIN_SIZE, ZPROP_SRC_NONE);
} }
if (spa_feature_is_enabled(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) {
zprop_source_t src;
if ((uint64_t)spa->spa_special_failsafe ==
zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE))
src = ZPROP_SRC_DEFAULT;
else
src = ZPROP_SRC_LOCAL;
spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE,
NULL, spa->spa_special_failsafe, src);
} else {
/* special_failsafe not used */
spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE,
NULL, B_FALSE, ZPROP_SRC_NONE);
}
if ((dp = list_head(&spa->spa_config_list)) != NULL) { if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) { if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@ -610,6 +626,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
int error = 0, reset_bootfs = 0; int error = 0, reset_bootfs = 0;
uint64_t objnum = 0; uint64_t objnum = 0;
boolean_t has_feature = B_FALSE; boolean_t has_feature = B_FALSE;
boolean_t special_failsafe_prop = B_FALSE;
/*
* The way the feature flags work here is a little interesting.
*
* At zpool creation time, this feature will not be initialized yet when
* spa_prop_validate() gets called. This works out though, as the
* feature flag will be passed in the nvlist if the feature is enabled.
*
* After the pool is created, calls to this function (like zpool set)
* will not include the feature flag in the props nvlist, but the
* feature table will be initialized, so we can use
* spa_feature_is_active().
*/
boolean_t special_failsafe_feature_disabled;
special_failsafe_feature_disabled = !(spa_feature_is_enabled(spa,
SPA_FEATURE_SPECIAL_FAILSAFE) || spa_feature_is_active(spa,
SPA_FEATURE_SPECIAL_FAILSAFE));
/* Did they explicitly pass feature@special_failsafe=enabled ? */
boolean_t special_failsafe_feature_passed = B_FALSE;
elem = NULL; elem = NULL;
while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
@ -617,6 +654,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
const char *strval, *slash, *check, *fname; const char *strval, *slash, *check, *fname;
const char *propname = nvpair_name(elem); const char *propname = nvpair_name(elem);
zpool_prop_t prop = zpool_name_to_prop(propname); zpool_prop_t prop = zpool_name_to_prop(propname);
spa_feature_t fid = 0;
switch (prop) { switch (prop) {
case ZPOOL_PROP_INVAL: case ZPOOL_PROP_INVAL:
@ -651,11 +689,30 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
} }
fname = strchr(propname, '@') + 1; fname = strchr(propname, '@') + 1;
if (zfeature_lookup_name(fname, NULL) != 0) { if (zfeature_lookup_name(fname, &fid) != 0) {
error = SET_ERROR(EINVAL); error = SET_ERROR(EINVAL);
break; break;
} }
/*
* Special case - If both:
*
* SPA_FEATURE_SPECIAL_FAILSAFE = disabled
*
* ... and ...
*
* ZPOOL_PROP_SPECIAL_FAILSAFE = on
*
* then we need to fail. Note that the presence
* of SPA_FEATURE_SPECIAL_FAILSAFE in the
* nvlist means it is enabled (although its
* intval will be 0). If it's disabled, then
* SPA_FEATURE_SPECIAL_FAILSAFE will not
* be in the nvlist at all.
*/
if (fid == SPA_FEATURE_SPECIAL_FAILSAFE) {
special_failsafe_feature_passed =
B_TRUE;
}
has_feature = B_TRUE; has_feature = B_TRUE;
} else { } else {
error = SET_ERROR(EINVAL); error = SET_ERROR(EINVAL);
@ -799,6 +856,13 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
if (strlen(strval) > ZPROP_MAX_COMMENT) if (strlen(strval) > ZPROP_MAX_COMMENT)
error = SET_ERROR(E2BIG); error = SET_ERROR(E2BIG);
break; break;
case ZPOOL_PROP_SPECIAL_FAILSAFE:
error = nvpair_value_uint64(elem, &intval);
if (!error && intval > 1)
error = SET_ERROR(EINVAL);
if (intval == 1)
special_failsafe_prop = B_TRUE;
break;
default: default:
break; break;
@ -811,6 +875,26 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
(void) nvlist_remove_all(props, (void) nvlist_remove_all(props,
zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
if (special_failsafe_prop && special_failsafe_feature_disabled &&
!special_failsafe_feature_passed) {
/*
* We can't enable SPECIAL_FAILSAFE pool prop if the
* feature flag SPA_FEATURE_SPECIAL_FAILSAFE is
* disabled.
*/
error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE);
}
/*
* If the user wants to turn on the special_failsafe prop, but it
* was turned off (while the feature was active), then it can't be
* turned on again.
*/
if (spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE) &&
!spa->spa_special_failsafe && special_failsafe_prop) {
error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE);
}
if (!error && reset_bootfs) { if (!error && reset_bootfs) {
error = nvlist_remove(props, error = nvlist_remove(props,
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
@ -2475,6 +2559,53 @@ spa_check_removed(vdev_t *vd)
} }
} }
/*
 * Decide what to do if we have missing/corrupted alloc class devices.
 *
 * If we have missing top-level vdevs and they are all alloc class devices
 * with special_failsafe set, then we may still be able to import the pool.
 *
 * Returns 0 if the import can proceed, ENXIO otherwise.
 */
static int
spa_check_for_bad_alloc_class_devices(spa_t *spa)
{
	/* No recoverable alloc class vdevs are missing; nothing to decide. */
	if (spa->spa_missing_recovered_tvds == 0)
		return (0);
	/*
	 * Are there missing alloc class devices but
	 * SPA_FEATURE_SPECIAL_FAILSAFE is not enabled?  If so, then we
	 * can't import.
	 */
	if (!spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) {
		spa_load_note(spa, "some alloc class devices are missing, "
		    "cannot import.");
		return (SET_ERROR(ENXIO));
	}
	/*
	 * If all the missing top-level devices are alloc class devices, and
	 * if they have all their data backed up to the pool, then we can
	 * still import the pool.
	 */
	if (spa->spa_missing_tvds > 0 &&
	    spa->spa_missing_tvds == spa->spa_missing_recovered_tvds) {
		spa_load_note(spa, "only alloc class devices are missing, and "
		    "the normal pool has copies of the alloc class data, so "
		    "it's still possible to import.");
		return (0);
	}
	/*
	 * If we're here, then it means that not all the missing top-level
	 * vdevs were alloc class devices.  This should have been caught
	 * earlier.
	 */
	spa_load_note(spa, "some alloc class devices that do not have a "
	    "special_failsafe backup copy are amongst those that are "
	    "missing, cannot import");
	return (SET_ERROR(ENXIO));
}
static int static int
spa_check_for_missing_logs(spa_t *spa) spa_check_for_missing_logs(spa_t *spa)
{ {
@ -3966,7 +4097,24 @@ spa_ld_open_vdevs(spa_t *spa)
error = vdev_open(spa->spa_root_vdev); error = vdev_open(spa->spa_root_vdev);
spa_config_exit(spa, SCL_ALL, FTAG); spa_config_exit(spa, SCL_ALL, FTAG);
if (spa->spa_missing_tvds != 0) { if (spa->spa_missing_tvds != 0 &&
spa->spa_missing_tvds == spa->spa_missing_recovered_tvds &&
(error == 0 || error == ENOENT)) {
/*
* Special case: If all the missing top-level vdevs are special
* devices, we may or may not be able to import the pool,
* depending on if the relevant special_failsafe feature and
* property are set. At this early stage of import we do not
* have the feature flags loaded yet, so for now proceed
* with the import. We will do the backup checks later after
* the feature flags are loaded.
*/
spa_load_note(spa, "vdev tree has %lld missing special "
"top-level vdevs. Keep importing for now until we "
"can check the feature flags.",
(u_longlong_t)spa->spa_missing_tvds);
error = 0;
} else if (spa->spa_missing_tvds != 0) {
spa_load_note(spa, "vdev tree has %lld missing top-level " spa_load_note(spa, "vdev tree has %lld missing top-level "
"vdevs.", (u_longlong_t)spa->spa_missing_tvds); "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
@ -4737,6 +4885,14 @@ spa_ld_get_props(spa_t *spa)
spa->spa_autoreplace = (autoreplace != 0); spa->spa_autoreplace = (autoreplace != 0);
} }
uint64_t special_failsafe = 0;
spa_prop_find(spa, ZPOOL_PROP_SPECIAL_FAILSAFE,
&special_failsafe);
if (special_failsafe)
spa->spa_special_failsafe = B_TRUE;
else
spa->spa_special_failsafe = B_FALSE;
/* /*
* If we are importing a pool with missing top-level vdevs, * If we are importing a pool with missing top-level vdevs,
* we enforce that the pool doesn't panic or get suspended on * we enforce that the pool doesn't panic or get suspended on
@ -5398,6 +5554,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
if (error != 0) if (error != 0)
goto fail; goto fail;
spa_import_progress_set_notes(spa, "Checking for bad alloc class "
"devices");
spa_check_for_bad_alloc_class_devices(spa);
if (error != 0)
return (error);
spa_import_progress_set_notes(spa, "Loading dedup tables"); spa_import_progress_set_notes(spa, "Loading dedup tables");
error = spa_ld_load_dedup_tables(spa); error = spa_ld_load_dedup_tables(spa);
if (error != 0) if (error != 0)
@ -6589,6 +6752,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
/*
* Set initial special_failsafe settings. These may change after the
* nvlist properties are processed a little later in spa_sync_props().
*/
spa->spa_special_failsafe = (boolean_t)
zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE);
if (props != NULL) { if (props != NULL) {
spa_configfile_set(spa, props, B_FALSE); spa_configfile_set(spa, props, B_FALSE);
spa_sync_props(props, tx); spa_sync_props(props, tx);
@ -9487,6 +9657,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
const char *elemname = nvpair_name(elem); const char *elemname = nvpair_name(elem);
zprop_type_t proptype; zprop_type_t proptype;
spa_feature_t fid; spa_feature_t fid;
// boolean_t boolval;
switch (prop = zpool_name_to_prop(elemname)) { switch (prop = zpool_name_to_prop(elemname)) {
case ZPOOL_PROP_VERSION: case ZPOOL_PROP_VERSION:
@ -9549,7 +9720,6 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
spa_history_log_internal(spa, "set", tx, spa_history_log_internal(spa, "set", tx,
"%s=%s", nvpair_name(elem), strval); "%s=%s", nvpair_name(elem), strval);
break; break;
case ZPOOL_PROP_INVAL: case ZPOOL_PROP_INVAL:
if (zpool_prop_feature(elemname)) { if (zpool_prop_feature(elemname)) {
fname = strchr(elemname, '@') + 1; fname = strchr(elemname, '@') + 1;
@ -9631,6 +9801,10 @@ spa_sync_props(void *arg, dmu_tx_t *tx)
case ZPOOL_PROP_MULTIHOST: case ZPOOL_PROP_MULTIHOST:
spa->spa_multihost = intval; spa->spa_multihost = intval;
break; break;
case ZPOOL_PROP_SPECIAL_FAILSAFE:
spa->spa_special_failsafe =
(boolean_t)intval;
break;
default: default:
break; break;
} }

View File

@ -738,6 +738,17 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
/*
* Testing showed that spa_special_failsafe needs to be on by default
* here no matter what. Later on it will be turned off since
* the feature is off by default. If you don't have it on at early
* SPA creation time, then it's impossible to import the pool with all
* the special devices missing. This could be due to the need to
* write two copies of early metadata.
*/
spa->spa_special_failsafe = B_TRUE;
spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_deadman_failmode(spa, zfs_deadman_failmode);
spa_set_allocator(spa, zfs_active_allocator); spa_set_allocator(spa, zfs_active_allocator);
@ -1682,6 +1693,9 @@ spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
*/ */
ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
if (spa->spa_special_failsafe)
spa_feature_incr(spa, SPA_FEATURE_SPECIAL_FAILSAFE, tx);
} }
/* /*
@ -2850,10 +2864,21 @@ spa_syncing_log_sm(spa_t *spa)
return (spa->spa_syncing_log_sm); return (spa->spa_syncing_log_sm);
} }
/*
* Record the total number of missing top-level vdevs ('missing'), and the
* number of missing top-level vdevs that are recoverable ('missing_recovered').
* In this case, missing_recovered is the number of top-level alloc class vdevs
* that are recoverable since the special_failsafe pool prop was on, and thus
* their data is "backed up" to the main pool.
*
* The separate 'missing_recovered' count is used during pool import to
* determine if we can import a pool with missing alloc class vdevs.
*/
void void
spa_set_missing_tvds(spa_t *spa, uint64_t missing) spa_set_missing_tvds(spa_t *spa, uint64_t missing, uint64_t missing_recovered)
{ {
spa->spa_missing_tvds = missing; spa->spa_missing_tvds = missing;
spa->spa_missing_recovered_tvds = missing_recovered;
} }
/* /*

View File

@ -728,6 +728,60 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
return (vd); return (vd);
} }
/* Return true if this vdev has no children, i.e. it is a leaf vdev. */
boolean_t
vdev_is_leaf(vdev_t *vd)
{
	return (vd->vdev_children == 0 ? B_TRUE : B_FALSE);
}
/* Return true if vdev or TLD vdev is special alloc class */
boolean_t
vdev_is_special(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
		return (B_TRUE);

	/*
	 * A leaf that sits under a mirror carries no bias itself; the
	 * 'mirror' TLD parent holds VDEV_BIAS_SPECIAL instead.  Walk up
	 * one level in that case.
	 */
	if (vdev_is_leaf(vd) && pvd != NULL)
		return (vdev_is_special(pvd));

	return (B_FALSE);
}
/* Return true if vdev or TLD vdev is dedup alloc class */
boolean_t
vdev_is_dedup(vdev_t *vd)
{
if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
return (B_TRUE);
/*
* If the vdev is a leaf vdev, and is part of a mirror, its parent
* 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_DEDUP, but the
* leaf vdev itself will not. So we also need to check the parent
* in those cases.
*/
if (vdev_is_leaf(vd) &&
(vd->vdev_parent != NULL && vdev_is_dedup(vd->vdev_parent))) {
return (B_TRUE);
}
return (B_FALSE);
}
/* Return true if this vdev belongs to either alloc class (special/dedup). */
boolean_t
vdev_is_alloc_class(vdev_t *vd)
{
	if (vdev_is_special(vd))
		return (B_TRUE);
	return (vdev_is_dedup(vd));
}
/* /*
* Allocate a new vdev. The 'alloctype' is used to control whether we are * Allocate a new vdev. The 'alloctype' is used to control whether we are
* creating a new vdev or loading an existing one - the behavior is slightly * creating a new vdev or loading an existing one - the behavior is slightly
@ -746,6 +800,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
int rc; int rc;
vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
boolean_t top_level = (parent && !parent->vdev_parent); boolean_t top_level = (parent && !parent->vdev_parent);
const char *bias = NULL;
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
@ -797,8 +852,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP)); return (SET_ERROR(ENOTSUP));
if (top_level && alloctype == VDEV_ALLOC_ADD) { if (top_level && alloctype == VDEV_ALLOC_ADD) {
const char *bias;
/* /*
* If creating a top-level vdev, check for allocation * If creating a top-level vdev, check for allocation
* classes input. * classes input.
@ -840,6 +893,11 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
vd->vdev_tsd = tsd; vd->vdev_tsd = tsd;
vd->vdev_islog = islog; vd->vdev_islog = islog;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&bias) == 0) {
alloc_bias = vdev_derive_alloc_bias(bias);
}
if (top_level && alloc_bias != VDEV_BIAS_NONE) if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias; vd->vdev_alloc_bias = alloc_bias;
@ -3690,8 +3748,9 @@ vdev_load(vdev_t *vd)
VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
bias_str); bias_str);
if (error == 0) { if (error == 0) {
ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); if (vd->vdev_alloc_bias == VDEV_BIAS_NONE)
vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); vd->vdev_alloc_bias =
vdev_derive_alloc_bias(bias_str);
} else if (error != ENOENT) { } else if (error != ENOENT) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA); VDEV_AUX_CORRUPT_DATA);
@ -4150,7 +4209,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
* If this device has the only valid copy of the data, then * If this device has the only valid copy of the data, then
* back off and simply mark the vdev as degraded instead. * back off and simply mark the vdev as degraded instead.
*/ */
if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) &&
vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
vd->vdev_degraded = 1ULL; vd->vdev_degraded = 1ULL;
vd->vdev_faulted = 0ULL; vd->vdev_faulted = 0ULL;
@ -4366,8 +4426,8 @@ top:
* don't allow it to be offlined. Log devices are always * don't allow it to be offlined. Log devices are always
* expendable. * expendable.
*/ */
if (!tvd->vdev_islog && vd->vdev_aux == NULL && if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) &&
vdev_dtl_required(vd)) vd->vdev_aux == NULL && vdev_dtl_required(vd))
return (spa_vdev_state_exit(spa, NULL, return (spa_vdev_state_exit(spa, NULL,
SET_ERROR(EBUSY))); SET_ERROR(EBUSY)));
@ -4423,7 +4483,8 @@ top:
vd->vdev_offline = B_TRUE; vd->vdev_offline = B_TRUE;
vdev_reopen(tvd); vdev_reopen(tvd);
if (!tvd->vdev_islog && vd->vdev_aux == NULL && if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) &&
vd->vdev_aux == NULL &&
vdev_is_dead(tvd)) { vdev_is_dead(tvd)) {
vd->vdev_offline = B_FALSE; vd->vdev_offline = B_FALSE;
vdev_reopen(tvd); vdev_reopen(tvd);
@ -5269,10 +5330,14 @@ vdev_propagate_state(vdev_t *vd)
* device, treat the root vdev as if it were * device, treat the root vdev as if it were
* degraded. * degraded.
*/ */
if (child->vdev_islog && vd == rvd) if ((child->vdev_islog ||
vdev_is_special_failsafe(child)) &&
(vd == rvd)) {
degraded++; degraded++;
else } else {
faulted++; faulted++;
}
} else if (child->vdev_state <= VDEV_STATE_DEGRADED) { } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
degraded++; degraded++;
} }
@ -5448,8 +5513,9 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
zfs_post_state_change(spa, vd, save_state); zfs_post_state_change(spa, vd, save_state);
} }
if (!isopen && vd->vdev_parent) if (!isopen && vd->vdev_parent) {
vdev_propagate_state(vd->vdev_parent); vdev_propagate_state(vd->vdev_parent);
}
} }
boolean_t boolean_t
@ -5517,6 +5583,24 @@ vdev_log_state_valid(vdev_t *vd)
return (B_FALSE); return (B_FALSE);
} }
/*
* Is the vdev an alloc class vdev that is part of a pool that has
* special_failsafe on, and thus has all its data backed up to the main pool?
*
* This function works for both top-level vdevs and leaf vdevs.
*/
boolean_t
vdev_is_special_failsafe(vdev_t *vd)
{
if (vdev_is_alloc_class(vd))
return (vd->vdev_spa->spa_special_failsafe);
/* Leaf under an alloc class TLD: defer to the parent's answer. */
if (vdev_is_leaf(vd) && vd->vdev_parent != NULL)
return (vdev_is_special_failsafe(vd->vdev_parent));
return (B_FALSE);
}
/* /*
* Expand a vdev if possible. * Expand a vdev if possible.
*/ */

View File

@ -521,8 +521,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vd->vdev_removing); vd->vdev_removing);
} }
/* zpool command expects alloc class data */ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
const char *bias = NULL; const char *bias = NULL;
switch (vd->vdev_alloc_bias) { switch (vd->vdev_alloc_bias) {
@ -539,6 +538,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
ASSERT3U(vd->vdev_alloc_bias, ==, ASSERT3U(vd->vdev_alloc_bias, ==,
VDEV_BIAS_NONE); VDEV_BIAS_NONE);
} }
fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
bias); bias);
} }
@ -1804,9 +1804,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
spa_t *spa = svd[0]->vdev_spa; spa_t *spa = svd[0]->vdev_spa;
zio_t *zio; zio_t *zio;
uint64_t good_writes = 0; uint64_t good_writes = 0;
boolean_t failure_but_special_failsafe = B_FALSE;
int rc;
zio = zio_root(spa, NULL, NULL, flags); zio = zio_root(spa, NULL, NULL, flags);
for (int v = 0; v < svdcount; v++) for (int v = 0; v < svdcount; v++)
vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
@ -1850,7 +1851,38 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
(void) zio_wait(zio); (void) zio_wait(zio);
return (good_writes >= 1 ? 0 : EIO); /*
* Special case:
*
* If we had zero good writes, but all the writes were to alloc class
* disks that were on a pool with special_failsafe on, then it's not
* fatal.
*/
if (good_writes == 0) {
failure_but_special_failsafe = B_TRUE;
for (int v = 0; v < svdcount; v++) {
if (!vdev_is_special_failsafe(svd[v])) {
failure_but_special_failsafe = B_FALSE;
break;
}
}
}
if (good_writes >= 1) {
/* success */
rc = 0;
} else if (failure_but_special_failsafe) {
/*
* All the failures are on allocation class disks that were
* fully backed up to the pool, so this isn't fatal.
*/
rc = 0;
} else {
/* failure */
rc = EIO;
}
return (rc);
} }
/* /*
@ -1966,7 +1998,8 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
zio_t *vio = zio_null(zio, spa, NULL, zio_t *vio = zio_null(zio, spa, NULL,
(vd->vdev_islog || vd->vdev_aux != NULL) ? (vd->vdev_islog || vd->vdev_aux != NULL ||
vdev_is_special_failsafe(vd)) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done, vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags); good_writes, flags);
vdev_label_sync(vio, good_writes, vd, l, txg, flags); vdev_label_sync(vio, good_writes, vd, l, txg, flags);
@ -2019,6 +2052,7 @@ retry:
if (error != 0) { if (error != 0) {
if ((flags & ZIO_FLAG_TRYHARD) != 0) if ((flags & ZIO_FLAG_TRYHARD) != 0)
return (error); return (error);
flags |= ZIO_FLAG_TRYHARD; flags |= ZIO_FLAG_TRYHARD;
} }

View File

@ -32,6 +32,7 @@
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/zfeature.h>
/* /*
* Virtual device vector for the pool's root vdev. * Virtual device vector for the pool's root vdev.
@ -46,6 +47,7 @@ vdev_root_core_tvds(vdev_t *vd)
vdev_t *cvd = vd->vdev_child[c]; vdev_t *cvd = vd->vdev_child[c];
if (!cvd->vdev_ishole && !cvd->vdev_islog && if (!cvd->vdev_ishole && !cvd->vdev_islog &&
!vdev_is_special_failsafe(vd) &&
cvd->vdev_ops != &vdev_indirect_ops) { cvd->vdev_ops != &vdev_indirect_ops) {
tvds++; tvds++;
} }
@ -87,6 +89,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
int lasterror = 0; int lasterror = 0;
int numerrors = 0; int numerrors = 0;
int numerrors_recovered = 0;
if (vd->vdev_children == 0) { if (vd->vdev_children == 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
@ -97,18 +100,25 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
for (int c = 0; c < vd->vdev_children; c++) { for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c]; vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error && !cvd->vdev_islog && if (cvd->vdev_open_error && !cvd->vdev_islog &&
cvd->vdev_ops != &vdev_indirect_ops) { cvd->vdev_ops != &vdev_indirect_ops) {
lasterror = cvd->vdev_open_error; lasterror = cvd->vdev_open_error;
numerrors++; numerrors++;
if (vdev_is_special_failsafe(cvd))
numerrors_recovered++;
} }
} }
if (spa_load_state(spa) != SPA_LOAD_NONE) if (spa_load_state(spa) != SPA_LOAD_NONE) {
spa_set_missing_tvds(spa, numerrors); spa_set_missing_tvds(spa, numerrors, numerrors_recovered);
}
if (too_many_errors(vd, numerrors)) { if (numerrors != 0 && (numerrors == numerrors_recovered)) {
vdev_dbgmsg(vd, "there were %lu top-level errors, but they were"
    " all alloc class vdevs with special_failsafe. Keep trying"
    " to import.",
    (long unsigned) numerrors);
} else if (too_many_errors(vd, numerrors)) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
return (lasterror); return (lasterror);
} }

View File

@ -3501,6 +3501,19 @@ zio_ddt_write(zio_t *zio)
ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
/*
* Dedup writes can either go to a dedicated dedup device or to a
* dedicated special device. If we have special_failsafe on, we need
* to make an extra copy of the data to go on the pool. To do this
* we need to adjust the ZIO's copies here so the later stages in the
* ZIO pipeline work correctly.
*/
if (spa->spa_special_failsafe && zp->zp_copies == 1) {
zp->zp_copies = 2;
}
p = zp->zp_copies;
ddt_enter(ddt); ddt_enter(ddt);
dde = ddt_lookup(ddt, bp, B_TRUE); dde = ddt_lookup(ddt, bp, B_TRUE);
ddp = &dde->dde_phys[p]; ddp = &dde->dde_phys[p];
@ -3631,6 +3644,22 @@ zio_dva_throttle(zio_t *zio)
mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
/*
* If the special_failsafe pool prop is enabled, we will do the regular
* write to the special/dedup device and an additional "backup"
* write to the normal pool. That way if the special/dedup devices
* all fail, we don't lose all data in our pool.
*
* Reserve that 2nd write to the regular pool here. The DVAs
* for both writes will later be allocated in the
* next step in the ZIO pipeline in
* zio_dva_allocate()->metaslab_alloc().
*/
if ((spa->spa_special_failsafe && (mc == spa_special_class(spa) ||
mc == spa_dedup_class(spa))) && zio->io_prop.zp_copies == 1) {
zio->io_prop.zp_copies = 2;
}
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
!mc->mc_alloc_throttle_enabled || !mc->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG || zio->io_child_type == ZIO_CHILD_GANG ||

View File

@ -53,6 +53,14 @@ tags = ['functional', 'arc']
tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
tags = ['functional', 'atime'] tags = ['functional', 'atime']
[tests/functional/special_failsafe]
tests = ['special_failsafe_add', 'special_failsafe_create',
'special_failsafe_files', 'special_failsafe_import',
'special_failsafe_offline', 'special_failsafe_prop',
'special_failsafe_scrub', 'special_failsafe_split']
tags = ['functional', 'special_failsafe']
[tests/functional/bclone] [tests/functional/bclone]
tests = ['bclone_crossfs_corner_cases_limited', tests = ['bclone_crossfs_corner_cases_limited',
'bclone_crossfs_data', 'bclone_crossfs_data',

View File

@ -1081,6 +1081,16 @@ function get_pool_prop # property pool
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
} }
# Get the specified vdev property in parsable format, or fail the test.
#
# $1: property name
# $2: pool name
# $3: vdev name
function get_vdev_prop
{
	typeset prop="$1"
	typeset pool="$2"
	typeset vdev="$3"

	zpool get -Hpo value "$prop" "$pool" "$vdev" || \
	    log_fail "zpool get $prop $pool $vdev"
}
# Return 0 if a pool exists; $? otherwise # Return 0 if a pool exists; $? otherwise
# #
# $1 - pool name # $1 - pool name
@ -1815,7 +1825,8 @@ function verify_pool
function get_disklist # pool function get_disklist # pool
{ {
echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \
grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") grep -vEe '^-----' | \
grep -Ev '^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|-[0-9]$')
} }
# #
@ -3907,3 +3918,28 @@ function pop_coredump_pattern
;; ;;
esac esac
} }
# Get a list of all vdevs in the pool that are a certain type.
#
# The returned list is in a space-separated string, with the full path of each
# vdev included:
#
# "/dev/sda /dev/sdb /dev/sdc"
#
# $1: Type of disk to get ('special', 'dedup', 'log', 'cache', 'spare')
# $2: (optional) pool name
function get_list_of_vdevs_that_are {
poolname=${2:-$TESTPOOL}
# Print the pool layout with full device paths (-P), drop interior vdev
# rows (mirror/draid/raidz) so nesting doesn't end a section early, then
# walk the rows in awk: 'tmp' is set while we are inside the section whose
# header column matched the requested type token, and every row whose
# first field looks like an absolute path ("/...") is printed.  'first'
# suppresses the separator space before the first path emitted.
zpool status -P $poolname | sed -r '/\s+(mirror|draid|raidz)/d' | \
awk -v token="$1" '{
if (tmp == 1 && substr($1,1,1) == "/") {
if (first != 1) {
printf "%s", $1;
first=1;
} else {
printf " %s", $1;
}
} else {tmp=0}; if ($1 == token) {tmp=1}}
END {print ""}'
}

View File

@ -90,6 +90,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
functional/alloc_class/alloc_class.kshlib \ functional/alloc_class/alloc_class.kshlib \
functional/atime/atime.cfg \ functional/atime/atime.cfg \
functional/atime/atime_common.kshlib \ functional/atime/atime_common.kshlib \
functional/special_failsafe/special_failsafe.cfg \
functional/special_failsafe/special_failsafe.kshlib \
functional/bclone/bclone.cfg \ functional/bclone/bclone.cfg \
functional/bclone/bclone_common.kshlib \ functional/bclone/bclone_common.kshlib \
functional/bclone/bclone_corner_cases.kshlib \ functional/bclone/bclone_corner_cases.kshlib \
@ -441,6 +443,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/atime/root_atime_on.ksh \ functional/atime/root_atime_on.ksh \
functional/atime/root_relatime_on.ksh \ functional/atime/root_relatime_on.ksh \
functional/atime/setup.ksh \ functional/atime/setup.ksh \
functional/special_failsafe/special_failsafe_add.ksh \
functional/special_failsafe/special_failsafe_create.ksh \
functional/special_failsafe/special_failsafe_files.ksh \
functional/special_failsafe/special_failsafe_import.ksh \
functional/special_failsafe/special_failsafe_prop.ksh \
functional/special_failsafe/special_failsafe_offline.ksh \
functional/special_failsafe/special_failsafe_scrub.ksh \
functional/special_failsafe/special_failsafe_split.ksh \
functional/special_failsafe/cleanup.ksh \
functional/special_failsafe/setup.ksh \
functional/bclone/bclone_crossfs_corner_cases.ksh \ functional/bclone/bclone_crossfs_corner_cases.ksh \
functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ functional/bclone/bclone_crossfs_corner_cases_limited.ksh \
functional/bclone/bclone_crossfs_data.ksh \ functional/bclone/bclone_crossfs_data.ksh \

View File

@ -32,12 +32,16 @@ log_assert $claim
log_onexit cleanup log_onexit cleanup
log_must disk_setup log_must disk_setup
for type in special dedup; do
log_mustnot zpool create -d $TESTPOOL $CLASS_DISK0 $type $CLASS_DISK1 for arg in '-o special_failsafe=on' '' ; do
for type in special dedup; do
	# Use $arg (the loop variable) -- $args was a typo that expanded
	# empty, so the '-o special_failsafe=on' case was never exercised.
	log_mustnot zpool create $arg -d $TESTPOOL $CLASS_DISK0 $type \
	    $CLASS_DISK1
done
log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
$CLASS_DISK0 $CLASS_DISK1
log_must display_status "$TESTPOOL"
log_must zpool destroy -f "$TESTPOOL"
done done
log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
$CLASS_DISK0 $CLASS_DISK1
log_must display_status "$TESTPOOL"
log_must zpool destroy -f "$TESTPOOL"
log_pass $claim log_pass $claim

View File

@ -31,27 +31,29 @@ log_onexit cleanup
log_must disk_setup log_must disk_setup
for type in "" "mirror" "raidz" for arg in '-o special_failsafe=on' '' ; do
do for type in "" "mirror" "raidz"
log_must zpool create $TESTPOOL $type $ZPOOL_DISKS do
log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS
if [ "$type" = "mirror" ]; then if [ "$type" = "mirror" ]; then
log_must zpool add $TESTPOOL special mirror \ log_must zpool add $TESTPOOL special mirror \
$CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2
log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
elif [ "$type" = "raidz" ]; then elif [ "$type" = "raidz" ]; then
log_must zpool add $TESTPOOL special mirror \ log_must zpool add $TESTPOOL special mirror \
$CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK0 $CLASS_DISK1
log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 log_must zpool iostat -H $TESTPOOL $CLASS_DISK1
else else
log_must zpool add $TESTPOOL special $CLASS_DISK0 log_must zpool add $TESTPOOL special $CLASS_DISK0
log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 log_must zpool iostat -H $TESTPOOL $CLASS_DISK0
fi fi
log_must zpool destroy -f $TESTPOOL log_must zpool destroy -f $TESTPOOL
done
done done
log_pass $claim log_pass $claim

View File

@ -36,31 +36,35 @@ typeset ac_value
typeset stype="" typeset stype=""
typeset sdisks="" typeset sdisks=""
for type in "" "mirror" "raidz" for arg in '-o special_failsafe=on' '' ; do
do for type in "" "mirror" "raidz"
if [ "$type" = "mirror" ]; then do
stype="mirror" if [ "$type" = "mirror" ]; then
sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" stype="mirror"
elif [ "$type" = "raidz" ]; then sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
stype="mirror" elif [ "$type" = "raidz" ]; then
sdisks="${CLASS_DISK0} ${CLASS_DISK1}" stype="mirror"
else sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
stype="" else
sdisks="${CLASS_DISK0}" stype=""
fi sdisks="${CLASS_DISK0}"
fi
log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \
special $stype $sdisks special $stype $sdisks
ac_value="$(zpool get -H -o property,value all | awk '/allocation_classes/ {print $2}')" ac_value="$(zpool get -H -o property,value \
if [ "$ac_value" = "active" ]; then feature@allocation_classes | \
log_note "feature@allocation_classes is active" awk '/allocation_classes/ {print $2}')"
else if [ "$ac_value" = "active" ]; then
log_fail "feature@allocation_classes not active, \ log_note "feature@allocation_classes is active"
status = $ac_value" else
fi log_fail "feature@allocation_classes not active, \
status = $ac_value"
fi
log_must zpool destroy -f $TESTPOOL log_must zpool destroy -f $TESTPOOL
done
done done
log_pass $claim log_pass $claim

View File

@ -34,38 +34,44 @@ log_must disk_setup
typeset ac_value typeset ac_value
for type in "" "mirror" "raidz" for arg in '-o special_failsafe=on' '' ; do
do for type in "" "mirror" "raidz"
if [ "$type" = "mirror" ]; then do
log_must zpool create $TESTPOOL $type $ZPOOL_DISK0 $ZPOOL_DISK1 if [ "$type" = "mirror" ]; then
else log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISK0 \
log_must zpool create $TESTPOOL $type $ZPOOL_DISKS $ZPOOL_DISK1
fi else
ac_value="$(zpool get -H -o property,value all | \ log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS
awk '/allocation_classes/ {print $2}')" fi
if [ "$ac_value" = "enabled" ]; then ac_value="$(zpool get -H -o property,value \
log_note "feature@allocation_classes is enabled" feature@allocation_classes | \
else awk '/allocation_classes/ {print $2}')"
log_fail "feature@allocation_classes not enabled, \ if [ "$ac_value" = "enabled" ]; then
status = $ac_value" log_note "feature@allocation_classes is enabled"
fi else
log_fail "feature@allocation_classes not enabled, \
status = $ac_value"
fi
if [ "$type" = "" ]; then if [ "$type" = "" ]; then
log_must zpool add $TESTPOOL special $CLASS_DISK0 log_must zpool add $TESTPOOL special $CLASS_DISK0
else else
log_must zpool add $TESTPOOL special mirror \ log_must zpool add $TESTPOOL special mirror \
$CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK0 $CLASS_DISK1
fi fi
ac_value="$(zpool get -H -o property,value all | \ ac_value="$(zpool get -H -o property,value \
awk '/allocation_classes/ {print $2}')" feature@allocation_classes | \
if [ "$ac_value" = "active" ]; then awk '/allocation_classes/ {print $2}')"
log_note "feature@allocation_classes is active"
else
log_fail "feature@allocation_classes not active, \
status = $ac_value"
fi
log_must zpool destroy -f $TESTPOOL if [ "$ac_value" = "active" ]; then
log_note "feature@allocation_classes is active"
else
log_fail "feature@allocation_classes not active, \
status = $ac_value"
fi
log_must zpool destroy -f $TESTPOOL
done
done done
log_pass "Values of allocation_classes feature flag correct." log_pass "Values of allocation_classes feature flag correct."

View File

@ -32,10 +32,14 @@ log_onexit cleanup
log_must disk_setup log_must disk_setup
log_must zpool create $TESTPOOL \ for arg in '-o special_failsafe=on' '' ; do
mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \ log_must zpool create $arg $TESTPOOL \
special mirror $CLASS_DISK0 $CLASS_DISK1 mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
log_must zpool split $TESTPOOL split_pool special mirror $CLASS_DISK0 $CLASS_DISK1
log_must zpool destroy -f $TESTPOOL log_must zpool split $TESTPOOL split_pool
log_must zpool import -d $(dirname $CLASS_DISK1) split_pool
log_must zpool destroy -f $TESTPOOL
log_must zpool destroy -f split_pool
done
log_pass $claim log_pass $claim

View File

@ -31,11 +31,13 @@ log_onexit cleanup
log_must disk_setup log_must disk_setup
log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \ for arg in '-o special_failsafe=on' '' ; do
special mirror $CLASS_DISK0 $CLASS_DISK1 log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS \
log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1
log_must sleep 10 log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 log_must sleep 10
log_must zpool destroy -f $TESTPOOL log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
log_must zpool destroy -f $TESTPOOL
done
log_pass $claim log_pass $claim

View File

@ -35,22 +35,24 @@ typeset special_type=""
typeset create_disks="" typeset create_disks=""
typeset added_disks="" typeset added_disks=""
for type in "" "raidz" for arg in '-o special_failsafe=on' '' ; do
do for type in "" "raidz"
if [ "$type" = "raidz" ]; then do
special_type="mirror" if [ "$type" = "raidz" ]; then
create_disks="${CLASS_DISK0} ${CLASS_DISK1}" special_type="mirror"
added_disks="${CLASS_DISK2} ${CLASS_DISK3}" create_disks="${CLASS_DISK0} ${CLASS_DISK1}"
else added_disks="${CLASS_DISK2} ${CLASS_DISK3}"
special_type="" else
create_disks="${CLASS_DISK0}" special_type=""
added_disks="${CLASS_DISK1}" create_disks="${CLASS_DISK0}"
fi added_disks="${CLASS_DISK1}"
log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ fi
special $special_type $create_disks log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \
log_must zpool add $TESTPOOL special $special_type $added_disks special $special_type $create_disks
log_must zpool iostat $TESTPOOL $added_disks log_must zpool add $TESTPOOL special $special_type $added_disks
log_must zpool destroy -f $TESTPOOL log_must zpool iostat $TESTPOOL $added_disks
log_must zpool destroy -f $TESTPOOL
done
done done
log_pass $claim log_pass $claim

View File

@ -35,35 +35,39 @@ typeset stype=""
typeset sdisks="" typeset sdisks=""
typeset props="" typeset props=""
for type in "" "mirror" "raidz" for arg in '-o special_failsafe=on' '' ; do
do for type in "" "mirror" "raidz"
if [ "$type" = "mirror" ]; then do
stype="mirror" if [ "$type" = "mirror" ]; then
sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" stype="mirror"
props="-o ashift=12" sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}"
elif [ "$type" = "raidz" ]; then props="-o ashift=12"
stype="mirror" elif [ "$type" = "raidz" ]; then
sdisks="${CLASS_DISK0} ${CLASS_DISK1}" stype="mirror"
else sdisks="${CLASS_DISK0} ${CLASS_DISK1}"
stype="" else
sdisks="${CLASS_DISK0}" stype=""
fi sdisks="${CLASS_DISK0}"
fi
# #
# 1/3 of the time add the special vdev after creating the pool # 1/3 of the time add the special vdev after creating the pool
# #
if [ $((RANDOM % 3)) -eq 0 ]; then if [ $((RANDOM % 3)) -eq 0 ]; then
log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS log_must zpool create $arg ${props} $TESTPOOL $type \
log_must zpool add ${props} $TESTPOOL special $stype $sdisks $ZPOOL_DISKS
else log_must zpool add ${props} $TESTPOOL special $stype \
log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS \ $sdisks
special $stype $sdisks else
fi log_must zpool create $arg ${props} $TESTPOOL $type \
$ZPOOL_DISKS special $stype $sdisks
fi
log_must zpool export $TESTPOOL log_must zpool export $TESTPOOL
log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL
log_must display_status $TESTPOOL log_must display_status $TESTPOOL
log_must zpool destroy -f $TESTPOOL log_must zpool destroy -f $TESTPOOL
done
done done
log_pass $claim log_pass $claim

View File

@ -32,19 +32,22 @@ log_onexit cleanup
log_must disk_setup log_must disk_setup
log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ for arg in '-o special_failsafe=on' '' ; do
$CLASS_DISK0 $CLASS_DISK1 log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \
$CLASS_DISK0 $CLASS_DISK1
for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072 for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072
do do
log_must zfs set special_small_blocks=$value $TESTPOOL log_must zfs set special_small_blocks=$value $TESTPOOL
ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \ ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \
awk '/special_small_blocks/ {print $3}') awk '/special_small_blocks/ {print $3}')
if [ "$ACTUAL" != "$value" ] if [ "$ACTUAL" != "$value" ]
then then
log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value!" log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value"
fi fi
done
log_must zpool destroy -f "$TESTPOOL"
done done
log_must zpool destroy -f "$TESTPOOL"
log_pass $claim log_pass $claim

View File

@ -32,13 +32,17 @@ log_assert $claim
log_onexit cleanup log_onexit cleanup
log_must disk_setup log_must disk_setup
log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \
$CLASS_DISK0 $CLASS_DISK1
for value in 256 1025 33554432 for arg in '-o special_failsafe=on' '' ; do
do log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \
log_mustnot zfs set special_small_blocks=$value $TESTPOOL $CLASS_DISK0 $CLASS_DISK1
for value in 256 1025 33554432
do
log_mustnot zfs set special_small_blocks=$value $TESTPOOL
done
log_must zpool destroy -f "$TESTPOOL"
done done
log_must zpool destroy -f "$TESTPOOL"
log_pass $claim log_pass $claim

View File

@ -25,20 +25,20 @@
verify_runnable "global" verify_runnable "global"
# #
# Verify the file identified by the input <inode> is written on a special vdev # Given a dataset and an inode number, return a list of all the vdev numbers
# According to the pool layout used in this test vdev_id 3 and 4 are special # that the inode has blocks on.
# XXX: move this function to libtest.shlib once we get "Vdev Properties"
# #
function file_in_special_vdev # <dataset> <inode> # For example, if the inode has blocks on vdevs 0, 1 and 2, this would return
# the string "0 1 2"
#
function vdevs_file_is_on # <dataset> <inode>
{ {
typeset dataset="$1" typeset dataset="$1"
typeset inum="$2" typeset inum="$2"
typeset num_normal=$(echo $ZPOOL_DISKS | wc -w) zdb -dddddd $dataset $inum | awk '
num_normal=${num_normal##* } /L0 [0-9]+/{
zdb -dddddd $dataset $inum | awk -v d=$num_normal '{
# find DVAs from string "offset level dva" only for L0 (data) blocks # find DVAs from string "offset level dva" only for L0 (data) blocks
if (match($0,"L0 [0-9]+")) { # if (match($0,"L0 [0-9]+")) {
dvas[0]=$3 dvas[0]=$3
dvas[1]=$4 dvas[1]=$4
dvas[2]=$5 dvas[2]=$5
@ -50,25 +50,46 @@ if (match($0,"L0 [0-9]+")) {
print "Error parsing DVA: <" dva ">"; print "Error parsing DVA: <" dva ">";
exit 1; exit 1;
} }
# verify vdev is "special" count[arr[1]]++;
if (arr[1] < d) {
exit 1;
}
} }
} }
}}' #}
}
END {
# Print out the unique vdev numbers that had data
firstprint=1;
for (i in count) {
if (firstprint==1) {
printf("%d", i);
firstprint=0;
} else {
printf(" %d", i);
}
}
}
'
} }
# #
# Check that device removal works for special class vdevs # Check that device removal works for special class vdevs
# #
# $1: Set to 1 to backup alloc class data to the pool. Leave blank to disable
# backup.
function check_removal function check_removal
{ {
typeset backup
if [ "$1" == "1" ] ; then
backup=1
args="-o special_failsafe=on"
else
backup=0
args=""
fi
# #
# Create a non-raidz pool so we can remove top-level vdevs # Create a non-raidz pool so we can remove top-level vdevs
# #
log_must disk_setup log_must zpool create $args $TESTPOOL $ZPOOL_DISKS \
log_must zpool create $TESTPOOL $ZPOOL_DISKS \
special $CLASS_DISK0 special $CLASS_DISK1 special $CLASS_DISK0 special $CLASS_DISK1
log_must display_status "$TESTPOOL" log_must display_status "$TESTPOOL"
@ -93,19 +114,49 @@ function check_removal
for i in 1 2 3 4; do for i in 1 2 3 4; do
dataset="$TESTPOOL/$TESTFS" dataset="$TESTPOOL/$TESTFS"
inum="$(get_objnum /$TESTPOOL/$TESTFS/testfile.$i)" inum="$(get_objnum /$TESTPOOL/$TESTFS/testfile.$i)"
log_must file_in_special_vdev $dataset $inum
# Get a list of all the vdevs 'testfile.$i' has blocks on.
# The list will be string like "0 1 2 3" if the blocks are on
# vdevs 0-3.
on_vdevs="$(vdevs_file_is_on $dataset $inum)"
# Get the number of normal (non-special) pool disks
num_pool_disks=$(echo $ZPOOL_DISKS | wc -w)
num_pool_disks=${num_pool_disks##* }
if [ "$backup" == "1" ] ; then
# Data should be on all vdevs (both pool and special
# devices).
lowest_data_disk=0
highest_data_disk=$(($num_pool_disks + 1))
else
# Data should only be on special devices
lowest_data_disk=$num_pool_disks
highest_data_disk=$(($lowest_data_disk + 1))
fi
# Get the starting disks that we expect the data to be on.
# We assume two special devices are attached to the pool.
# Disk numbers start at zero.
expected_on_vdevs="$(seq -s ' ' $lowest_data_disk $highest_data_disk)"
# Compare the disks we expect to see the blocks on with
# the actual disks they're on.
if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then
# Data distribution is not what we expected, break out of
# the loop so we can properly tear down the pool. We will
# error out after the loop.
break;
fi
done done
log_must zpool remove $TESTPOOL $CLASS_DISK0 log_must zpool remove $TESTPOOL $CLASS_DISK0
sleep 5
sync_pool $TESTPOOL
sleep 1
log_must zdb -bbcc $TESTPOOL
log_must zpool list -v $TESTPOOL
log_must zpool destroy -f "$TESTPOOL" log_must zpool destroy -f "$TESTPOOL"
log_must disk_cleanup
if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then
log_fail "Expected data on disks $expected_on_vdevs, got $on_vdevs"
fi
} }
claim="Removing a special device from a pool succeeds." claim="Removing a special device from a pool succeeds."
@ -113,12 +164,15 @@ claim="Removing a special device from a pool succeeds."
log_assert $claim log_assert $claim
log_onexit cleanup log_onexit cleanup
typeset CLASS_DEVSIZE=$CLASS_DEVSIZE log_must disk_setup
for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do for backup in "1" "" ; do
typeset ZPOOL_DISKS=$ZPOOL_DISKS typeset CLASS_DEVSIZE=$CLASS_DEVSIZE
for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do
check_removal typeset ZPOOL_DISKS=$ZPOOL_DISKS
for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do
check_removal $backup
done
done done
done done
log_must disk_cleanup
log_pass $claim log_pass $claim

View File

@ -33,31 +33,34 @@ log_onexit cleanup
# Create a non-raidz pool so we can remove top-level vdevs # Create a non-raidz pool so we can remove top-level vdevs
# #
log_must disk_setup log_must disk_setup
log_must zpool create $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0
log_must display_status "$TESTPOOL"
# for arg in '-o special_failsafe=on' '' ; do
# Generate some dedup data in the dedup class before removal log_must zpool create $arg $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0
# log_must display_status "$TESTPOOL"
log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL #
block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL" # Generate some dedup data in the dedup class before removal
log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null" #
sync_pool log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL
log_must zpool list -v $TESTPOOL block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL"
log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null"
# sync_pool
# remove a dedup allocation vdev log_must zpool list -v $TESTPOOL
#
log_must zpool remove $TESTPOOL $CLASS_DISK0
sleep 5 #
sync_pool $TESTPOOL # remove a dedup allocation vdev
sleep 1 #
log_must zpool remove $TESTPOOL $CLASS_DISK0
log_must zdb -bbcc $TESTPOOL sleep 5
sync_pool $TESTPOOL
sleep 1
log_must zpool destroy -f "$TESTPOOL" log_must zdb -bbcc $TESTPOOL
log_must zpool destroy -f "$TESTPOOL"
done
log_pass $claim log_pass $claim

View File

@ -26,13 +26,15 @@ log_assert $claim
log_onexit cleanup log_onexit cleanup
log_must disk_setup log_must disk_setup
for size in 512 4096 32768 131072 524288 1048576 for arg in '-o special_failsafe=on' '' ; do
do for size in 512 4096 32768 131072 524288 1048576
let bigger=$size*2 do
log_mustnot zpool create -O recordsize=$size \ let bigger=$size*2
-O special_small_blocks=$bigger \ log_mustnot zpool create $arg -O recordsize=$size \
$TESTPOOL raidz $ZPOOL_DISKS special mirror \ -O special_small_blocks=$bigger \
$CLASS_DISK0 $CLASS_DISK1 $TESTPOOL raidz $ZPOOL_DISKS special mirror \
$CLASS_DISK0 $CLASS_DISK1
done
done done
log_pass $claim log_pass $claim

View File

@ -26,20 +26,22 @@ log_assert $claim
log_onexit cleanup log_onexit cleanup
log_must disk_setup log_must disk_setup
for size in 8192 32768 131072 524288 1048576 for arg in '-o special_failsafe=on' '' ; do
do for size in 8192 32768 131072 524288 1048576
let smaller=$size/2 do
log_must zpool create -O recordsize=$size \ let smaller=$size/2
-O special_small_blocks=$smaller \ log_must zpool create $arg -O recordsize=$size \
$TESTPOOL raidz $ZPOOL_DISKS special mirror \ -O special_small_blocks=$smaller \
$CLASS_DISK0 $CLASS_DISK1 $TESTPOOL raidz $ZPOOL_DISKS special mirror \
log_must zpool destroy -f "$TESTPOOL" $CLASS_DISK0 $CLASS_DISK1
log_must zpool destroy -f "$TESTPOOL"
log_must zpool create -O recordsize=$size \ log_must zpool create $arg -O recordsize=$size \
-O special_small_blocks=$size \ -O special_small_blocks=$size \
$TESTPOOL raidz $ZPOOL_DISKS special mirror \ $TESTPOOL raidz $ZPOOL_DISKS special mirror \
$CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK0 $CLASS_DISK1
log_must zpool destroy -f "$TESTPOOL" log_must zpool destroy -f "$TESTPOOL"
done
done done
log_pass $claim log_pass $claim

View File

@ -61,6 +61,7 @@ typeset -a properties=(
"bcloneused" "bcloneused"
"bclonesaved" "bclonesaved"
"bcloneratio" "bcloneratio"
"special_failsafe"
"feature@async_destroy" "feature@async_destroy"
"feature@empty_bpobj" "feature@empty_bpobj"
"feature@lz4_compress" "feature@lz4_compress"
@ -87,6 +88,7 @@ typeset -a properties=(
"feature@device_rebuild" "feature@device_rebuild"
"feature@draid" "feature@draid"
"feature@redaction_list_spill" "feature@redaction_list_spill"
"feature@special_failsafe"
) )
if is_linux || is_freebsd; then if is_linux || is_freebsd; then

View File

@ -0,0 +1,27 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2017, Intel Corporation.
# Copyright (c) 2018, Delphix
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
verify_runnable "global"
# Standard suite teardown, then remove the sparse-file disks, keyfile and
# backup directory created by this test group (see special_failsafe.kshlib).
default_cleanup_noexit
disk_cleanup
log_pass

View File

@ -0,0 +1,24 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
#
# Copyright (c) 2017, Intel Corporation.
# Copyright (c) 2018 by Delphix. All rights reserved.
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
verify_runnable "global"
# Only remove leftovers from a previous (possibly failed) run here; each
# test creates its own backing disks via disk_setup.
disk_cleanup
log_pass

View File

@ -0,0 +1,36 @@
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2017, Intel Corporation.
# Copyright (c) 2018 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
# Sparse-file disks used as the normal (non-special) pool vdevs.
export ZPOOL_DISK0="$TEST_BASE_DIR/device-0"
export ZPOOL_DISK1="$TEST_BASE_DIR/device-1"
export ZPOOL_DISK2="$TEST_BASE_DIR/device-2"
export ZPOOL_DISKS="${ZPOOL_DISK0} ${ZPOOL_DISK1} ${ZPOOL_DISK2}"
# Sparse-file disks used as alloc class ('special' and 'dedup') vdevs.
export CLASS_DISK0="$TEST_BASE_DIR/device-3"
export CLASS_DISK1="$TEST_BASE_DIR/device-4"
export CLASS_DISK2="$TEST_BASE_DIR/device-5"
export CLASS_DISK3="$TEST_BASE_DIR/device-6"
export CLASS_DISK4="$TEST_BASE_DIR/device-7"
export CLASS_DISK5="$TEST_BASE_DIR/device-8"
export CLASS_DISKS="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2} ${CLASS_DISK3} ${CLASS_DISK4} ${CLASS_DISK5}"
# Size of every backing file, consumed by disk_setup (special_failsafe.kshlib).
export ZPOOL_DEVSIZE=200M
export CLASS_DEVSIZE=200M
# Directory handed to 'zpool import -d' when re-importing test pools.
export IMPORTDIR="$TEST_BASE_DIR"

View File

@ -0,0 +1,255 @@
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2017, Intel Corporation.
# Copyright (c) 2018 by Delphix. All rights reserved.
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.cfg
BACKUP_DIR=$TEST_BASE_DIR/backups
# Create the sparse files backing the pool and alloc class vdevs, plus the
# directory used to stash backup copies of the special/dedup disk images.
# Fails the test if a backup directory already exists (stale state from a
# previous run) or if any file cannot be created.
function disk_setup
{
	# Fail loudly if the backing files cannot be created, instead of
	# letting a later 'zpool create' fail with a confusing error.
	log_must truncate -s $ZPOOL_DEVSIZE $ZPOOL_DISKS
	log_must truncate -s $CLASS_DEVSIZE $CLASS_DISKS

	if [ -d $BACKUP_DIR ] ; then
		log_fail "Existing $BACKUP_DIR directory (maybe leftover from failed test run?)"
	fi
	log_must mkdir -p $BACKUP_DIR
}
# Remove the sparse files backing the pool/special vdevs, the encryption
# keyfile, and the disk backup directory.
function disk_cleanup
{
	# Note: $ZPOOL_DEVSIZE/$CLASS_DEVSIZE are sizes (e.g. "200M"), not
	# file names, so they must not be passed to rm (the original code
	# tried to 'rm -f 200M ...').
	rm -f $ZPOOL_DISKS 2> /dev/null
	rm -f $CLASS_DISKS 2> /dev/null
	rm -f special_failsafe.key
	rm -fr $BACKUP_DIR
}
# Test-exit hook: destroy the test pool if it is still around, then remove
# every on-disk artifact created by disk_setup.
function cleanup
{
	datasetexists $TESTPOOL && zpool destroy -f $TESTPOOL 2> /dev/null
	disk_cleanup
}
# Write zeros to an existing file, keeping the same size.
#
# $1: path of the file to overwrite (must exist; the whole file is
#     rewritten as one block sized to the file's current length)
function zero_file {
	dd status=none if=/dev/zero of="$1" bs=$(stat_size "$1") count=1
}
# Write a verifiable file that will end up on a 'dedup' or 'special' vdev.
# The filename will include the sha256 of the file for easy verification later.
#
# $1: Write type - "dedup" or "special"
# $2: Path to directory to write the file to
#
# Increments the global $totalwritten counter on success (it is also used
# to make the generated filename unique).
#
# Note: we don't use log_must here since this can get really chatty and
# we don't want to spam the logs.  It will log_fail if there is an error.
function write_verifiable_file {
	class="$1"
	writedir="$2"
	if [[ "$class" == "dedup" ]] ; then
		# Our dedup file size can be up to a megabyte-ish
		filesize=$((32768 + ($RANDOM * $RANDOM % 1000000)))
		# Make write a multiple of the recordsize for dedup
		bs=32768
		count=$(($filesize / $bs))
		# Fill data with the letter 'a' for dedup
		file_write -b $bs -c $count -d 'a' -o create -f $writedir/tmp || return
	else
		# Make all files less than the 32k special_small_blocks size we
		# setup at dataset creation time
		filesize=$((($RANDOM % 32767) + 1))
		bs=$filesize
		count=1
		dd status=none if=/dev/urandom bs=$bs count=$count of="$writedir/tmp" || return
	fi
	# Name the final file "<sha256>.<class><counter>" so verify_directory
	# can later recompute the digest from the name alone.
	csum=$(sha256digest "$writedir/tmp")
	newfile=$csum.$class$totalwritten
	mv "$writedir/tmp" "$writedir/$newfile"
	# Basic sanity that we created our final file, and it has a non-zero size
	expectedsize=$(($bs * $count))
	actualsize=$(stat_size "$writedir/$newfile")
	if [[ "$actualsize" != "$expectedsize" ]] || [[ "$actualsize" == "0" ]] ; then
		log_fail "File $writedir/$newfile bad size $actualsize (expected $expectedsize)"
		return
	fi
	totalwritten=$(($totalwritten + 1))
}
# Write some files to all our datasets.
#
# For each dataset:
#
# - 10 files should hit special vdevs
# - 10 files should hit dedup vdevs
#
# Bumps the global $totalwritten counter via write_verifiable_file.
function write_some_files {
	typeset i
	typeset j	# was missing; without typeset, j leaked into global scope
	for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
		for j in $(seq 1 10) ; do
			write_verifiable_file special /$TESTPOOL/$i
			write_verifiable_file dedup /$TESTPOOL/$i
		done
	done
}
# Given a directory containing only files created by write_verifiable_file(),
# verify that the contents of the file match the sha256sum in the file's name.
#
# $1: Dir path with files to verify
#
# Calls log_fail (which ends the test) on the first mismatch.
function verify_directory {
	typeset verifydir="$1"
	typeset i
	for i in $(ls $verifydir) ; do
		# Files will look like:
		#
		# ed324386045fa39d3f41d4f13c8c3e6a4698466e2b694c327f7e490be9e4e33f.dedup13
		#
		# Just grab the sha256 part
		shaname="$(echo $i | cut -f1 -d'.')"
		if [[ $(sha256digest "$verifydir/$i") != "$shaname" ]] ; then
			log_fail "$verifydir/$i sha256 not $shaname"
			# Defensive: log_fail normally terminates, but make the
			# function's exit status false if it ever returns.
			false
			return
		fi
	done
	true
}
# Copy each named disk image into $BACKUP_DIR (flat layout, keyed by the
# image file's basename).  Arguments: one or more disk image paths.
function backup_alloc_class_disks {
	typeset disk
	for disk in $@ ; do
		cp "$disk" "$BACKUP_DIR/${disk##*/}"
	done
}
# Move each named disk image back out of $BACKUP_DIR into its original
# location (inverse of backup_alloc_class_disks).
function restore_alloc_class_disks {
	typeset disk
	for disk in $@ ; do
		mv "$BACKUP_DIR/${disk##*/}" "$disk"
	done
}
# Overwrite every named disk image with zeros, preserving its size, to
# simulate total loss of the alloc class devices.
function zero_alloc_class_disks {
	typeset disk
	for disk in $@ ; do
		zero_file "$disk"
	done
}
# Create multiple datasets with different permutations of copies and
# encryption.  All datasets share recordsize=32K, special_small_blocks=32K
# and dedup=on so that small writes land on 'special' vdevs and dedup
# writes land on 'dedup' vdevs.
#
# Sets the global $keyfile to the raw encryption key path; disk_cleanup
# removes it.
function special_failsafe_make_datasets {
	# Options common to every dataset (hoisted to avoid repetition).
	typeset opts="-o special_small_blocks=32K -o recordsize=32K -o dedup=on"

	log_must zfs create -o compression=off $opts $TESTPOOL/$TESTFS

	keyfile=$(pwd)/special_failsafe.key
	# Fail the test if the key cannot be written; an empty/partial key
	# would make the encrypted dataset creations fail confusingly.
	log_must dd if=/dev/urandom of=$keyfile bs=32 count=1

	typeset enc="-o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw"

	log_must zfs create -o copies=2 $opts $TESTPOOL/2copies
	log_must zfs create -o copies=3 $opts $TESTPOOL/3copies
	log_must zfs create $enc $opts $TESTPOOL/encrypted
	log_must zfs create -o copies=2 $enc $opts $TESTPOOL/encrypted2copies
	log_must zfs create -o copies=3 $enc $opts $TESTPOOL/encrypted3copies
}
# For each dataset we created in special_failsafe_make_datasets, go though
# and check that all the files in the datasets have the correct data.
# Compares the total file count against the global $totalwritten counter
# maintained by write_verifiable_file.
function verify_all_directories {
	typeset i
	for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
		verify_directory /$TESTPOOL/$i
	done
	# ...we should also have the correct number of files
	totalfiles=0
	for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
		totalfiles=$(($totalfiles + $(ls /$TESTPOOL/$i | wc -w)))
	done
	if [[ "$totalfiles" != "$totalwritten" ]] ; then
		log_fail "Wrong file count: expected $totalwritten, got $totalfiles"
	else
		log_note "Verified $totalfiles files"
	fi
}
# Return a space separated string of disks that are alloc class vdevs
# ('special' and 'dedup').  Disk names include the full path.
function get_list_of_alloc_class_disks {
	typeset specials=$(get_list_of_vdevs_that_are "special")
	typeset dedups=$(get_list_of_vdevs_that_are "dedup")
	typeset all="$dedups"

	# Prepend special disks (if any) so they come first in the list.
	[ -n "$specials" ] && all="$specials $all"
	echo "$all"
}
# Check that the pool/vdev properties and features for alloc class backups
# are sane on $TESTPOOL.
function check_pool_alloc_class_props {
	typeset feat=$(get_pool_prop feature@special_failsafe $TESTPOOL)
	typeset prop=$(get_pool_prop special_failsafe $TESTPOOL)

	# If the feature flag is disabled, the property can never be on.
	if [ "$feat" == "disabled" ] ; then
		log_must [ "$prop" == "off" ]
	fi
}
# Simple function to check pool and vdev properties are what we expect.  The
# values we expect are passed to this function:
#
# $1: 'feature@special_failsafe' pool feature
# $2: 'special_failsafe' pool prop
#
# This function will log_fail on error.
function boilerplate_check {
	typeset expected_feature=$1
	typeset expected_prop=$2
	typeset actual

	# Query each property once (the original re-ran get_pool_prop inside
	# the failure message, doubling the zpool invocations).
	actual=$(get_pool_prop feature@special_failsafe $TESTPOOL)
	if [ "$actual" != "$expected_feature" ] ; then
		log_fail "feature@special_failsafe = $actual, expected $expected_feature"
	fi

	actual=$(get_pool_prop special_failsafe $TESTPOOL)
	if [ "$actual" != "$expected_prop" ] ; then
		log_fail "special_failsafe = $actual, expected $expected_prop"
	fi
}

View File

@ -0,0 +1,96 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Verify that 'zpool add' and 'zpool attach' disks have the correct
# special_failsafe settings.
verify_runnable "global"
claim="zpool add|attach disks have correct special_failsafe settings"
log_assert $claim
log_onexit cleanup
# Try different pool configurations.  One config per line; the driver loop
# below feeds each line to do_test via 'read'.
configs="mirror $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1
mirror $ZPOOL_DISK0 $ZPOOL_DISK1 dedup mirror $CLASS_DISK0 $CLASS_DISK1"
log_must disk_setup
# Exercise one pool config across a special_failsafe property transition.
#
# $1: vdev config string passed to 'zpool create'
# $2: initial special_failsafe value ("on" or "off")
# $3: value to flip the property to mid-test (only applied when $2 != "off",
#     since special_failsafe cannot be re-enabled once turned off)
#
# Resets the global $totalwritten counter used by the write/verify helpers.
function do_test {
	typeset config="$1"
	typeset initial=$2
	typeset new=$3
	log_must zpool create -o special_failsafe=$initial $TESTPOOL $config
	totalwritten=0
	# Sanity check that feature@special_failsafe aligns with the
	# pool prop
	if [ $initial == "on" ] ; then
		feature_expected="active"
	else
		feature_expected="enabled"
	fi
	boilerplate_check "$feature_expected" "$initial"
	special_failsafe_make_datasets
	write_some_files
	if [ $initial != "off" ] ; then
		log_must zpool set special_failsafe=$new $TESTPOOL
	fi
	write_some_files
	# Now add a new special/dedup disk to the special mirror
	log_must zpool attach $TESTPOOL $CLASS_DISK0 $CLASS_DISK2
	write_some_files
	# Add another special & dedup disk in RAID0 with the existing
	# special mirror
	log_must zpool add $TESTPOOL special $CLASS_DISK3
	log_must zpool add $TESTPOOL dedup $CLASS_DISK4
	write_some_files
	verify_all_directories
	# Destroy the alloc class disks' contents while the pool is exported,
	# then re-import and verify every file still reads back correctly.
	log_must zpool export $TESTPOOL
	alloc_class_disks="$(get_list_of_alloc_class_disks)"
	zero_alloc_class_disks $alloc_class_disks
	# -l loads the encryption keys so the encrypted datasets can be read.
	log_must zpool import -l -d $IMPORTDIR $TESTPOOL
	verify_all_directories
	log_must zpool destroy $TESTPOOL
}
# Run every config through all initial/new special_failsafe permutations.
# do_test only flips the property when it starts "on", because the property
# cannot be re-enabled once turned off.
echo "$configs" | while read config ; do
	for initial in "on" "off" ; do
		for new in "on" "off" ; do
			do_test "$config" $initial $new
		done
	done
done
cleanup
log_pass $claim

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
# DESCRIPTION:
# Verify 'zpool create' with different alloc class redundancy
# levels will correctly succeed or fail.
verify_runnable "global"

claim="zpool create with different special_failsafe and disk permutations work"
log_assert $claim
log_onexit cleanup

# These should always pass since they have same redundancy level
# (mirrored pool with mirrored special/dedup vdevs).
configs_pass="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1
mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup mirror $CLASS_DISK0 $CLASS_DISK1
mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3"

# These should always pass with special_failsafe enabled or when '-f' is passed.
# They should fail otherwise.  Each config has at least one single-disk
# (non-redundant) special or dedup vdev under a mirrored pool.
configs_pass_failsafe="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0
mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup $CLASS_DISK0
mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0 dedup $CLASS_DISK2
mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2"

log_must disk_setup
# Matching redundancy levels: pool creation must succeed under every
# combination of feature flag, property value, and -f.
echo "$configs_pass" | while read cfg ; do
	for opts in \
	    "-o feature@special_failsafe=disabled" \
	    "-o special_failsafe=on" \
	    "-f -o feature@special_failsafe=disabled" \
	    "-f -o special_failsafe=on" \
	    "-o feature@special_failsafe=disabled -o special_failsafe=off" \
	    "-o feature@special_failsafe=enabled -o special_failsafe=on" ; do
		log_must zpool create $opts $TESTPOOL $cfg
		log_must zpool destroy $TESTPOOL
	done
done
# Lower alloc class redundancy: creation must fail unless the pool data
# is backed up via special_failsafe or the user forces it with -f.
echo "$configs_pass_failsafe" | while read cfg ; do
	# No backup, no force: rejected.
	log_mustnot zpool create -o feature@special_failsafe=disabled $TESTPOOL $cfg

	# Backed up: allowed.
	log_must zpool create -o special_failsafe=on $TESTPOOL $cfg
	log_must zpool destroy $TESTPOOL

	# Forced: allowed regardless of backup.
	log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $cfg
	log_must zpool destroy $TESTPOOL

	log_must zpool create -f -o special_failsafe=on $TESTPOOL $cfg
	log_must zpool destroy $TESTPOOL

	# Property explicitly off: rejected without -f, allowed with it.
	log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $cfg

	log_must zpool create -f -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $cfg
	log_must zpool destroy $TESTPOOL

	log_mustnot zpool create -o feature@special_failsafe=enabled -o special_failsafe=off $TESTPOOL $cfg
done

cleanup
log_pass $claim

View File

@ -0,0 +1,124 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Test multiple different special_failsafe permutations. After each step
# write a bunch of known files. Verify all files are present and correct
# after all the steps are complete.
verify_runnable "global"

claim="Files on special_failsafe enabled disks do not get corrupted"
log_assert $claim
log_onexit cleanup

# Try different pool configurations, covering mirror/raidz/stripe pools
# with single-disk and mirrored special/dedup vdevs (one config per line).
configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
$ZPOOL_DISKS special $CLASS_DISK0
$ZPOOL_DISKS dedup $CLASS_DISK0"
# For each config: create a special_failsafe pool, then repeatedly abuse
# its alloc class disks (zero, restore, zero again, replace, delete) and
# verify after every step that all previously written files survive.
echo "$configs" | while read config ; do
	log_must disk_setup
	log_must zpool create -o special_failsafe=on $TESTPOOL $config
	totalwritten=0
	special_failsafe_make_datasets
	write_some_files
	verify_all_directories

	alloc_class_disks="$(get_list_of_alloc_class_disks)"

	# Zero out the alloc class disks while the pool is exported.
	log_must zpool export $TESTPOOL
	backup_alloc_class_disks $alloc_class_disks
	zero_alloc_class_disks $alloc_class_disks
	log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL

	# Our pool is imported but has all its special devices zeroed out. Try
	# writing some files to it and export the pool
	write_some_files
	log_must zpool export $TESTPOOL
	log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
	write_some_files
	log_must zpool export $TESTPOOL
	log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
	write_some_files

	# Make our old disks appear again (which have older data). Do a zpool
	# clear to make them come back online and resilver.
	restore_alloc_class_disks $alloc_class_disks
	log_must zpool clear $TESTPOOL
	write_some_files

	# At this point the pool should be normal. The next test is to
	# corrupt the alloc class devices while the pool is running.
	zero_alloc_class_disks $alloc_class_disks

	# Trigger a scrub with our newly-zeroed alloc class disks
	log_must zpool scrub $TESTPOOL

	# The pool should be degraded, but still alive.
	# NOTE(review): this result is deliberately unchecked (no log_must);
	# the state change may race with scrub progress -- confirm intent.
	check_state $TESTPOOL "" "DEGRADED"
	write_some_files

	# Replace all the alloc class disks. This should get the pool
	# back to normal.
	for disk in $alloc_class_disks ; do
		log_must zpool replace $TESTPOOL $disk
	done
	write_some_files
	log_must zpool export $TESTPOOL

	# Backup special disks, then totally remove them.
	backup_alloc_class_disks $alloc_class_disks
	rm -f $alloc_class_disks

	# Try to import with the alloc class disks missing - it should work.
	log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL

	# After all the pain we've put our pool through, it should still have all
	# the correct file data (seven write_some_files rounds per iteration).
	log_must verify_all_directories
	if [[ "$totalwritten" != "840" ]] ; then
		log_fail "Didn't see 840 files, saw $totalwritten"
	fi

	# We've checked all the files. Do some more verifications.
	verify_pool $TESTPOOL
	verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR

	# Record a few stats that show metadata are in use (informational only,
	# results intentionally unchecked).
	zpool get dedup $TESTPOOL
	zdb -bb $TESTPOOL 2>&1 | grep -Ei 'normal|special|dedup|ddt'

	log_must zpool destroy $TESTPOOL
	cleanup
done
log_pass $claim

View File

@ -0,0 +1,93 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Verify we can import a special_failsafe pool even if all its alloc class
# devices are missing.
#
verify_runnable "global"

claim="Verify imports work on special_failsafe pools when vdevs missing"
log_assert $claim
log_onexit cleanup

# Use only two of the pool disks so a spare remains available.
TWO_ZPOOL_DISKS="$ZPOOL_DISK0 $ZPOOL_DISK1"
# NOTE(review): REPLACE_DISK is not referenced in this script; presumably
# consumed by the shared kshlib helpers -- confirm before removing.
REPLACE_DISK="$ZPOOL_DISK2"

# Try a bunch of different pool configurations (one per line).
configs="$TWO_ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
raidz $TWO_ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
$TWO_ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
$TWO_ZPOOL_DISKS special $CLASS_DISK0
$TWO_ZPOOL_DISKS dedup $CLASS_DISK0"
# Create a pool, destroy or corrupt its alloc class disks while exported,
# and verify that import succeeds iff the pool was special_failsafe
# backed up.
#
# $1: pool configuration
# $2: how to break the alloc class disks: "remove" deletes the backing
#     files, anything else zeroes them in place
# $3: special_failsafe property value at create time ("on" or "off")
function do_test {
	typeset config="$1"
	typeset action="$2"
	typeset onoff="$3"

	totalwritten=0
	log_must disk_setup
	log_must zpool create -o special_failsafe=$onoff $TESTPOOL $config

	alloc_class_disks="$(get_list_of_alloc_class_disks)"

	special_failsafe_make_datasets
	write_some_files
	verify_all_directories

	log_must zpool export $TESTPOOL

	# Backup alloc class disks before removing/zeroing them so the
	# "off" case can restore them later.
	backup_alloc_class_disks $alloc_class_disks
	if [ "$action" == "remove" ] ; then
		rm -f $alloc_class_disks
	else
		zero_alloc_class_disks $alloc_class_disks
	fi

	# Import should succeed or fail depending on how we're backed up.
	if [ "$onoff" == "on" ] ; then
		log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
	else
		log_mustnot zpool import -l -d "$IMPORTDIR" $TESTPOOL

		# With the disks restored, we should be able to import.
		restore_alloc_class_disks $alloc_class_disks
		log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
	fi
	write_some_files

	# Do a scrub and verify everything is correct.
	verify_pool $TESTPOOL
	verify_all_directories

	# Fix: check the destroy like every other step (it was unchecked);
	# a failed destroy would make the next iteration's disk_setup and
	# create fail for a confusing reason.
	log_must zpool destroy $TESTPOOL
	cleanup
}
# Run every layout through both failure modes (backing files deleted,
# backing files zeroed) with the property off and on.
echo "$configs" | while read cfg ; do
	for failure_mode in "remove" "zero" ; do
		for prop_val in "off" "on" ; do
			do_test "$cfg" "$failure_mode" "$prop_val"
		done
	done
done
log_pass $claim

View File

@ -0,0 +1,124 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Verify we can offline special_failsafe alloc class disks.
# Verify we cannot offline non-special_failsafe alloc class disks.
#
verify_runnable "global"

claim="Verify correct behavior when we force fault an alloc class disk"
log_assert $claim
log_onexit cleanup

# Try a bunch of different pool configurations (one per line).  The
# offline logic below relies on all mirrors here being two-way.
configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
$ZPOOL_DISKS special $CLASS_DISK0
$ZPOOL_DISKS dedup $CLASS_DISK0"
# Create a pool and try to offline each alloc class disk in turn.
# With special_failsafe on every offline must succeed; without it an
# offline may only succeed where pool redundancy still covers the data.
#
# $1: either "-o special_failsafe=on" or "" (empty string)
# $2: pool configuration
function do_test {
	prop="$1"
	config="$2"
	log_must disk_setup
	log_must zpool create -f $prop $TESTPOOL $config
	check_pool_alloc_class_props
	special_failsafe_make_datasets
	totalwritten=0
	write_some_files
	alloc_class_disks=$(get_list_of_alloc_class_disks)
	alloc_class_disks_arr=($alloc_class_disks)

	# The feature flag should be active only when the property was on.
	if [ "$prop" == "-o special_failsafe=on" ] ; then
		log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "active" ]
	else
		log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "enabled" ]
	fi

	# Walk the alloc class disks in order; the index matters because the
	# "previous disk" check below pairs up two-way mirror members.
	for ((i = 0; i < ${#alloc_class_disks_arr[@]}; i++)); do
		disk="${alloc_class_disks_arr[$i]}"
		if [ "$prop" == "-o special_failsafe=on" ] ; then
			# Everything is backed-up.  We should be able to
			# offline all the disks.
			log_must zpool offline $TESTPOOL $disk
			log_must check_state $TESTPOOL "$disk" "OFFLINE"
			log_must check_state $TESTPOOL "" "DEGRADED"
		else
			PARENT=$(get_vdev_prop parent $TESTPOOL $disk)
			if [ "$PARENT" == "$TESTPOOL" ] ; then
				# Leaf is TLD, offline should fail
				log_mustnot zpool offline $TESTPOOL $disk
				log_must check_state $TESTPOOL "$disk" "ONLINE"
				log_must check_state $TESTPOOL "" "ONLINE"
			else
				# We're part of a mirror.  We know all
				# mirrors in our test pool are two disk
				# so we should be able to offline the
				# first disk, but not the second.
				if [ "$i" == "0" ] ; then
					# First alloc class disk - pretend
					# "previous" disk was online to
					# make things easy.
					prev_online=1
				else
					if check_state $TESTPOOL "${alloc_class_disks_arr[$i - 1]}" "ONLINE" ; then
						prev_online=1
					else
						prev_online=0
					fi
				fi
				if [ "$prev_online" == "1" ] ; then
					# First disk in mirror, can offline
					log_must zpool offline $TESTPOOL $disk
					log_must check_state $TESTPOOL "$disk" "OFFLINE"
					log_must check_state $TESTPOOL "" "DEGRADED"
				else
					# Second disk in mirror, can't offline
					# but we should still be in a pool
					# degraded state from the first disk
					# going offline.
					log_mustnot zpool offline $TESTPOOL $disk
					log_must check_state $TESTPOOL "$disk" "ONLINE"
					log_must check_state $TESTPOOL "" "DEGRADED"
				fi
			fi
		fi
	done
	write_some_files
	verify_all_directories

	# We've checked all the files. Do some more verifications.
	verify_pool $TESTPOOL
	verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR

	# NOTE(review): clear/destroy results are unchecked here, unlike the
	# rest of the function -- consider log_must if failures should abort.
	zpool clear $TESTPOOL
	zpool destroy $TESTPOOL
	cleanup
}
# Run each layout with special_failsafe enabled first, then with no
# property option at all.
for opt in "-o special_failsafe=on" "" ; do
	echo "$configs" | while read layout ; do
		do_test "$opt" "$layout"
	done
done
log_pass $claim

View File

@ -0,0 +1,133 @@
#!/bin/ksh -p
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Verify that special_failsafe prop does not work if
# SPA_FEATURE_SPECIAL_FAILSAFE is disabled. Also, test upgrades.
verify_runnable "global"

claim="special_failsafe prop shouldn't work without SPA_FEATURE_SPECIAL_FAILSAFE"
log_assert $claim
log_onexit cleanup

# Try a bunch of different pool configurations (one per line).
configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
$ZPOOL_DISKS special $CLASS_DISK0
$ZPOOL_DISKS dedup $CLASS_DISK0"

# Make the pool disks smaller to make them quicker to back up. We don't use
# much data on them.
export ZPOOL_DEVSIZE=200M
export CLASS_DEVSIZE=200M

log_must disk_setup
# The property must be rejected outright while the feature flag is
# disabled, and honored under any permutation that leaves it enabled.
echo "$configs" | while read cfg ; do
	log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=on $TESTPOOL $cfg

	# Property off: the feature stays merely enabled.
	log_must zpool create -o special_failsafe=off $TESTPOOL $cfg
	boilerplate_check "enabled" "off"
	log_must zpool destroy $TESTPOOL

	# Property on: the feature becomes active.
	log_must zpool create -o special_failsafe=on $TESTPOOL $cfg
	boilerplate_check "active" "on"
	log_must zpool destroy $TESTPOOL

	# Explicitly enabling the feature alongside the property also works.
	log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $cfg
	boilerplate_check "active" "on"
	log_must zpool destroy $TESTPOOL
done
# Now let's do a multi-step test where we upgrade an older pool (created
# with the feature disabled) and then progressively enable the feature,
# enable the property, and add alloc class vdevs.
for cmd in "zpool set feature@special_failsafe=enabled $TESTPOOL" "zpool upgrade $TESTPOOL" ; do
	# Make a pool with no special devices
	log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL mirror $ZPOOL_DISKS
	totalwritten=0
	boilerplate_check "disabled" "off"
	special_failsafe_make_datasets
	write_some_files

	# Test enabling the feature in two different ways:
	#
	# zpool set feature@special_failsafe=enabled ...
	# zpool upgrade ...
	#
	log_must eval "$cmd"
	boilerplate_check "enabled" "off"
	write_some_files

	# Shouldn't be able to add with special_failsafe prop off
	log_mustnot zpool add $TESTPOOL special $CLASS_DISK0

	log_must zpool set special_failsafe=on $TESTPOOL
	boilerplate_check "enabled" "on"
	write_some_files

	# With the prop on, adding an alloc class vdev activates the feature.
	log_must zpool add $TESTPOOL special $CLASS_DISK0
	boilerplate_check "active" "on"
	write_some_files

	# Fix: check the dedup add like the special add above; an unchecked
	# failure here would silently leave the dedup vdev untested.
	log_must zpool add $TESTPOOL dedup $CLASS_DISK1
	write_some_files

	log_must zpool export $TESTPOOL
	log_must zpool import -l -d $IMPORTDIR $TESTPOOL
	verify_all_directories

	# You should be able to turn special_failsafe off if it was on
	log_must zpool set special_failsafe=off $TESTPOOL
	boilerplate_check "active" "off"

	# If special_failsafe prop was on and the feature active, and then you
	# turned the prop off, you cannot turn it back on again.
	log_mustnot zpool set special_failsafe=on $TESTPOOL

	log_must zpool destroy $TESTPOOL
done
# The special_failsafe property must survive an export/import cycle,
# both when it was turned on at create time and when it was left at the
# default (off).
log_must zpool create -o special_failsafe=on $TESTPOOL $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1
log_must zpool export $TESTPOOL
log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
log_must [ "$(get_pool_prop special_failsafe $TESTPOOL)" == "on" ]
log_must zpool destroy $TESTPOOL

log_must zpool create $TESTPOOL $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1
log_must zpool export $TESTPOOL
log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
log_must [ "$(get_pool_prop special_failsafe $TESTPOOL)" == "off" ]
log_must zpool destroy $TESTPOOL

cleanup
log_pass $claim

View File

@ -0,0 +1,106 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Destroy alloc class disks and then do a scrub on both a
# special_failsafe and non-special_failsafe pool. The special_failsafe
# pool should only be DEGRADED, while the non-special_failsafe pool should
# be SUSPENDED.
verify_runnable "global"

claim="special_failsafe pools survive a normally fatal scrub with bad disks"
log_assert $claim
log_onexit cleanup

# Try different pool configurations (one per line).
configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
$ZPOOL_DISKS special $CLASS_DISK0
$ZPOOL_DISKS dedup $CLASS_DISK0"
# Create a pool per the given config with special_failsafe set to $onoff,
# zero its alloc class disks, then scrub.  A backed-up pool must merely
# degrade; a non-backed-up pool must suspend.
#
# $1: pool configuration
# $2: failure action (driver always passes "zero")
# $3: special_failsafe property value ("on" or "off")
function do_test {
	typeset config="$1"
	typeset action="$2"	# NOTE(review): not referenced in this function
	typeset onoff="$3"
	totalwritten=0
	log_must disk_setup
	log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=$onoff $TESTPOOL $config
	special_failsafe_make_datasets
	totalwritten=0	# NOTE(review): redundant re-initialization
	write_some_files

	# When we do a scrub later, we will either want it to suspend or not
	# suspend the pool, depending on our backup settings. Make sure we are
	# able to ride through the suspended pool so we can continue with our
	# tests.
	log_must zpool set failmode=continue $TESTPOOL

	alloc_class_disks="$(get_list_of_alloc_class_disks)"
	backup_alloc_class_disks $alloc_class_disks
	zero_alloc_class_disks $alloc_class_disks

	# Spawn scrub into the background since the pool may be suspended and
	# it will hang. We need to continue past the hung scrub so we
	# can restore the bad disks and do a 'zpool clear' to remove the
	# suspended pool.
	zpool scrub $TESTPOOL &
	wait_scrubbed $TESTPOOL 3

	if [ "$onoff" == "on" ] ; then
		# Backed up: the pool survives DEGRADED and stays usable.
		log_must check_state $TESTPOOL "" "DEGRADED"
		verify_pool $TESTPOOL
		write_some_files
		verify_all_directories
	else
		log_must check_state $TESTPOOL "" "SUSPENDED"

		# Pool should be suspended. Restore the old disks so we can
		# clear the suspension. 'zpool clear' here will delete the
		# pool.
		restore_alloc_class_disks $alloc_class_disks
		log_must zpool clear $TESTPOOL
	fi
	cleanup
}
# Stop and reset zed in case a previous aborted run left it active
# (results deliberately unchecked -- it may not be running).
zed_stop
zed_cleanup

log_must zed_setup
log_must zed_start
log_must zed_events_drain

# Scrub outcome must differ only with the special_failsafe setting.
echo "$configs" | while read cfg ; do
	for onoff in "on" "off" ; do
		do_test "$cfg" "zero" "$onoff"
	done
done

log_must zed_stop
log_must zed_cleanup
log_pass $claim

View File

@ -0,0 +1,94 @@
#!/bin/ksh -p
# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
# Refer to the OpenZFS git commit log for authoritative copyright attribution.
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License Version 1.0 (CDDL-1.0).
# You can obtain a copy of the license from the top-level file
# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
# You may not use this file except in compliance with the license.
#
# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
#
# DESCRIPTION:
# Verify we can split a pool with special_failsafe, and the new pool
# keeps the special_failsafe settings. Also verify the new pool has
# all the data if the pool has special_failsafe.
#
verify_runnable "global"

claim="zpool split works with special_failsafe"
log_assert $claim
log_onexit cleanup

# Create a normal, special_failsafe pool: mirrored data disks with
# mirrored special and dedup vdevs, so a split can peel one side off.
log_must disk_setup
log_must zpool create -o special_failsafe=on $TESTPOOL mirror \
	$ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \
	mirror $CLASS_DISK2 $CLASS_DISK3

totalwritten=0
special_failsafe_make_datasets
write_some_files
verify_all_directories
# Split the pool and verify the old pool has all the data
newpool="${TESTPOOL}-2"
log_must zpool split $TESTPOOL $newpool
check_pool_alloc_class_props
verify_all_directories

# Forcefault alloc class devices on the old pool and verify we have all the
# data.  (The split left one disk of each original mirror in $TESTPOOL.)
log_must zpool offline -f $TESTPOOL $CLASS_DISK0
log_must zpool offline -f $TESTPOOL $CLASS_DISK2
log_must check_state $TESTPOOL $CLASS_DISK0 "FAULTED"
log_must check_state $TESTPOOL $CLASS_DISK2 "FAULTED"
log_must check_state $TESTPOOL "" "DEGRADED"
verify_all_directories
log_must zpool clear $TESTPOOL

# All done with the old pool
log_must zpool destroy $TESTPOOL

# Import the new split pool and rename it $TESTPOOL since all our verification
# functions expect the pool to be called $TESTPOOL.
log_must zpool import -l -f -d $IMPORTDIR $newpool $TESTPOOL
check_pool_alloc_class_props
verify_all_directories

# Zero the alloc class devices the split pool received (the second disk
# of each original mirror) and verify we still have all the data.
log_must zpool export $TESTPOOL
zero_file $CLASS_DISK1
zero_file $CLASS_DISK3
log_must zpool import -l -f -d $IMPORTDIR $TESTPOOL
verify_all_directories
log_must zpool destroy $TESTPOOL

# Create a non-special_failsafe pool, split it, and verify the split pool is
# also not special_failsafe.
log_must zpool create -o special_failsafe=off $TESTPOOL mirror \
	$ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \
	mirror $CLASS_DISK2 $CLASS_DISK3
log_must zpool split $TESTPOOL $newpool
check_pool_alloc_class_props
log_must zpool destroy $TESTPOOL
log_must zpool import -l -f -d $IMPORTDIR $newpool $TESTPOOL
check_pool_alloc_class_props
log_must zpool destroy $TESTPOOL
log_pass $claim