diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 453fb2131f..d445893033 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1820,6 +1820,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) printf(gettext("currently in use")); break; + case VDEV_AUX_CHILDREN_OFFLINE: + (void) printf(gettext("all children offline")); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -1919,6 +1923,10 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, (void) printf(gettext("currently in use")); break; + case VDEV_AUX_CHILDREN_OFFLINE: + (void) printf(gettext("all children offline")); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -2752,6 +2760,7 @@ zpool_do_import(int argc, char **argv) idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; + idata.policy = policy; pools = zpool_search_import(g_zfs, &idata); diff --git a/include/libzfs.h b/include/libzfs.h index cbaaa13a21..45eb5c9047 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -413,6 +413,7 @@ typedef struct importargs { int unique : 1; /* does 'poolname' already exist? */ int exists : 1; /* set on return if pool already exists */ int scan : 1; /* prefer scanning to libblkid cache */ + nvlist_t *policy; /* rewind policy (rewind txg, etc.) 
*/ } importargs_t; extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index de3b729ebe..fa4eb27217 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -704,6 +704,7 @@ typedef struct zpool_rewind_policy { #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" +#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ @@ -811,6 +812,7 @@ typedef enum vdev_aux { VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ + VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ } vdev_aux_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index 1172468ad9..8a3938e865 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -410,6 +410,7 @@ typedef enum bp_embedded_type { #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it has either 1) never been written to, or @@ -1015,11 +1016,16 @@ extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); +extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, + const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t 
spa_get_last_removal_txg(spa_t *spa); +extern boolean_t spa_trust_config(spa_t *spa); +extern uint64_t spa_missing_tvds_allowed(spa_t *spa); +extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_multihost(spa_t *spa); extern unsigned long spa_get_hostid(void); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 90d9292688..cd214c29c0 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -184,6 +184,15 @@ typedef enum spa_all_vdev_zap_action { AVZ_ACTION_INITIALIZE } spa_avz_action_t; +typedef enum spa_config_source { + SPA_CONFIG_SRC_NONE = 0, + SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ + SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ + SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ + SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ + SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ +} spa_config_source_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -202,6 +211,8 @@ struct spa { uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ + boolean_t spa_trust_config; /* do we trust vdev tree? */ + spa_config_source_t spa_config_source; /* where config comes from? */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; @@ -263,6 +274,8 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ + uint64_t spa_missing_tvds; /* unopenable tvds on load */ + uint64_t spa_missing_tvds_allowed; /* allow loading spa? 
*/ spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; diff --git a/include/sys/vdev.h b/include/sys/vdev.h index a9b99331b3..161e30ae7f 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -48,9 +48,12 @@ typedef enum vdev_dtl_type { extern int zfs_nocacheflush; extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); +extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); -extern int vdev_validate(vdev_t *, boolean_t); +extern int vdev_validate(vdev_t *); +extern int vdev_copy_path_strict(vdev_t *, vdev_t *); +extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_reopen(vdev_t *); @@ -100,6 +103,7 @@ extern void vdev_scan_stat_init(vdev_t *vd); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); +extern boolean_t vdev_children_are_offline(vdev_t *vd); extern void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); @@ -145,7 +149,8 @@ typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, VDEV_CONFIG_L2CACHE = 1 << 1, VDEV_CONFIG_REMOVING = 1 << 2, - VDEV_CONFIG_MOS = 1 << 3 + VDEV_CONFIG_MOS = 1 << 3, + VDEV_CONFIG_MISSING = 1 << 4 } vdev_config_flag_t; extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index b933f9ab8d..e289946135 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -437,7 +437,6 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ -extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); diff --git a/include/sys/zfs_context.h 
b/include/sys/zfs_context.h index 2e311cffd9..37bdc533c4 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -674,6 +674,7 @@ typedef struct callb_cpr { #define zone_dataset_visible(x, y) (1) #define INGLOBALZONE(z) (1) +extern uint32_t zone_get_hostid(void *zonep); extern char *kmem_vasprintf(const char *fmt, va_list adx); extern char *kmem_asprintf(const char *fmt, ...); diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index cc9a52a3ea..68b5988cdf 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -897,7 +897,8 @@ vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) * return to the user. */ static nvlist_t * -get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) +get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, + nvlist_t *policy) { pool_entry_t *pe; vdev_entry_t *ve; @@ -1230,6 +1231,12 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) continue; } + if (policy != NULL) { + if (nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, + policy) != 0) + goto nomem; + } + if ((nvl = refresh_config(hdl, config)) == NULL) { nvlist_free(config); config = NULL; @@ -2080,7 +2087,7 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) free(cache); pthread_mutex_destroy(&lock); - ret = get_configs(hdl, &pools, iarg->can_be_active); + ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); for (pe = pools.pools; pe != NULL; pe = penext) { penext = pe->pe_next; @@ -2209,6 +2216,14 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile, if (active) continue; + if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, + cachefile) != 0) { + (void) no_memory(hdl); + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + if ((dst = refresh_config(hdl, src)) == NULL) { nvlist_free(raw); nvlist_free(pools); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 2d94cd3204..d082a5f66b 100644 --- 
a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1935,8 +1935,9 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { (void) printf(dgettext(TEXT_DOMAIN, - "The devices below are missing, use " - "'-m' to import the pool anyway:\n")); + "The devices below are missing or " + "corrupted, use '-m' to import the pool " + "anyway:\n")); print_vdev_tree(hdl, NULL, missing, 2); (void) printf("\n"); } diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 09e69ef6d2..f3e84975c6 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -297,6 +297,16 @@ rw_tryenter(krwlock_t *rwlp, krw_t rw) return (0); } +/* ARGSUSED */ +uint32_t +zone_get_hostid(void *zonep) +{ + /* + * We're emulating the system's hostid in userland. + */ + return (strtoul(hw_serial, NULL, 10)); +} + int rw_tryupgrade(krwlock_t *rwlp) { diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 822146a7aa..886dffce8e 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -351,6 +351,18 @@ they operate close to quota or capacity limits. Default value: \fB24\fR. .RE +.sp +.ne 2 +.na +\fBspa_load_print_vdev_tree\fR (int) +.ad +.RS 12n +Whether to print the vdev tree in the debugging message buffer during pool import. +Use 0 to disable and 1 to enable. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na @@ -701,6 +713,18 @@ the code that may use them. A value of \fB0\fR will default to 6000 ms. Default value: \fB0\fR. .RE +.sp +.ne 2 +.na +\fBzfs_max_missing_tvds\fR (int) +.ad +.RS 12n +Number of missing top-level vdevs which will be allowed during +pool import (only in read-only mode). 
+.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 80f0c6f368..3177f9649c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -157,9 +157,8 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); -static inline int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, - char **ereport); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, + boolean_t reloading); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ @@ -181,6 +180,54 @@ boolean_t spa_load_verify_dryrun = B_FALSE; */ #define TRYIMPORT_NAME "$import" +/* + * For debugging purposes: print out vdev tree during pool import. + */ +int spa_load_print_vdev_tree = B_FALSE; + +/* + * A non-zero value for zfs_max_missing_tvds means that we allow importing + * pools with missing top-level vdevs. This is strictly intended for advanced + * pool recovery cases since missing data is almost inevitable. Pools with + * missing devices can only be imported read-only for safety reasons, and their + * fail-mode will be automatically set to "continue". + * + * With 1 missing vdev we should be able to import the pool and mount all + * datasets. User data that was not modified after the missing device has been + * added should be recoverable. This means that snapshots created prior to the + * addition of that device should be completely intact. + * + * With 2 missing vdevs, some datasets may fail to mount since there are + * dataset statistics that are stored as regular metadata. Some data might be + * recoverable if those vdevs were added recently. 
+ * + * With 3 or more missing vdevs, the pool is severely damaged and MOS entries + * may be missing entirely. Chances of data recovery are very low. Note that + * there are also risks of performing an inadvertent rewind as we might be + * missing all the vdevs with the latest uberblocks. + */ +unsigned long zfs_max_missing_tvds = 0; + +/* + * The parameters below are similar to zfs_max_missing_tvds but are only + * intended for a preliminary open of the pool with an untrusted config which + * might be incomplete or out-dated. + * + * We are more tolerant for pools opened from a cachefile since we could have + * an out-dated cachefile where a device removal was not registered. + * We could have set the limit arbitrarily high but in the case where devices + * are really missing we would want to return the proper error codes; we chose + * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available + * and we get a chance to retrieve the trusted config. + */ +uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; +/* + * In the case where config was assembled by scanning device paths (/dev/dsk + * by default) we are less tolerant since all the existing devices should have + * been detected and we want spa_load to return the right error codes. + */ +uint64_t zfs_max_missing_tvds_scan = 0; + /* * ========================================================================== * SPA properties routines @@ -1756,6 +1803,27 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) return (error); } +/* + * Concrete top-level vdevs that are not missing and are not logs. At every + * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 
+ */ +static uint64_t +spa_healthy_core_tvds(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t tvds = 0; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + if (vd->vdev_islog) + continue; + if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) + tvds++; + } + + return (tvds); +} + /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. @@ -1763,7 +1831,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - for (int c = 0; c < vd->vdev_children; c++) + for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && @@ -1773,38 +1841,14 @@ spa_check_removed(vdev_t *vd) } } -static void -spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) +static int +spa_check_for_missing_logs(spa_t *spa) { - ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); - - vd->vdev_top_zap = mvd->vdev_top_zap; - vd->vdev_leaf_zap = mvd->vdev_leaf_zap; - - for (uint64_t i = 0; i < vd->vdev_children; i++) { - spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); - } -} - -/* - * Validate the current config against the MOS config - */ -static boolean_t -spa_config_valid(spa_t *spa, nvlist_t *config) -{ - vdev_t *mrvd, *rvd = spa->spa_root_vdev; - nvlist_t *nv; - - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); - - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - - ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); + vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional - * diagnostic information about missing devices in this config. + * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. 
*/ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { @@ -1815,109 +1859,52 @@ spa_config_valid(spa_t *spa, nvlist_t *config) KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); - for (int c = 0; c < rvd->vdev_children; c++) { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - vdev_t *mtvd = mrvd->vdev_child[c]; - if (tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops && - mtvd->vdev_islog) - child[idx++] = vdev_config_generate(spa, mtvd, - B_FALSE, 0); + /* + * We consider a device as missing only if it failed + * to open (i.e. offline or faulted is not considered + * as missing). + */ + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + child[idx++] = vdev_config_generate(spa, tvd, + B_FALSE, VDEV_CONFIG_MISSING); + } } - if (idx) { - VERIFY(nvlist_add_nvlist_array(nv, - ZPOOL_CONFIG_CHILDREN, child, idx) == 0); - VERIFY(nvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + if (idx > 0) { + fnvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv); - for (int i = 0; i < idx; i++) + for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); - } - /* - * Compare the root vdev tree with the information we have - * from the MOS config (mrvd). Check each top-level vdev - * with the corresponding MOS config top-level (mtvd). - */ - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - vdev_t *mtvd = mrvd->vdev_child[c]; - - /* - * Resolve any "missing" vdevs in the current configuration. - * Also trust the MOS config about any "indirect" vdevs. - * If we find that the MOS config has more accurate information - * about the top-level vdev then use that vdev instead. 
- */ - if ((tvd->vdev_ops == &vdev_missing_ops && - mtvd->vdev_ops != &vdev_missing_ops) || - (mtvd->vdev_ops == &vdev_indirect_ops && - tvd->vdev_ops != &vdev_indirect_ops)) { - - /* - * Device specific actions. - */ - if (mtvd->vdev_islog) { - if (!(spa->spa_import_flags & - ZFS_IMPORT_MISSING_LOG)) { - continue; - } - - spa_set_log_state(spa, SPA_LOG_CLEAR); - } else if (mtvd->vdev_ops != &vdev_indirect_ops) { - continue; - } - - /* - * Swap the missing vdev with the data we were - * able to obtain from the MOS config. - */ - vdev_remove_child(rvd, tvd); - vdev_remove_child(mrvd, mtvd); - - vdev_add_child(rvd, mtvd); - vdev_add_child(mrvd, tvd); - - vdev_reopen(rvd); - } else { - if (mtvd->vdev_islog) { - /* - * Load the slog device's state from the MOS - * config since it's possible that the label - * does not contain the most up-to-date - * information. - */ - vdev_load_log_state(tvd, mtvd); - vdev_reopen(tvd); - } - - /* - * Per-vdev ZAP info is stored exclusively in the MOS. - */ - spa_config_valid_zaps(tvd, mtvd); + if (idx > 0) { + spa_load_failed(spa, "some log devices are missing"); + return (SET_ERROR(ENXIO)); } + } else { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; - /* - * Never trust this info from userland; always use what's - * in the MOS. This prevents it from getting out of sync - * with the rest of the info in the MOS. - */ - tvd->vdev_removing = mtvd->vdev_removing; - tvd->vdev_indirect_config = mtvd->vdev_indirect_config; + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + spa_load_note(spa, "some log devices are " + "missing, ZIL is dropped."); + break; + } + } } - vdev_free(mrvd); - spa_config_exit(spa, SCL_ALL, FTAG); - - /* - * Ensure we were able to validate the config. 
- */ - return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); + return (0); } /* @@ -2311,53 +2298,15 @@ spa_try_repair(spa_t *spa, nvlist_t *config) } static int -spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, - boolean_t trust_config) +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { - nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; - char *comment; int error; - uint64_t pool_guid; - nvlist_t *nvl; - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) - return (SET_ERROR(EINVAL)); + spa->spa_load_state = state; - ASSERT(spa->spa_comment == NULL); - if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) - spa->spa_comment = spa_strdup(comment); - - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &spa->spa_ubsync.ub_version) != 0) - spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; - - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); - - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = SET_ERROR(EEXIST); - } else { - spa->spa_config_guid = pool_guid; - - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, - &nvl) == 0) { - VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, - KM_SLEEP) == 0); - } - - nvlist_free(spa->spa_load_info); - spa->spa_load_info = fnvlist_alloc(); - - gethrestime(&spa->spa_loaded_ts); - error = spa_load_impl(spa, pool_guid, config, state, type, - trust_config, &ereport); - } + gethrestime(&spa->spa_loaded_ts); + error = spa_load_impl(spa, type, &ereport, B_FALSE); /* * Don't count references from objsets that are already closed @@ -2611,13 +2560,80 @@ out: } static int -spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - spa_import_type_t type) +spa_verify_host(spa_t *spa, 
nvlist_t *mos_config) +{ + uint64_t hostid; + char *hostname; + uint64_t myhostid = 0; + + if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + hostname = fnvlist_lookup_string(mos_config, + ZPOOL_CONFIG_HOSTNAME); + + myhostid = zone_get_hostid(NULL); + + if (hostid != 0 && myhostid != 0 && hostid != myhostid) { + cmn_err(CE_WARN, "pool '%s' could not be " + "loaded as it was last accessed by " + "another system (host: %s hostid: 0x%llx). " + "See: http://illumos.org/msg/ZFS-8000-EY", + spa_name(spa), hostname, (u_longlong_t)hostid); + spa_load_failed(spa, "hostid verification failed: pool " + "last accessed by host: %s (hostid: 0x%llx)", + hostname, (u_longlong_t)hostid); + return (SET_ERROR(EBADF)); + } + } + + return (0); +} + +static int +spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; - nvlist_t *nvtree = NULL; + nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; + uint64_t pool_guid; + char *comment; + + /* + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. 
+ */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_POOL_GUID); + return (SET_ERROR(EINVAL)); + } + + if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == + SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { + spa_load_failed(spa, "a pool with guid %llu is already open", + (u_longlong_t)pool_guid); + return (SET_ERROR(EEXIST)); + } + + spa->spa_config_guid = pool_guid; + + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + + ASSERT(spa->spa_comment == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + spa->spa_comment = spa_strdup(comment); + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) + spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", @@ -2625,9 +2641,6 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (SET_ERROR(EINVAL)); } - parse = (type == SPA_IMPORT_EXISTING ? - VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); - /* * Create "The Godfather" zio to hold all async IOs */ @@ -2645,6 +2658,8 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + parse = (type == SPA_IMPORT_EXISTING ? 
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); @@ -2665,71 +2680,105 @@ spa_ld_parse_config(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (0); } +/* + * Recursively open all vdevs in the vdev tree. This function is called twice: + * first with the untrusted config, then with the trusted config. + */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; + /* + * spa_missing_tvds_allowed defines how many top-level vdevs can be + * missing/unopenable for the root vdev to be still considered openable. + */ + if (spa->spa_trust_config) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; + } else { + spa->spa_missing_tvds_allowed = 0; + } + + spa->spa_missing_tvds_allowed = + MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); + + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "vdev tree has %lld missing top-level " + "vdevs.", (u_longlong_t)spa->spa_missing_tvds); + if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) { + /* + * Although theoretically we could allow users to open + * incomplete pools in RW mode, we'd need to add a lot + * of extra logic (e.g. adjust pool space to account + * for missing vdevs). + * This limitation also prevents users from accidentally + * opening the pool in RW mode during data recovery and + * damaging it further. 
+ */ + spa_load_note(spa, "pools with missing top-level " + "vdevs can only be opened in read-only mode."); + error = SET_ERROR(ENXIO); + } else { + spa_load_note(spa, "current settings allow for maximum " + "%lld missing top-level vdevs at this stage.", + (u_longlong_t)spa->spa_missing_tvds_allowed); + } + } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } + if (spa->spa_missing_tvds != 0 || error != 0) + vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } +/* + * We need to validate the vdev labels against the configuration that + * we have in hand. This function is called twice: first with an untrusted + * config, then with a trusted config. The validation is more strict when the + * config is trusted. + */ static int -spa_ld_validate_vdevs(spa_t *spa, spa_import_type_t type, - boolean_t trust_config) +spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; - /* - * We need to validate the vdev labels against the configuration that - * we have in hand, which is dependent on the setting of trust_config. - * If trust_config is true then we're validating the vdev labels based - * on that config. Otherwise, we're validating against the cached - * config (zpool.cache) that was read when we loaded the zfs module, and - * then later we will recursively call spa_load() and validate against - * the vdev config. - * - * If we're assembling a new pool that's been split off from an - * existing pool, the labels haven't yet been updated so we skip - * validation for now. 
- */ - if (type != SPA_IMPORT_ASSEMBLE) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd, trust_config); - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) { - spa_load_failed(spa, "vdev_validate failed [error=%d]", - error); - return (error); - } + if (error != 0) { + spa_load_failed(spa, "vdev_validate failed [error=%d]", error); + return (error); + } - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - spa_load_failed(spa, "cannot open vdev tree after " - "invalidating some vdevs"); - return (SET_ERROR(ENXIO)); - } + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + spa_load_failed(spa, "cannot open vdev tree after invalidating " + "some vdevs"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); } return (0); } static int -spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, - boolean_t trust_config) +spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; - uint64_t children; boolean_t activity_check = B_FALSE; /* @@ -2755,7 +2804,8 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. 
*/ - activity_check = spa_activity_check_required(spa, ub, label, config); + activity_check = spa_activity_check_required(spa, ub, label, + spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid() == 0) { @@ -2765,7 +2815,7 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, config); + int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); @@ -2851,26 +2901,9 @@ spa_ld_select_uberblock(spa_t *spa, nvlist_t *config, spa_import_type_t type, nvlist_free(unsup_feat); } - /* - * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. We first check to see if the pool - * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). - * If it is, defer the vdev_guid_sum check till later so we - * can handle missing vdevs. 
- */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - &children) != 0 && trust_config && type != SPA_IMPORT_ASSEMBLE && - rvd->vdev_guid_sum != ub->ub_guid_sum) { - spa_load_failed(spa, "guid sum in config doesn't match guid " - "sum in uberblock (%llu != %llu)", - (u_longlong_t)rvd->vdev_guid_sum, - (u_longlong_t)ub->ub_guid_sum); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); - } - if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_try_repair(spa, config); + spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; @@ -2909,49 +2942,167 @@ spa_ld_open_rootbp(spa_t *spa) } static int -spa_ld_validate_config(spa_t *spa, spa_import_type_t type) +spa_ld_load_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t reloading) { - vdev_t *rvd = spa->spa_root_vdev; + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv, *mos_config, *policy; + int error = 0, copy_error; + uint64_t healthy_tvds, healthy_tvds_mos; + uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* - * Validate the config, using the MOS config to fill in any - * information which might be missing. If we fail to validate - * the config then declare the pool unfit for use. If we're - * assembling a pool from a split, the log is not transferred - * over. + * If we're assembling a pool from a split, the config provided is + * already trusted so there is nothing to do. 
*/ - if (type != SPA_IMPORT_ASSEMBLE) { - nvlist_t *mos_config; - if (load_nvlist(spa, spa->spa_config_object, &mos_config) - != 0) { - spa_load_failed(spa, "unable to retrieve MOS config"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } + if (type == SPA_IMPORT_ASSEMBLE) + return (0); - if (!spa_config_valid(spa, mos_config)) { + healthy_tvds = spa_healthy_core_tvds(spa); + + if (load_nvlist(spa, spa->spa_config_object, &mos_config) + != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * If we are doing an open, pool owner wasn't verified yet, thus do + * the verification here. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + error = spa_verify_host(spa, mos_config); + if (error != 0) { nvlist_free(mos_config); - spa_load_failed(spa, "mismatch between config provided " - "and config stored in MOS"); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, - ENXIO)); + return (error); } - nvlist_free(mos_config); + } + nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Build a new vdev tree from the trusted config + */ + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + /* + * Vdev paths in the MOS may be obsolete. If the untrusted config was + * obtained by scanning /dev/dsk, then it will have the right vdev + * paths. We update the trusted MOS config with this information. + * We first try to copy the paths with vdev_copy_path_strict, which + * succeeds only when both configs have exactly the same vdev tree. + * If that fails, we fall back to a more flexible method that has a + * best effort policy. 
+ */ + copy_error = vdev_copy_path_strict(rvd, mrvd); + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "provided vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + spa_load_note(spa, "MOS vdev tree:"); + vdev_dbgmsg_print_tree(mrvd, 2); + } + if (copy_error != 0) { + spa_load_note(spa, "vdev_copy_path_strict failed, falling " + "back to vdev_copy_path_relaxed"); + vdev_copy_path_relaxed(rvd, mrvd); + } + + vdev_close(rvd); + vdev_free(rvd); + spa->spa_root_vdev = mrvd; + rvd = mrvd; + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * We will use spa_config if we decide to reload the spa or if spa_load + * fails and we rewind. We must thus regenerate the config using the + * MOS information with the updated paths. Rewind policy is an import + * setting and is not in the MOS. We copy it over to our new, trusted + * config. + */ + mos_config_txg = fnvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_POOL_TXG); + nvlist_free(mos_config); + mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); + if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY, + &policy) == 0) + fnvlist_add_nvlist(mos_config, ZPOOL_REWIND_POLICY, policy); + spa_config_set(spa, mos_config); + spa->spa_config_source = SPA_CONFIG_SRC_MOS; + + /* + * Now that we got the config from the MOS, we should be more strict + * in checking blkptrs and can make assumptions about the consistency + * of the vdev tree. spa_trust_config must be set to true before opening + * vdevs in order for them to be writeable. 
+ */ + spa->spa_trust_config = B_TRUE; + + /* + * Open and validate the new vdev tree + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "final vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + } + + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && + !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* - * Now that we've validated the config, check the state of the - * root vdev. If it can't be opened, it indicates one or - * more toplevel vdevs are faulted. + * Sanity check to make sure that we are indeed loading the + * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds + * in the config provided and they happened to be the only ones + * to have the latest uberblock, we could involuntarily perform + * an extreme rewind. */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - spa_load_failed(spa, "some top vdevs are unavailable"); - return (SET_ERROR(ENXIO)); + healthy_tvds_mos = spa_healthy_core_tvds(spa); + if (healthy_tvds_mos - healthy_tvds >= + SPA_SYNC_MIN_VDEVS) { + spa_load_note(spa, "config provided misses too many " + "top-level vdevs compared to MOS (%lld vs %lld). ", + (u_longlong_t)healthy_tvds, + (u_longlong_t)healthy_tvds_mos); + spa_load_note(spa, "vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + if (reloading) { + spa_load_failed(spa, "config was already " + "provided from MOS. 
Aborting."); + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + spa_load_note(spa, "spa must be reloaded using MOS " + "config"); + return (SET_ERROR(EAGAIN)); } } + error = spa_check_for_missing_logs(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { + spa_load_failed(spa, "uberblock guid sum doesn't match MOS " + "guid sum (%llu != %llu)", + (u_longlong_t)spa->spa_uberblock.ub_guid_sum, + (u_longlong_t)rvd->vdev_guid_sum); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } + return (0); } @@ -3117,47 +3268,6 @@ spa_ld_load_special_directories(spa_t *spa) return (0); } -static int -spa_ld_prepare_for_reload(spa_t *spa, int orig_mode) -{ - vdev_t *rvd = spa->spa_root_vdev; - - uint64_t hostid; - nvlist_t *policy = NULL; - nvlist_t *mos_config; - - if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { - spa_load_failed(spa, "unable to retrieve MOS config"); - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - } - - if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, - ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - char *hostname; - unsigned long myhostid = 0; - - VERIFY(nvlist_lookup_string(mos_config, - ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); - - myhostid = spa_get_hostid(); - if (hostid && myhostid && hostid != myhostid) { - nvlist_free(mos_config); - return (SET_ERROR(EBADF)); - } - } - if (nvlist_lookup_nvlist(spa->spa_config, - ZPOOL_REWIND_POLICY, &policy) == 0) - VERIFY(nvlist_add_nvlist(mos_config, - ZPOOL_REWIND_POLICY, policy) == 0); - - spa_config_set(spa, mos_config); - spa_unload(spa); - spa_deactivate(spa); - spa_activate(spa, orig_mode); - - return (0); -} - static int spa_ld_get_props(spa_t *spa) { @@ -3286,6 +3396,19 @@ spa_ld_get_props(spa_t *spa) spa->spa_autoreplace = (autoreplace != 0); } + /* + * If we are importing a pool with missing top-level vdevs, + * we enforce that the pool doesn't panic or get 
suspended on + * error since the likelihood of missing data is extremely high. + */ + if (spa->spa_missing_tvds > 0 && + spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && + spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_load_note(spa, "forcing failmode to 'continue' " + "as some top level vdevs are missing"); + spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; + } + return (0); } @@ -3428,9 +3551,15 @@ spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { - *ereport = FM_EREPORT_ZFS_LOG_REPLAY; - spa_load_failed(spa, "spa_check_logs failed"); - return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "spa_check_logs failed " + "so dropping the logs"); + } else { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + spa_load_failed(spa, "spa_check_logs failed"); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, + ENXIO)); + } } } @@ -3486,7 +3615,8 @@ spa_ld_claim_log_blocks(spa_t *spa) } static void -spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) +spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, + boolean_t reloading) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; @@ -3498,7 +3628,7 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. 
*/ - if (config_cache_txg != spa->spa_config_txg || + if (reloading || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) @@ -3516,6 +3646,24 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } +static void +spa_ld_prepare_for_reload(spa_t *spa) +{ + int mode = spa->spa_mode; + int async_suspended = spa->spa_async_suspended; + + spa_unload(spa); + spa_deactivate(spa); + spa_activate(spa, mode); + + /* + * We save the value of spa_async_suspended as it gets reset to 0 by + * spa_unload(). We want to restore it back to the original value before + * returning as we might be calling spa_async_resume() later. + */ + spa->spa_async_suspended = async_suspended; +} + /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against @@ -3523,32 +3671,35 @@ spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg) * config stored in the MOS. */ static int -spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, - spa_load_state_t state, spa_import_type_t type, boolean_t trust_config, - char **ereport) +spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport, + boolean_t reloading) { int error = 0; - uint64_t config_cache_txg = spa->spa_config_txg; - int orig_mode = spa->spa_mode; boolean_t missing_feat_write = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); - - spa->spa_load_state = state; - spa_load_note(spa, "LOADING"); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* - * If this is an untrusted config, first access the pool in read-only - * mode. We will then retrieve a trusted copy of the config from the MOS - * and use it to reopen the pool in read-write mode. 
+ * Never trust the config that is provided unless we are assembling + * a pool following a split. + * This means don't trust blkptrs and the vdev tree in general. This + * also effectively puts the spa in read-only mode since + * spa_writeable() checks for spa_trust_config to be true. + * We will later load a trusted config from the MOS. */ - if (!trust_config) - spa->spa_mode = FREAD; + if (type != SPA_IMPORT_ASSEMBLE) + spa->spa_trust_config = B_FALSE; + + if (reloading) + spa_load_note(spa, "RELOADING"); + else + spa_load_note(spa, "LOADING"); /* * Parse the config provided to create a vdev tree. */ - error = spa_ld_parse_config(spa, pool_guid, config, type); + error = spa_ld_parse_config(spa, type); if (error != 0) return (error); @@ -3566,10 +3717,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. */ - error = spa_ld_validate_vdevs(spa, type, trust_config); - if (error != 0) - return (error); + if (type != SPA_IMPORT_ASSEMBLE) { + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + } /* * Read vdev labels to find the best uberblock (i.e. latest, unless @@ -3578,7 +3734,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * label with the best uberblock and verify that our version of zfs * supports them all. */ - error = spa_ld_select_uberblock(spa, config, type, trust_config); + error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); @@ -3592,13 +3748,21 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (error); /* - * Retrieve the config stored in the MOS and use it to validate the - * config provided. Also extract some information from the MOS config - * to update our vdev tree. 
+ * Retrieve the trusted config stored in the MOS and use it to create + * a new, exact version of the vdev tree, then reopen all vdevs. */ - error = spa_ld_validate_config(spa, type); - if (error != 0) + error = spa_ld_load_trusted_config(spa, type, reloading); + if (error == EAGAIN) { + VERIFY(!reloading); + /* + * Redo the loading process with the trusted config if it is + * too different from the untrusted config. + */ + spa_ld_prepare_for_reload(spa); + return (spa_load_impl(spa, type, ereport, B_TRUE)); + } else if (error != 0) { return (error); + } /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed @@ -3628,19 +3792,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, if (error != 0) return (error); - /* - * If the config provided is not trusted, discard it and use the config - * from the MOS to reload the pool. - */ - if (!trust_config) { - error = spa_ld_prepare_for_reload(spa, orig_mode); - if (error != 0) - return (error); - - spa_load_note(spa, "RELOADING"); - return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); - } - /* * Retrieve pool properties from the MOS. */ @@ -3677,7 +3828,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, return (error); if (missing_feat_write) { - ASSERT(state == SPA_LOAD_TRYIMPORT); + ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in @@ -3709,9 +3860,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ - if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { - ASSERT(state != SPA_LOAD_TRYIMPORT); + uint64_t config_cache_txg = spa->spa_config_txg; + + ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * Traverse the ZIL and claim all blocks. 
@@ -3739,7 +3892,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ - spa_ld_check_for_config_update(spa, config_cache_txg); + spa_ld_check_for_config_update(spa, config_cache_txg, + reloading); /* * Check all DTLs to see if anything needs resilvering. @@ -3776,7 +3930,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, } static int -spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) +spa_load_retry(spa_t *spa, spa_load_state_t state) { int mode = spa->spa_mode; @@ -3791,7 +3945,7 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); - return (spa_load(spa, state, SPA_IMPORT_EXISTING, trust_config)); + return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* @@ -3802,8 +3956,8 @@ spa_load_retry(spa_t *spa, spa_load_state_t state, int trust_config) * spa_load(). 
*/ static int -spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, - uint64_t max_request, int rewind_flags) +spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, + int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; @@ -3820,8 +3974,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, spa->spa_extreme_rewind = B_TRUE; } - load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, - trust_config); + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); @@ -3862,7 +4015,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, int trust_config, spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; - rewind_error = spa_load_retry(spa, state, trust_config); + rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; @@ -3944,9 +4097,10 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); - error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, + error = spa_load_best(spa, state, policy.zrp_txg, policy.zrp_request); if (error == EBADF) { @@ -4863,18 +5017,16 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; - /* - * Pass off the heavy lifting to spa_load(). Pass TRUE for trust_config - * because the user-supplied config is actually the one to trust when - * doing an import. - */ - if (state != SPA_LOAD_RECOVER) - spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; - zfs_dbgmsg("spa_import: importing %s%s", pool, - (state == SPA_LOAD_RECOVER) ? 
" (RECOVERY MODE)" : ""); - error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, - policy.zrp_request); + if (state != SPA_LOAD_RECOVER) { + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + zfs_dbgmsg("spa_import: importing %s", pool); + } else { + zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " + "(RECOVERY MODE)", pool, (longlong_t)policy.zrp_txg); + } + error = spa_load_best(spa, state, policy.zrp_txg, policy.zrp_request); /* * Propagate anything learned while loading the pool and pass it @@ -4988,10 +5140,11 @@ nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; - char *poolname; + char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; + zpool_rewind_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); @@ -5006,14 +5159,30 @@ spa_tryimport(nvlist_t *tryconfig) spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); - zfs_dbgmsg("spa_tryimport: importing %s", poolname); - /* - * Pass off the heavy lifting to spa_load(). - * Pass TRUE for trust_config because the user-supplied config - * is actually the one to trust when doing an import. + * Rewind pool if a max txg was provided. Note that even though we + * retrieve the complete rewind policy, only the rewind txg is relevant + * for tryimport. 
*/ - error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); + zpool_get_rewind_policy(spa->spa_config, &policy); + if (policy.zrp_txg != UINT64_MAX) { + spa->spa_load_max_txg = policy.zrp_txg; + spa->spa_extreme_rewind = B_TRUE; + zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", + poolname, (longlong_t)policy.zrp_txg); + } else { + zfs_dbgmsg("spa_tryimport: importing %s", poolname); + } + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) + == 0) { + zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); + spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; + } else { + spa->spa_config_source = SPA_CONFIG_SRC_SCAN; + } + + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. @@ -6033,8 +6202,10 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); + newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; + /* create the new pool from the disks of the original pool */ - error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; @@ -7337,7 +7508,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { - vdev_t *svd[SPA_DVAS_PER_BP]; + vdev_t *svd[SPA_SYNC_MIN_VDEVS]; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); @@ -7348,7 +7519,7 @@ spa_sync(spa_t *spa, uint64_t txg) !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; - if (svdcount == SPA_DVAS_PER_BP) + if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); @@ -7692,9 +7863,20 @@ module_param(spa_load_verify_data, int, 0644); MODULE_PARM_DESC(spa_load_verify_data, "Set to traverse data on pool import"); +module_param(spa_load_print_vdev_tree, int, 
0644); +MODULE_PARM_DESC(spa_load_print_vdev_tree, + "Print vdev tree to zfs_dbgmsg during pool import"); + /* CSTYLED */ module_param(zio_taskq_batch_pct, uint, 0444); MODULE_PARM_DESC(zio_taskq_batch_pct, "Percentage of CPUs to run an IO worker thread"); +/* BEGIN CSTYLED */ +module_param(zfs_max_missing_tvds, ulong, 0644); +MODULE_PARM_DESC(zfs_max_missing_tvds, + "Allow importing pool with up to this number of missing top-level vdevs" + " (in read-only mode)"); +/* END CSTYLED */ + #endif diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 4e9fd6c575..50bba23454 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -393,7 +393,8 @@ void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); - nvlist_free(spa->spa_config); + if (spa->spa_config != NULL && spa->spa_config != config) + nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 6f4db76c80..e0edba1554 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -384,7 +384,8 @@ spa_load_failed(spa_t *spa, const char *fmt, ...) (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); - zfs_dbgmsg("spa_load(%s): FAILED: %s", spa->spa_name, buf); + zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, + spa->spa_trust_config ? "trusted" : "untrusted", buf); } /*PRINTFLIKE2*/ @@ -398,7 +399,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...) (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); - zfs_dbgmsg("spa_load(%s): %s", spa->spa_name, buf); + zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, + spa->spa_trust_config ? 
"trusted" : "untrusted", buf); } /* @@ -637,6 +639,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + spa->spa_trust_config = B_TRUE; spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); @@ -2052,7 +2055,7 @@ spa_is_root(spa_t *spa) boolean_t spa_writeable(spa_t *spa) { - return (!!(spa->spa_mode & FWRITE)); + return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config); } /* @@ -2233,6 +2236,24 @@ spa_get_hostid(void) return (myhostid); } +boolean_t +spa_trust_config(spa_t *spa) +{ + return (spa->spa_trust_config); +} + +uint64_t +spa_missing_tvds_allowed(spa_t *spa) +{ + return (spa->spa_missing_tvds_allowed); +} + +void +spa_set_missing_tvds(spa_t *spa, uint64_t missing) +{ + spa->spa_missing_tvds = missing; +} + #if defined(_KERNEL) && defined(HAVE_SPL) #include @@ -2338,6 +2359,9 @@ EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); +EXPORT_SYMBOL(spa_trust_config); +EXPORT_SYMBOL(spa_missing_tvds_allowed); +EXPORT_SYMBOL(spa_set_missing_tvds); /* BEGIN CSTYLED */ module_param(zfs_flags, uint, 0644); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3654919fcc..ad53c0c896 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -74,6 +74,8 @@ unsigned int zfs_checksums_per_second = 20; */ int zfs_scan_ignore_errors = 0; +int vdev_validate_skip = B_FALSE; + /*PRINTFLIKE2*/ void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) @@ -96,6 +98,57 @@ vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) 
} } +void +vdev_dbgmsg_print_tree(vdev_t *vd, int indent) +{ + char state[20]; + + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { + zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, + vd->vdev_ops->vdev_op_type); + return; + } + + switch (vd->vdev_state) { + case VDEV_STATE_UNKNOWN: + (void) snprintf(state, sizeof (state), "unknown"); + break; + case VDEV_STATE_CLOSED: + (void) snprintf(state, sizeof (state), "closed"); + break; + case VDEV_STATE_OFFLINE: + (void) snprintf(state, sizeof (state), "offline"); + break; + case VDEV_STATE_REMOVED: + (void) snprintf(state, sizeof (state), "removed"); + break; + case VDEV_STATE_CANT_OPEN: + (void) snprintf(state, sizeof (state), "can't open"); + break; + case VDEV_STATE_FAULTED: + (void) snprintf(state, sizeof (state), "faulted"); + break; + case VDEV_STATE_DEGRADED: + (void) snprintf(state, sizeof (state), "degraded"); + break; + case VDEV_STATE_HEALTHY: + (void) snprintf(state, sizeof (state), "healthy"); + break; + default: + (void) snprintf(state, sizeof (state), "<state %u>", + (uint_t)vd->vdev_state); + } + + zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, + "", vd->vdev_id, vd->vdev_ops->vdev_op_type, + vd->vdev_islog ? " (log)" : "", + (u_longlong_t)vd->vdev_guid, + vd->vdev_path ? vd->vdev_path : "N/A", state); + + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); +} + /* * Virtual device management.
*/ @@ -1424,8 +1477,13 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - vd->vdev_stat.vs_aux); + if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, + vd->vdev_stat.vs_aux); + } else { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + vd->vdev_stat.vs_aux); + } return (error); } @@ -1596,29 +1654,29 @@ vdev_open(vdev_t *vd) /* * Called once the vdevs are all opened, this routine validates the label - * contents. This needs to be done before vdev_load() so that we don't + * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * - * If 'strict' is false ignore the spa guid check. This is necessary because - * if the machine crashed during a re-guid the new guid might have been written - * to all of the vdev labels, but not the cached config. The strict check - * will be performed when the pool is opened again using the mos config. - * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int -vdev_validate(vdev_t *vd, boolean_t strict) +vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; nvlist_t *label; - uint64_t guid = 0, top_guid; + uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; + nvlist_t *nvl; + uint64_t txg; - for (int c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c], strict) != 0) + if (vdev_validate_skip) + return (0); + + for (uint64_t c = 0; c < vd->vdev_children; c++) + if (vdev_validate(vd->vdev_child[c]) != 0) return (SET_ERROR(EBADF)); /* @@ -1626,115 +1684,276 @@ vdev_validate(vdev_t *vd, boolean_t strict) * any further validation. 
Otherwise, label I/O will fail and we will * overwrite the previous state. */ - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { - uint64_t aux_guid = 0; - nvlist_t *nvl; - uint64_t txg = spa_last_synced_txg(spa) != 0 ? - spa_last_synced_txg(spa) : -1ULL; + if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) + return (0); - if ((label = vdev_label_read_config(vd, txg)) == NULL) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - vdev_dbgmsg(vd, "vdev_validate: failed reading config"); - return (0); - } + /* + * If we are performing an extreme rewind, we allow for a label that + * was modified at a point after the current txg. + */ + if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0) + txg = UINT64_MAX; + else + txg = spa_last_synced_txg(spa); - /* - * Determine if this vdev has been split off into another - * pool. If so, then refuse to open it. - */ - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, - &aux_guid) == 0 && aux_guid == spa_guid(spa)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_SPLIT_POOL); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: vdev split into other " - "pool"); - return (0); - } - - if (strict && (nvlist_lookup_uint64(label, - ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != spa_guid(spa))) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid " - "doesn't match config (%llu != %llu)", - (u_longlong_t)guid, - (u_longlong_t)spa_guid(spa)); - return (0); - } - - if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) - != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, - &aux_guid) != 0) - aux_guid = 0; - - /* - * If this vdev just became a top-level vdev because its - * sibling was detached, it will have adopted the parent's - * vdev guid -- but the label may or may not be on disk yet. 
- * Fortunately, either version of the label will have the - * same top guid, so if we're a top-level vdev, we can - * safely compare to that instead. - * - * If we split this vdev off instead, then we also check the - * original pool's guid. We don't want to consider the vdev - * corrupt if it is partway through a split operation. - */ - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, - &guid) != 0 || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, - &top_guid) != 0 || - ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && - (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: config guid doesn't " - "match label guid (%llu != %llu)", - (u_longlong_t)vd->vdev_guid, (u_longlong_t)guid); - return (0); - } - - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, - &state) != 0) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - nvlist_free(label); - vdev_dbgmsg(vd, "vdev_validate: '%s' missing", - ZPOOL_CONFIG_POOL_STATE); - return (0); - } - - nvlist_free(label); - - /* - * If this is a verbatim import, no need to check the - * state of the pool. - */ - if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && - spa_load_state(spa) == SPA_LOAD_OPEN && - state != POOL_STATE_ACTIVE) { - vdev_dbgmsg(vd, "vdev_validate: invalid pool state " - "(%llu) for spa %s", (u_longlong_t)state, - spa->spa_name); - return (SET_ERROR(EBADF)); - } - - /* - * If we were able to open and validate a vdev that was - * previously marked permanently unavailable, clear that state - * now. 
- */ - if (vd->vdev_not_present) - vd->vdev_not_present = 0; + if ((label = vdev_label_read_config(vd, txg)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + vdev_dbgmsg(vd, "vdev_validate: failed reading config"); + return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_POOL_GUID); + return (0); + } + + /* + * If config is not trusted then ignore the spa guid check. This is + * necessary because if the machine crashed during a re-guid the new + * guid might have been written to all of the vdev labels, but not the + * cached config. The check will be performed again once we have the + * trusted config from the MOS. 
+ */ + if (spa->spa_trust_config && guid != spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " + "match config (%llu != %llu)", (u_longlong_t)guid, + (u_longlong_t)spa_guid(spa)); + return (0); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_GUID); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) + != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_TOP_GUID); + return (0); + } + + /* + * If this vdev just became a top-level vdev because its sibling was + * detached, it will have adopted the parent's vdev guid -- but the + * label may or may not be on disk yet. Fortunately, either version + * of the label will have the same top guid, so if we're a top-level + * vdev, we can safely compare to that instead. + * However, if the config comes from a cachefile that failed to update + * after the detach, a top-level vdev will appear as a non top-level + * vdev in the config. Also relax the constraints if we perform an + * extreme rewind. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. 
+ */ + if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { + boolean_t mismatch = B_FALSE; + if (spa->spa_trust_config && !spa->spa_extreme_rewind) { + if (vd != vd->vdev_top || vd->vdev_guid != top_guid) + mismatch = B_TRUE; + } else { + if (vd->vdev_guid != top_guid && + vd->vdev_top->vdev_guid != guid) + mismatch = B_TRUE; + } + + if (mismatch) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: config guid " + "doesn't match label guid"); + vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", + (u_longlong_t)vd->vdev_guid, + (u_longlong_t)vd->vdev_top->vdev_guid); + vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " + "aux_guid %llu", (u_longlong_t)guid, + (u_longlong_t)top_guid, (u_longlong_t)aux_guid); + return (0); + } + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_POOL_STATE); + return (0); + } + + nvlist_free(label); + + /* + * If this is a verbatim import, no need to check the + * state of the pool. + */ + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && + spa_load_state(spa) == SPA_LOAD_OPEN && + state != POOL_STATE_ACTIVE) { + vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " + "for spa %s", (u_longlong_t)state, spa->spa_name); + return (SET_ERROR(EBADF)); + } + + /* + * If we were able to open and validate a vdev that was + * previously marked permanently unavailable, clear that state + * now. 
+ */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + return (0); } +static void +vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) +{ + if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { + if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " + "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, + dvd->vdev_path, svd->vdev_path); + spa_strfree(dvd->vdev_path); + dvd->vdev_path = spa_strdup(svd->vdev_path); + } + } else if (svd->vdev_path != NULL) { + dvd->vdev_path = spa_strdup(svd->vdev_path); + zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", + (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); + } +} + +/* + * Recursively copy vdev paths from one vdev to another. Source and destination + * vdev trees must have same geometry otherwise return error. Intended to copy + * paths from userland config into MOS config. + */ +int +vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) +{ + if ((svd->vdev_ops == &vdev_missing_ops) || + (svd->vdev_ishole && dvd->vdev_ishole) || + (dvd->vdev_ops == &vdev_indirect_ops)) + return (0); + + if (svd->vdev_ops != dvd->vdev_ops) { + vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", + svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); + return (SET_ERROR(EINVAL)); + } + + if (svd->vdev_guid != dvd->vdev_guid) { + vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " + "%llu)", (u_longlong_t)svd->vdev_guid, + (u_longlong_t)dvd->vdev_guid); + return (SET_ERROR(EINVAL)); + } + + if (svd->vdev_children != dvd->vdev_children) { + vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " + "%llu != %llu", (u_longlong_t)svd->vdev_children, + (u_longlong_t)dvd->vdev_children); + return (SET_ERROR(EINVAL)); + } + + for (uint64_t i = 0; i < svd->vdev_children; i++) { + int error = vdev_copy_path_strict(svd->vdev_child[i], + dvd->vdev_child[i]); + if (error != 0) + return (error); + } + + if (svd->vdev_ops->vdev_op_leaf) + vdev_copy_path_impl(svd, 
dvd); + + return (0); +} + +static void +vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) +{ + ASSERT(stvd->vdev_top == stvd); + ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); + + for (uint64_t i = 0; i < dvd->vdev_children; i++) { + vdev_copy_path_search(stvd, dvd->vdev_child[i]); + } + + if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) + return; + + /* + * The idea here is that while a vdev can shift positions within + * a top vdev (when replacing, attaching mirror, etc.) it cannot + * step outside of it. + */ + vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); + + if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) + return; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + vdev_copy_path_impl(vd, dvd); +} + +/* + * Recursively copy vdev paths from one root vdev to another. Source and + * destination vdev trees may differ in geometry. For each destination leaf + * vdev, search a vdev with the same guid and top vdev id in the source. + * Intended to copy paths from userland config into MOS config. + */ +void +vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) +{ + uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); + ASSERT(srvd->vdev_ops == &vdev_root_ops); + ASSERT(drvd->vdev_ops == &vdev_root_ops); + + for (uint64_t i = 0; i < children; i++) { + vdev_copy_path_search(srvd->vdev_child[i], + drvd->vdev_child[i]); + } +} + /* * Close a virtual device. 
*/ @@ -1828,7 +2047,7 @@ vdev_reopen(vdev_t *vd) !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { - (void) vdev_validate(vd, B_TRUE); + (void) vdev_validate(vd); } /* @@ -3873,6 +4092,19 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vdev_propagate_state(vd->vdev_parent); } +boolean_t +vdev_children_are_offline(vdev_t *vd) +{ + ASSERT(!vd->vdev_ops->vdev_op_leaf); + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) + return (B_FALSE); + } + + return (B_TRUE); +} + /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. @@ -3908,34 +4140,6 @@ vdev_is_concrete(vdev_t *vd) } } -/* - * Load the state from the original vdev tree (ovd) which - * we've retrieved from the MOS config object. If the original - * vdev was offline or faulted then we transfer that state to the - * device in the current vdev tree (nvd). - */ -void -vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) -{ - ASSERT(nvd->vdev_top->vdev_islog); - ASSERT(spa_config_held(nvd->vdev_spa, - SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); - ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); - - for (int c = 0; c < nvd->vdev_children; c++) - vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); - - if (nvd->vdev_ops->vdev_op_leaf) { - /* - * Restore the persistent vdev state - */ - nvd->vdev_offline = ovd->vdev_offline; - nvd->vdev_faulted = ovd->vdev_faulted; - nvd->vdev_degraded = ovd->vdev_degraded; - nvd->vdev_removed = ovd->vdev_removed; - } -} - /* * Determine if a log device has valid content. 
If the vdev was * removed or faulted in the MOS config then we know that @@ -4051,5 +4255,9 @@ module_param(zfs_checksums_per_second, uint, 0644); module_param(zfs_scan_ignore_errors, int, 0644); MODULE_PARM_DESC(zfs_scan_ignore_errors, "Ignore errors during resilver/scrub"); + +module_param(vdev_validate_skip, int, 0644); +MODULE_PARM_DESC(vdev_validate_skip, + "Bypass vdev_validate()"); /* END CSTYLED */ #endif diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index ad334fe8c2..85d133a5ac 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -412,7 +412,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); - if (vd->vdev_not_present) + if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) @@ -1209,6 +1209,11 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); + if (*config == NULL && spa->spa_extreme_rewind) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " + "Trying again without txg restrictions."); + *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); + } if (*config == NULL) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 4b01f317b4..1c591cd647 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -251,9 +251,33 @@ vdev_mirror_map_init(zio_t *zio) if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; + dva_t dva_copy[SPA_DVAS_PER_BP]; - mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, - B_TRUE); + c = BP_GET_NDVAS(zio->io_bp); + + /* + * If we do not trust the pool config, some DVAs might be + * invalid or point to vdevs that do not exist. We skip them. 
+ */ + if (!spa_trust_config(spa)) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + int j = 0; + for (int i = 0; i < c; i++) { + if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) + dva_copy[j++] = dva[i]; + } + if (j == 0) { + zio->io_vsd = NULL; + zio->io_error = ENXIO; + return (NULL); + } + if (j < c) { + dva = dva_copy; + c = j; + } + } + + mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -305,7 +329,10 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, } if (numerrors == vd->vdev_children) { - vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + if (vdev_children_are_offline(vd)) + vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; + else + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } @@ -485,6 +512,13 @@ vdev_mirror_io_start(zio_t *zio) mm = vdev_mirror_map_init(zio); + if (mm == NULL) { + ASSERT(!spa_trust_config(zio->io_spa)); + ASSERT(zio->io_type == ZIO_TYPE_READ); + zio_execute(zio); + return; + } + if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_bp != NULL && (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { @@ -558,6 +592,9 @@ vdev_mirror_io_done(zio_t *zio) int good_copies = 0; int unexpected_errors = 0; + if (mm == NULL) + return; + for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -677,13 +714,19 @@ vdev_mirror_io_done(zio_t *zio) static void vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) { - if (faulted == vd->vdev_children) - vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_NO_REPLICAS); - else if (degraded + faulted != 0) + if (faulted == vd->vdev_children) { + if (vdev_children_are_offline(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, + VDEV_AUX_CHILDREN_OFFLINE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + } + } else if (degraded + faulted != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); - else + } else { 
vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } } vdev_ops_t vdev_mirror_ops = { diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 8ac9ce1878..9f86cbfa41 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -37,6 +37,23 @@ * Virtual device vector for the pool's root vdev. */ +static uint64_t +vdev_root_core_tvds(vdev_t *vd) +{ + uint64_t tvds = 0; + + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!cvd->vdev_ishole && !cvd->vdev_islog && + cvd->vdev_ops != &vdev_indirect_ops) { + tvds++; + } + } + + return (tvds); +} + /* * We should be able to tolerate one failure with absolutely no damage * to our metadata. Two failures will take out space maps, a bunch of @@ -46,17 +63,28 @@ * probably fine. Adding bean counters during alloc/free can make this * future guesswork more accurate. */ -static int -too_many_errors(vdev_t *vd, int numerrors) +static boolean_t +too_many_errors(vdev_t *vd, uint64_t numerrors) { - ASSERT3U(numerrors, <=, vd->vdev_children); - return (numerrors > 0); + uint64_t tvds; + + if (numerrors == 0) + return (B_FALSE); + + tvds = vdev_root_core_tvds(vd); + ASSERT3U(numerrors, <=, tvds); + + if (numerrors == tvds) + return (B_TRUE); + + return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa)); } static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *ashift) { + spa_t *spa = vd->vdev_spa; int lasterror = 0; int numerrors = 0; @@ -76,6 +104,9 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, } } + if (spa_load_state(spa) != SPA_LOAD_NONE) + spa_set_missing_tvds(spa, numerrors); + if (too_many_errors(vd, numerrors)) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); @@ -101,7 +132,7 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) if (too_many_errors(vd, faulted)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); - } else if (degraded) { + } else 
if (degraded || faulted) { vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); } else { vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6822505f18..81ae65c319 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -878,6 +878,13 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) } } + /* + * Do not verify individual DVAs if the config is not trusted. This + * will be done once the zio is executed in vdev_mirror_map_init. + */ + if (!spa->spa_trust_config) + return; + /* * Pool-specific checks. * @@ -928,6 +935,36 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) } } +boolean_t +zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) +{ + uint64_t vdevid = DVA_GET_VDEV(dva); + + if (vdevid >= spa->spa_root_vdev->vdev_children) + return (B_FALSE); + + vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; + if (vd == NULL) + return (B_FALSE); + + if (vd->vdev_ops == &vdev_hole_ops) + return (B_FALSE); + + if (vd->vdev_ops == &vdev_missing_ops) { + return (B_FALSE); + } + + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = DVA_GET_ASIZE(dva); + + if (BP_IS_GANG(bp)) + asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + if (offset + asize > vd->vdev_asize) + return (B_FALSE); + + return (B_TRUE); +} + zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, @@ -3473,14 +3510,18 @@ zio_vdev_io_start(zio_t *zio) } ASSERT3P(zio->io_logical, !=, zio); - if (zio->io_type == ZIO_TYPE_WRITE && zio->io_vd->vdev_removing) { + if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(spa->spa_trust_config); + /* * Note: the code can handle other kinds of writes, * but we don't expect them.
*/ - ASSERT(zio->io_flags & - (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); + if (zio->io_vd->vdev_removing) { + ASSERT(zio->io_flags & + (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); + } } align = 1ULL << vd->vdev_top->vdev_ashift; diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index eecac8f6e0..0260eb8848 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -364,11 +364,22 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', - 'zpool_import_features_003_pos','zpool_import_missing_001_pos', + 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', - 'zpool_import_errata3'] + 'zpool_import_errata3', + 'import_cache_device_added', + 'import_cache_device_removed', + 'import_cache_device_replaced', + 'import_cache_mirror_attached', + 'import_cache_mirror_detached', + 'import_cache_shared_device', + 'import_devices_missing', + 'import_paths_changed', + 'import_rewind_config_changed', + 'import_rewind_device_replaced'] + tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_labelclear] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index 8aa34f33cc..97a15a20de 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -2,6 +2,17 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_impo dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ + zpool_import.kshlib \ + 
import_cache_device_added.ksh \ + import_cache_device_removed.ksh \ + import_cache_device_replaced.ksh \ + import_cache_mirror_attached.ksh \ + import_cache_mirror_detached.ksh \ + import_cache_shared_device.ksh \ + import_devices_missing.ksh \ + import_paths_changed.ksh \ + import_rewind_config_changed.ksh \ + import_rewind_device_replaced.ksh \ zpool_import_001_pos.ksh \ zpool_import_002_pos.ksh \ zpool_import_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh new file mode 100755 index 0000000000..bda6b891b9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_added.ksh @@ -0,0 +1,76 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable using an outdated cachefile that is unaware +# that one or two top-level vdevs were added. +# +# STRATEGY: +# 1. Create a pool with some devices and an alternate cachefile. +# 2. Backup the cachefile. +# 3. Add a device/mirror/raid to the pool. +# 4. Export the pool. +# 5. Verify that we can import the pool using the backed-up cachefile. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_add_vdevs +{ + typeset poolcreate="$1" + typeset addvdevs="$2" + typeset poolcheck="$3" + + log_note "$0: pool '$poolcreate', add $addvdevs." 
+ + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + + log_must cp $CPATH $CPATHBKP + + log_must zpool add -f $TESTPOOL1 $addvdevs + + log_must zpool export $TESTPOOL1 + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcheck" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + + log_note "" +} + +test_add_vdevs "$VDEV0" "$VDEV1" "$VDEV0 $VDEV1" +test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2" "$VDEV0 $VDEV1 $VDEV2" +test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2" "$VDEV0 $VDEV1 $VDEV2" +test_add_vdevs "$VDEV0" "mirror $VDEV1 $VDEV2" \ + "$VDEV0 mirror $VDEV1 $VDEV2" +test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \ + "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV0 raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "log $VDEV1" "$VDEV0 log $VDEV1" +test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV0 $VDEV2 log $VDEV1" +test_add_vdevs "$VDEV0" "$VDEV1 log $VDEV2" "$VDEV0 $VDEV1 log $VDEV2" + +log_pass "zpool import -c cachefile_unaware_of_add passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh new file mode 100755 index 0000000000..1d878b7a25 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_removed.ksh @@ -0,0 +1,145 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable using an outdated cachefile that is unaware +# that one or more vdevs were removed. +# +# STRATEGY: +# 1. Create a pool with some devices and an alternate cachefile. +# 2. Backup the cachefile. +# 3. Remove device(s) from the pool and delete their backing files. +# 4. (Optionally) Add device(s) to pool. +# 5. Export the pool. +# 6. Verify that we can import the pool using the backed-up cachefile. +# + +verify_runnable "global" + +function custom_cleanup +{ + cleanup +} + +log_onexit custom_cleanup + +function test_remove_vdev +{ + typeset poolcreate="$1" + typeset removevdev="$2" + typeset poolcheck="$3" + + log_note "$0: pool '$poolcreate', remove $2." + + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + + log_must cp $CPATH $CPATHBKP + + log_must zpool remove $TESTPOOL1 $removevdev + log_must wait_for_pool_config $TESTPOOL1 "$poolcheck" + log_must rm $removevdev + + log_must zpool export $TESTPOOL1 + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcheck" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + log_must mkfile $FILE_SIZE $removevdev + + log_note "" +} + +# +# We have to remove top-level non-log vdevs one by one, else there is a high +# chance pool will report busy and command will fail for the second vdev. +# +function test_remove_two_vdevs +{ + log_note "$0."
+ log_must zpool create -o cachefile=$CPATH $TESTPOOL1 \ + $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4 + + log_must cp $CPATH $CPATHBKP + + log_must zpool remove $TESTPOOL1 $VDEV4 + log_must wait_for_pool_config $TESTPOOL1 \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3" + log_must zpool remove $TESTPOOL1 $VDEV3 + log_must wait_for_pool_config $TESTPOOL1 "$VDEV0 $VDEV1 $VDEV2" + log_must rm $VDEV3 $VDEV4 + + log_must zpool export $TESTPOOL1 + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$VDEV0 $VDEV1 $VDEV2" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + log_must mkfile $FILE_SIZE $VDEV3 $VDEV4 + + log_note "" +} + +# +# We want to test the case where a hole created by a log device is filled +# by a regular device +# +function test_remove_log_then_add_vdev +{ + log_note "$0." + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 \ + $VDEV0 $VDEV1 $VDEV2 log $VDEV3 + + log_must cp $CPATH $CPATHBKP + + log_must zpool remove $TESTPOOL1 $VDEV1 + log_must wait_for_pool_config $TESTPOOL1 "$VDEV0 $VDEV2 log $VDEV3" + log_must zpool remove $TESTPOOL1 $VDEV3 + log_must check_pool_config $TESTPOOL1 "$VDEV0 $VDEV2" + log_must rm $VDEV1 $VDEV3 + log_must zpool add $TESTPOOL1 $VDEV4 + + log_must zpool export $TESTPOOL1 + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$VDEV0 $VDEV2 $VDEV4" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + log_must mkfile $FILE_SIZE $VDEV1 $VDEV3 + + log_note "" +} + +test_remove_vdev "$VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV0 $VDEV1" +test_remove_vdev "$VDEV0 $VDEV1 $VDEV2" "$VDEV1" "$VDEV0 $VDEV2" +test_remove_vdev "$VDEV0 log $VDEV1" "$VDEV1" "$VDEV0" +test_remove_vdev "$VDEV0 log $VDEV1 $VDEV2" "$VDEV1 $VDEV2" "$VDEV0" +test_remove_vdev "$VDEV0 $VDEV1 $VDEV2 log $VDEV3" "$VDEV2" \ + "$VDEV0 $VDEV1 log $VDEV3" +test_remove_two_vdevs +test_remove_log_then_add_vdev + +log_pass "zpool import -c
cachefile_unaware_of_remove passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh new file mode 100755 index 0000000000..f2888a5bb1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_device_replaced.ksh @@ -0,0 +1,166 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable using an outdated cachefile that is unaware +# of a zpool replace operation at different stages in time. +# +# STRATEGY: +# 1. Create a pool with some devices and an alternate cachefile. +# 2. Backup the cachefile. +# 3. Initiate device replacement, backup cachefile again and export pool. +# Special care must be taken so that resilvering doesn't complete +# before we exported the pool. +# 4. Verify that we can import the pool using the first cachefile backup. +# (Test 1. cachefile: pre-replace, pool: resilvering) +# 5. Wait for the resilvering to finish and export the pool. +# 6. Verify that we can import the pool using the first cachefile backup. +# (Test 2. cachefile: pre-replace, pool: post-replace) +# 7. Export the pool. +# 8. Verify that we can import the pool using the second cachefile backup. +# (Test 3. cachefile: resilvering, pool: post-replace) +# +# STRATEGY TO SLOW DOWN RESILVERING: +# 1. 
Reduce zfs_txg_timeout, which controls how long we can resilver for +# each sync. +# 2. Add data to pool +# 3. Re-import the pool so that data isn't cached +# 4. Use zinject to slow down device I/O +# 5. Trigger the resilvering +# 6. Use spa freeze to stop writing to the pool. +# 7. Clear zinject events (needed to export the pool) +# 8. Export the pool +# + +verify_runnable "global" + +ZFS_TXG_TIMEOUT="" + +function custom_cleanup +{ + # Restore zfs_txg_timeout, but only if its original value was saved + [[ -n "$ZFS_TXG_TIMEOUT" ]] && + log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT + + zinject -c all + cleanup +} + +log_onexit custom_cleanup + +function test_replacing_vdevs +{ + typeset poolcreate="$1" + typeset replacevdev="$2" + typeset replaceby="$3" + typeset poolfinalstate="$4" + typeset zinjectdevices="$5" + typeset earlyremove="$6" + typeset writedata="$7" + + log_note "$0: pool '$poolcreate', replace $replacevdev by $replaceby." + + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + + # Cachefile: pool in pre-replace state + log_must cp $CPATH $CPATHBKP + + # Steps to ensure resilvering happens very slowly. + log_must write_some_data $TESTPOOL1 $writedata + log_must zpool export $TESTPOOL1 + log_must cp $CPATHBKP $CPATH + log_must zpool import -c $CPATH -o cachefile=$CPATH $TESTPOOL1 + typeset device + for device in $zinjectdevices ; do + log_must zinject -d $device -D 200:1 $TESTPOOL1 > /dev/null + done + log_must zpool replace $TESTPOOL1 $replacevdev $replaceby + + # Cachefile: pool in resilvering state + log_must cp $CPATH $CPATHBKP2 + + # We must disable zinject in order to export the pool, so we freeze + # it first to prevent writing out subsequent resilvering progress.
+ log_must zpool freeze $TESTPOOL1 + # Confirm pool is still replacing + log_must pool_is_replacing $TESTPOOL1 + log_must zinject -c all > /dev/null + log_must zpool export $TESTPOOL1 + + ( $earlyremove ) && log_must rm $replacevdev + + ############################################################ + # Test 1. Cachefile: pre-replace, pool: resilvering + ############################################################ + log_must cp $CPATHBKP $CPATH + log_must zpool import -c $CPATH $TESTPOOL1 + + # Wait for resilvering to finish + log_must wait_for_pool_config $TESTPOOL1 "$poolfinalstate" + log_must zpool export $TESTPOOL1 + + ( ! $earlyremove ) && log_must rm $replacevdev + + ############################################################ + # Test 2. Cachefile: pre-replace, pool: post-replace + ############################################################ + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolfinalstate" + log_must zpool export $TESTPOOL1 + + ############################################################ + # Test 3. Cachefile: resilvering, pool: post-replace + ############################################################ + log_must zpool import -c $CPATHBKP2 $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolfinalstate" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP $CPATHBKP2 + log_must mkfile $FILE_SIZE $replacevdev + + log_note "" +} + +# We set zfs_txg_timeout to 1 to reduce resilvering time at each sync. 
+ZFS_TXG_TIMEOUT=$(get_zfs_txg_timeout) +set_zfs_txg_timeout 1 + +test_replacing_vdevs "$VDEV0 $VDEV1" \ + "$VDEV1" "$VDEV2" \ + "$VDEV0 $VDEV2" \ + "$VDEV0 $VDEV1" \ + false 20 + +test_replacing_vdevs "mirror $VDEV0 $VDEV1" \ + "$VDEV1" "$VDEV2" \ + "mirror $VDEV0 $VDEV2" \ + "$VDEV0 $VDEV1" \ + true 10 + +test_replacing_vdevs "raidz $VDEV0 $VDEV1 $VDEV2" \ + "$VDEV1" "$VDEV3" \ + "raidz $VDEV0 $VDEV3 $VDEV2" \ + "$VDEV0 $VDEV1 $VDEV2" \ + true 20 + +set_zfs_txg_timeout $ZFS_TXG_TIMEOUT + +log_pass "zpool import -c cachefile_unaware_of_replace passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh new file mode 100755 index 0000000000..987b745b91 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_attached.ksh @@ -0,0 +1,72 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable using an outdated cachefile that misses a +# mirror that was attached. +# +# STRATEGY: +# 1. Create a pool with some devices and an alternate cachefile. +# 2. Backup the cachefile. +# 3. Attach a mirror to one of the devices in the pool. +# 4. Export the pool. +# 5. Verify that we can import the pool using the backed-up cachefile. 
+# + +verify_runnable "global" + +log_onexit cleanup + +function test_attach_vdev +{ + typeset poolcreate="$1" + typeset attachto="$2" + typeset attachvdev="$3" + typeset poolcheck="$4" + + log_note "$0: pool '$poolcreate', attach $attachvdev to $attachto." + + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + + log_must cp $CPATH $CPATHBKP + + log_must zpool attach $TESTPOOL1 $attachto $attachvdev + + log_must zpool export $TESTPOOL1 + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcheck" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + + log_note "" +} + +test_attach_vdev "$VDEV0" "$VDEV0" "$VDEV4" "mirror $VDEV0 $VDEV4" +test_attach_vdev "$VDEV0 $VDEV1" "$VDEV1" "$VDEV4" \ + "$VDEV0 mirror $VDEV1 $VDEV4" +test_attach_vdev "mirror $VDEV0 $VDEV1" "$VDEV0" "$VDEV4" \ + "mirror $VDEV0 $VDEV1 $VDEV4" +test_attach_vdev "$VDEV0 log $VDEV1" "$VDEV1" "$VDEV4" \ + "$VDEV0 log mirror $VDEV1 $VDEV4" + +log_pass "zpool import -c cachefile_unaware_of_attach passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh new file mode 100755 index 0000000000..85ec51673b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_mirror_detached.ksh @@ -0,0 +1,70 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable using an outdated cachefile that is unaware +# that a mirror was detached. +# +# STRATEGY: +# 1. Create a pool with some devices mirrored and an alternate cachefile. +# 2. Backup the cachefile. +# 3. Detach a mirror from the pool. +# 4. Export the pool. +# 5. Verify that we can import the pool using the backed-up cachefile. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_detach_vdev +{ + typeset poolcreate="$1" + typeset poolcheck="$2" + + log_note "$0: pool '$poolcreate', detach $VDEV4." + + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $poolcreate + + log_must cp $CPATH $CPATHBKP + + log_must zpool detach $TESTPOOL1 $VDEV4 + log_must rm -f $VDEV4 + + log_must zpool export $TESTPOOL1 + + log_must zpool import -c $CPATHBKP $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcheck" + + # Cleanup + log_must zpool destroy $TESTPOOL1 + log_must rm -f $CPATH $CPATHBKP + log_must mkfile $FILE_SIZE $VDEV4 + + log_note "" +} + +test_detach_vdev "mirror $VDEV0 $VDEV4" "$VDEV0" +test_detach_vdev "mirror $VDEV0 $VDEV4 mirror $VDEV1 $VDEV2" \ + "$VDEV0 mirror $VDEV1 $VDEV2" +test_detach_vdev "mirror $VDEV0 $VDEV1 $VDEV4" "mirror $VDEV0 $VDEV1" +test_detach_vdev "$VDEV0 log mirror $VDEV1 $VDEV4" "$VDEV0 log $VDEV1" + +log_pass "zpool import -c cachefile_unaware_of_detach passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh new file mode 100755 index 0000000000..66225c11b9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cache_shared_device.ksh @@ -0,0 +1,113 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should not try to write to a device that doesn't belong to it +# anymore, even if the device is in its cachefile. +# +# STRATEGY: +# 1. Create pool1 with some devices and an alternate cachefile. +# 2. Backup the cachefile. +# 3. Export pool1. +# 4. Create pool2 using a device that belongs to pool1. +# 5. Export pool2. +# 6. Compute checksum of the shared device. +# 7. Import pool1 and write some data to it. +# 8. Verify that the checksum of the shared device hasn't changed. +# + +verify_runnable "global" + +function custom_cleanup +{ + destroy_pool $TESTPOOL2 + cleanup +} + +log_onexit custom_cleanup + +function dev_checksum +{ + typeset dev="$1" + typeset checksum + + log_note "Compute checksum of '$dev'" + + checksum=$(md5sum $dev) + if [[ $? -ne 0 ]]; then + log_fail "Failed to compute checksum of '$dev'" + return 1 + fi + + echo "$checksum" + return 0 +} + +function test_shared_device +{ + typeset pool1="$1" + typeset pool2="$2" + typeset sharedvdev="$3" + typeset importflags="${4:-}" + + log_note "$0: pool1 '$pool1', pool2 '$pool2' takes $sharedvdev." 
+ + log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $pool1 + + log_must cp $CPATH $CPATHBKP + + log_must zpool export $TESTPOOL1 + + log_must zpool create -f $TESTPOOL2 $pool2 + + log_must zpool export $TESTPOOL2 + + typeset checksum1=$(dev_checksum $sharedvdev) + + log_must zpool import -c $CPATHBKP $importflags $TESTPOOL1 + + log_must write_some_data $TESTPOOL1 2 + + log_must zpool destroy $TESTPOOL1 + + typeset checksum2=$(dev_checksum $sharedvdev) + + if [[ $checksum1 == $checksum2 ]]; then + log_pos "Device hasn't been modified by original pool" + else + log_fail "Device has been modified by original pool." \ + "Checksum mismatch: $checksum1 != $checksum2." + fi + + # Cleanup + log_must zpool import -d $DEVICE_DIR $TESTPOOL2 + log_must zpool destroy $TESTPOOL2 + log_must rm -f $CPATH $CPATHBKP + + log_note "" +} + +test_shared_device "mirror $VDEV0 $VDEV1" "mirror $VDEV1 $VDEV2" "$VDEV1" +test_shared_device "mirror $VDEV0 $VDEV1 $VDEV2" "mirror $VDEV2 $VDEV3" \ + "$VDEV2" +test_shared_device "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" +test_shared_device "$VDEV0 log $VDEV1" "$VDEV2 log $VDEV1" "$VDEV1" "-m" + +log_pass "Pool doesn't write to a device it doesn't own anymore." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh new file mode 100755 index 0000000000..74b736aef9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh @@ -0,0 +1,122 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable when up to 2 top-level devices are missing. +# +# STRATEGY: +# 1. Create a pool. +# 2. Write some data to the pool and checksum it. +# 3. Add one or more devices. +# 4. Write more data to the pool and checksum it. +# 5. Export the pool. +# 6. Move added devices out of the devices directory. +# 7. Import the pool with missing devices. +# 8. Verify that the first batch of data is intact. +# 9. Verify that accessing the second batch of data doesn't suspend pool. +# 10. Export the pool, move back missing devices, Re-import the pool. +# 11. Verify that all the data is intact. +# + +verify_runnable "global" + +function custom_cleanup +{ + log_must set_spa_load_verify_metadata 1 + log_must set_spa_load_verify_data 1 + log_must set_zfs_max_missing_tvds 0 + log_must rm -rf $BACKUP_DEVICE_DIR + # Highly damaged pools may fail to be destroyed, so we export them. + poolexists $TESTPOOL1 && log_must zpool export $TESTPOOL1 + cleanup +} + +log_onexit custom_cleanup + +function test_devices_missing +{ + typeset poolcreate="$1" + typeset addvdevs="$2" + typeset missingvdevs="$3" + typeset -i missingtvds="$4" + + log_note "$0: pool '$poolcreate', adding $addvdevs, then" \ + "moving away $missingvdevs." + + log_must zpool create $TESTPOOL1 $poolcreate + + log_must generate_data $TESTPOOL1 $MD5FILE "first" + + log_must zpool add $TESTPOOL1 $addvdevs + + log_must generate_data $TESTPOOL1 $MD5FILE2 "second" + + log_must zpool export $TESTPOOL1 + + log_must mv $missingvdevs $BACKUP_DEVICE_DIR + + # Tell zfs that it is ok to import a pool with missing top-level vdevs + log_must set_zfs_max_missing_tvds $missingtvds + # Missing devices means that data or metadata may be corrupted. 
+ (( missingtvds > 1 )) && log_must set_spa_load_verify_metadata 0 + log_must set_spa_load_verify_data 0 + log_must zpool import -o readonly=on -d $DEVICE_DIR $TESTPOOL1 + + log_must verify_data_md5sums $MD5FILE + + log_note "Try reading second batch of data, make sure pool doesn't" \ + "get suspended." + verify_data_md5sums $MD5FILE2 >/dev/null 2>&1 + + log_must zpool export $TESTPOOL1 + + typeset newpaths=$(echo "$missingvdevs" | \ + sed "s:$DEVICE_DIR:$BACKUP_DEVICE_DIR:g") + log_must mv $newpaths $DEVICE_DIR + log_must set_spa_load_verify_metadata 1 + log_must set_spa_load_verify_data 1 + log_must set_zfs_max_missing_tvds 0 + log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + + log_must verify_data_md5sums $MD5FILE + log_must verify_data_md5sums $MD5FILE2 + + # Cleanup + log_must zpool destroy $TESTPOOL1 + + log_note "" +} + +log_must mkdir -p $BACKUP_DEVICE_DIR + +test_devices_missing "$VDEV0" "$VDEV1" "$VDEV1" 1 +test_devices_missing "$VDEV0" "$VDEV1 $VDEV2" "$VDEV1" 1 +test_devices_missing "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \ + "$VDEV2 $VDEV3" 1 +test_devices_missing "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV2" 1 + +# +# Note that we are testing for 2 non-consecutive missing devices. +# Missing consecutive devices results in missing metadata, and +# missing metadata can cause the root dataset to fail to mount. +# +test_devices_missing "$VDEV0" "$VDEV1 $VDEV2 $VDEV3" "$VDEV1 $VDEV3" 2 + +log_pass "zpool import succeeded with missing devices." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh new file mode 100755 index 0000000000..457eb6a14a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh @@ -0,0 +1,98 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# A pool should be importable even if device paths have changed. +# +# STRATEGY: +# 1. Create a pool. +# 2. Export the pool. +# 3. Change the paths of some of the devices. +# 4. Verify that we can import the pool in a healthy state. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_new_paths +{ + typeset poolcreate="$1" + typeset pathstochange="$2" + + log_note "$0: pool '$poolcreate', changing paths of $pathstochange." + + log_must zpool create $TESTPOOL1 $poolcreate + + log_must zpool export $TESTPOOL1 + + for dev in $pathstochange; do + log_must mv $dev "${dev}_new" + done + + log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + log_must check_pool_healthy $TESTPOOL1 + + # Cleanup + log_must zpool destroy $TESTPOOL1 + for dev in $pathstochange; do + log_must mv "${dev}_new" $dev + done + + log_note "" +} + +function test_swap_paths +{ + typeset poolcreate="$1" + typeset pathtoswap1="$2" + typeset pathtoswap2="$3" + + log_note "$0: pool '$poolcreate', swapping paths of $pathtoswap1" \ + "and $pathtoswap2." 
+ + log_must zpool create $TESTPOOL1 $poolcreate + + log_must zpool export $TESTPOOL1 + + log_must mv $pathtoswap2 "$pathtoswap2.tmp" + log_must mv $pathtoswap1 "$pathtoswap2" + log_must mv "$pathtoswap2.tmp" $pathtoswap1 + + log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + log_must check_pool_healthy $TESTPOOL1 + + # Cleanup + log_must zpool destroy $TESTPOOL1 + + log_note "" +} + +test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1" +test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1" +test_new_paths "$VDEV0 log $VDEV1" "$VDEV1" +test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV1" + +test_swap_paths "$VDEV0 $VDEV1" "$VDEV0" "$VDEV1" +test_swap_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" +test_swap_paths "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" \ + "$VDEV0" "$VDEV2" + +log_pass "zpool import succeeded after changing device paths." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh new file mode 100755 index 0000000000..92d8140156 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh @@ -0,0 +1,239 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# It should be possible to rewind a pool beyond a configuration change. +# +# STRATEGY: +# 1. Create a pool. +# 2. Generate files and remember their md5sum. +# 3. Note last synced txg. 
+# 4. Take a snapshot to make sure old blocks are not overwritten. +# 5. Perform zpool add/attach/detach/remove operation. +# 6. Change device paths if requested and re-import pool. +# 7. Overwrite the files. +# 8. Export the pool. +# 9. Verify that we can rewind the pool to the noted txg. +# 10. Verify that the files are readable and retain their old data. +# +# DISCLAIMER: +# This test can fail since nothing guarantees that old MOS blocks aren't +# overwritten. Snapshots protect datasets and data files but not the MOS. +# sync_some_data_a_few_times interleaves file data and MOS data for a few +# txgs, thus increasing the odds that some txgs will have their MOS data +# left untouched. +# + +verify_runnable "global" + +function custom_cleanup +{ + set_vdev_validate_skip 0 + cleanup +} + +log_onexit custom_cleanup + +function test_common +{ + typeset poolcreate="$1" + typeset addvdevs="$2" + typeset attachargs="${3:-}" + typeset detachvdev="${4:-}" + typeset removevdev="${5:-}" + typeset finalpool="${6:-}" + + typeset poolcheck="$poolcreate" + + log_must zpool create $TESTPOOL1 $poolcreate + + log_must generate_data $TESTPOOL1 $MD5FILE + + # syncing a few times while writing new data increases the odds that MOS + # metadata for some of the txgs will survive + log_must sync_some_data_a_few_times $TESTPOOL1 + typeset txg + txg=$(get_last_txg_synced $TESTPOOL1) + log_must zfs snapshot -r $TESTPOOL1@snap1 + + # + # Perform config change operations + # + if [[ -n $addvdevs ]]; then + log_must zpool add -f $TESTPOOL1 $addvdevs + fi + if [[ -n $attachargs ]]; then + log_must zpool attach $TESTPOOL1 $attachargs + fi + if [[ -n $detachvdev ]]; then + log_must zpool detach $TESTPOOL1 $detachvdev + fi + if [[ -n $removevdev ]]; then + [[ -z $finalpool ]] && + log_fail "Must provide final pool status!" 
+ log_must zpool remove $TESTPOOL1 $removevdev + log_must wait_for_pool_config $TESTPOOL1 "$finalpool" + fi + if [[ -n $pathstochange ]]; then + # + # Change device paths and re-import pool to update labels + # + zpool export $TESTPOOL1 + for dev in $pathstochange; do + log_must mv $dev "${dev}_new" + poolcheck=$(echo "$poolcheck" | \ + sed "s:$dev:${dev}_new:g") + done + zpool import -d $DEVICE_DIR $TESTPOOL1 + fi + + log_must overwrite_data $TESTPOOL1 "" + + log_must zpool export $TESTPOOL1 + + log_must zpool import -d $DEVICE_DIR -T $txg $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcheck" + + log_must verify_data_md5sums $MD5FILE + + # Cleanup + log_must zpool destroy $TESTPOOL1 + if [[ -n $pathstochange ]]; then + for dev in $pathstochange; do + log_must mv "${dev}_new" $dev + done + fi + # Fast way to clear vdev labels + log_must zpool create -f $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4 + log_must zpool destroy $TESTPOOL2 + + log_note "" +} + +function test_add_vdevs +{ + typeset poolcreate="$1" + typeset addvdevs="$2" + + log_note "$0: pool '$poolcreate', add $addvdevs." + + test_common "$poolcreate" "$addvdevs" +} + +function test_attach_vdev +{ + typeset poolcreate="$1" + typeset attachto="$2" + typeset attachvdev="$3" + + log_note "$0: pool '$poolcreate', attach $attachvdev to $attachto." + + test_common "$poolcreate" "" "$attachto $attachvdev" +} + +function test_detach_vdev +{ + typeset poolcreate="$1" + typeset detachvdev="$2" + + log_note "$0: pool '$poolcreate', detach $detachvdev." + + test_common "$poolcreate" "" "" "$detachvdev" +} + +function test_attach_detach_vdev +{ + typeset poolcreate="$1" + typeset attachto="$2" + typeset attachvdev="$3" + typeset detachvdev="$4" + + log_note "$0: pool '$poolcreate', attach $attachvdev to $attachto," \ + "then detach $detachvdev." 
+ + test_common "$poolcreate" "" "$attachto $attachvdev" "$detachvdev" +} + +function test_remove_vdev +{ + typeset poolcreate="$1" + typeset removevdev="$2" + typeset finalpool="$3" + + log_note "$0: pool '$poolcreate', remove $removevdev." + + test_common "$poolcreate" "" "" "" "$removevdev" "$finalpool" +} + +# Record txg history +is_linux && log_must set_tunable32 zfs_txg_history 100 + +# Make the devices bigger to reduce chances of overwriting MOS metadata. +increase_device_sizes $(( FILE_SIZE * 4 )) + +# Part of the rewind test is to see how it reacts to path changes +typeset pathstochange="$VDEV0 $VDEV1 $VDEV2 $VDEV3" + +log_note " == test rewind after device addition == " + +test_add_vdevs "$VDEV0" "$VDEV1" +test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2" +test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2" +test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "log $VDEV1" +test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" + +log_note " == test rewind after device attach == " + +test_attach_vdev "$VDEV0" "$VDEV0" "$VDEV1" +test_attach_vdev "mirror $VDEV0 $VDEV1" "$VDEV0" "$VDEV2" +test_attach_vdev "$VDEV0 $VDEV1" "$VDEV0" "$VDEV2" + +log_note " == test rewind after device removal == " + +# Once we remove a device it will be overlooked in the device scan, so we must +# preserve its original path +pathstochange="$VDEV0 $VDEV2" +test_remove_vdev "$VDEV0 $VDEV1 $VDEV2" "$VDEV1" "$VDEV0 $VDEV2" + +# +# Path change and detach are incompatible. Detach changes the guid of the vdev +# so we have no direct way to link the new path to an existing vdev. 
+# +pathstochange="" + +log_note " == test rewind after device detach == " + +test_detach_vdev "mirror $VDEV0 $VDEV1" "$VDEV1" +test_detach_vdev "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" "$VDEV1" +test_detach_vdev "$VDEV0 log mirror $VDEV1 $VDEV2" "$VDEV2" + +log_note " == test rewind after device attach followed by device detach == " + +# +# We need to disable vdev validation since once we detach VDEV1, VDEV0 will +# inherit the mirror tvd's guid and lose its original guid. +# +set_vdev_validate_skip 1 +test_attach_detach_vdev "$VDEV0" "$VDEV0" "$VDEV1" "$VDEV1" +set_vdev_validate_skip 0 + +log_pass "zpool import rewind after configuration change passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh new file mode 100755 index 0000000000..5ff1c47f32 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh @@ -0,0 +1,186 @@ +#!/usr/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# It should be possible to rewind a pool beyond a device replacement. +# +# STRATEGY: +# 1. Create a pool. +# 2. Generate files and remember their md5sum. +# 3. Sync a few times and note last synced txg. +# 4. Take a snapshot to make sure old blocks are not overwritten. +# 5. Initiate device replacement and export the pool. 
Special care must +# be taken so that resilvering doesn't complete before the export. +# 6. Test 1: Rewind pool to noted txg and then verify data checksums. +# Import it read-only so that we do not overwrite blocks in later txgs. +# 7. Re-import pool at latest txg and let the replacement finish. +# 8. Export the pool and remove the new device - we shouldn't need it. +# 9. Test 2: Rewind pool to noted txg and then verify data checksums. +# +# STRATEGY TO SLOW DOWN RESILVERING: +# 1. Reduce zfs_txg_timeout, which controls how long we can resilver for +# each sync. +# 2. Add data to pool +# 3. Re-import the pool so that data isn't cached +# 4. Use zinject to slow down device I/O +# 5. Trigger the resilvering +# 6. Use spa freeze to stop writing to the pool. +# 7. Clear zinject events (needed to export the pool) +# 8. Export the pool +# +# DISCLAIMER: +# This test can fail since nothing guarantees that old MOS blocks aren't +# overwritten. Snapshots protect datasets and data files but not the MOS. +# sync_some_data_a_few_times interleaves file data and MOS data for a few +# txgs, thus increasing the odds that some txgs will have their MOS data +# left untouched. +# + +verify_runnable "global" + +ZFS_TXG_TIMEOUT="" + +function custom_cleanup +{ + # Revert zfs_txg_timeout to defaults + [[ -n $ZFS_TXG_TIMEOUT ]] && + log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT + log_must rm -rf $BACKUP_DEVICE_DIR + zinject -c all + cleanup +} + +log_onexit custom_cleanup + +function test_replace_vdev +{ + typeset poolcreate="$1" + typeset replacevdev="$2" + typeset replaceby="$3" + typeset poolfinalstate="$4" + typeset zinjectdevices="$5" + typeset writedata="$6" + + log_note "$0: pool '$poolcreate', replace $replacevdev by $replaceby." 
+ + log_must zpool create $TESTPOOL1 $poolcreate + + # generate data and checksum it + log_must generate_data $TESTPOOL1 $MD5FILE + + # add more data so that resilver takes longer + log_must write_some_data $TESTPOOL1 $writedata + + # Syncing a few times while writing new data increases the odds that + # MOS metadata for some of the txgs will survive. + log_must sync_some_data_a_few_times $TESTPOOL1 + typeset txg + txg=$(get_last_txg_synced $TESTPOOL1) + log_must zfs snapshot -r $TESTPOOL1@snap1 + + # This should not free original data. + log_must overwrite_data $TESTPOOL1 "" + + # Steps to ensure resilvering happens very slowly. + log_must zpool export $TESTPOOL1 + log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + typeset device + for device in $zinjectdevices ; do + log_must zinject -d $device -D 200:1 $TESTPOOL1 > /dev/null + done + log_must zpool replace $TESTPOOL1 $replacevdev $replaceby + + # We must disable zinject in order to export the pool, so we freeze + # it first to prevent writing out subsequent resilvering progress. + log_must zpool freeze $TESTPOOL1 + # Confirm pool is still replacing + log_must pool_is_replacing $TESTPOOL1 + log_must zinject -c all > /dev/null + log_must zpool export $TESTPOOL1 + + ############################################################ + # Test 1: rewind while device is resilvering. + # Import read only to avoid overwriting more recent blocks. 
+ ############################################################ + log_must zpool import -d $DEVICE_DIR -o readonly=on -T $txg $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcreate" + + log_must verify_data_md5sums $MD5FILE + + log_must zpool export $TESTPOOL1 + + # Import pool at latest txg to finish the resilvering + log_must zpool import -d $DEVICE_DIR $TESTPOOL1 + log_must overwrite_data $TESTPOOL1 "" + log_must wait_for_pool_config $TESTPOOL1 "$poolfinalstate" + log_must zpool export $TESTPOOL1 + + # Move out the new device + log_must mv $replaceby $BACKUP_DEVICE_DIR/ + + ############################################################ + # Test 2: rewind after device has been replaced. + # Import read-write since we won't need the pool anymore. + ############################################################ + log_must zpool import -d $DEVICE_DIR -T $txg $TESTPOOL1 + log_must check_pool_config $TESTPOOL1 "$poolcreate" + + log_must verify_data_md5sums $MD5FILE + + # Cleanup + log_must zpool destroy $TESTPOOL1 + # Restore the device we moved out + log_must mv "$BACKUP_DEVICE_DIR/$(basename $replaceby)" $DEVICE_DIR/ + # Fast way to clear vdev labels + log_must zpool create -f $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4 + log_must zpool destroy $TESTPOOL2 + + log_note "" +} + +# Record txg history +is_linux && log_must set_tunable32 zfs_txg_history 100 + +log_must mkdir -p $BACKUP_DEVICE_DIR +# Make the devices bigger to reduce chances of overwriting MOS metadata. +increase_device_sizes $(( FILE_SIZE * 4 )) + +# We set zfs_txg_timeout to 1 to reduce resilvering time at each sync. 
+ZFS_TXG_TIMEOUT=$(get_zfs_txg_timeout) +set_zfs_txg_timeout 1 + +test_replace_vdev "$VDEV0 $VDEV1" \ + "$VDEV1" "$VDEV2" \ + "$VDEV0 $VDEV2" \ + "$VDEV0 $VDEV1" 15 + +test_replace_vdev "mirror $VDEV0 $VDEV1" \ + "$VDEV1" "$VDEV2" \ + "mirror $VDEV0 $VDEV2" \ + "$VDEV0 $VDEV1" 10 + +test_replace_vdev "raidz $VDEV0 $VDEV1 $VDEV2" \ + "$VDEV1" "$VDEV3" \ + "raidz $VDEV0 $VDEV3 $VDEV2" \ + "$VDEV0 $VDEV1 $VDEV2" 10 + +set_zfs_txg_timeout $ZFS_TXG_TIMEOUT + +log_pass "zpool import rewind after device replacement passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh index 142771de6c..d81e66636d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh @@ -69,26 +69,14 @@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS DISK2="$(echo $DISKS | nawk '{print $2}')" -if is_mpath_device $DISK2; then - echo "y" | newfs -v $DEV_DSKDIR/$DISK2 >/dev/null 2>&1 - (( $? != 0 )) && - log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system" +echo "y" | newfs -v $DEV_DSKDIR/$DISK2 >/dev/null 2>&1 +(( $? != 0 )) && + log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system" - [[ ! -d $DEVICE_DIR ]] && \ - log_must mkdir -p $DEVICE_DIR +[[ ! -d $DEVICE_DIR ]] && \ + log_must mkdir -p $DEVICE_DIR - log_must mount $DEV_DSKDIR/$DISK2 $DEVICE_DIR -else - log_must set_partition 0 "" $FS_SIZE $ZFS_DISK2 - echo "y" | newfs -v $DEV_DSKDIR/$ZFSSIDE_DISK2 >/dev/null 2>&1 - (( $? != 0 )) && - log_untested "Unable to setup a $NEWFS_DEFAULT_FS file system" - - [[ ! 
-d $DEVICE_DIR ]] && \ - log_must mkdir -p $DEVICE_DIR - - log_must mount $DEV_DSKDIR/$ZFSSIDE_DISK2 $DEVICE_DIR -fi +log_must mount $DEV_DSKDIR/$DISK2 $DEVICE_DIR i=0 while (( i < $MAX_NUM )); do diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 648f82c2b6..20f43cefa0 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -25,7 +25,7 @@ # # -# Copyright (c) 2012, 2015 by Delphix. All rights reserved. +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -57,10 +57,8 @@ case "${#disk_array[*]}" in if ( is_mpath_device $ZFS_DISK1 ) && [[ -z $(echo $ZFS_DISK1 | awk 'substr($1,18,1)\ ~ /^[[:digit:]]+$/') ]] || ( is_real_device $ZFS_DISK1 ); then ZFSSIDE_DISK1=${ZFS_DISK1}1 - ZFSSIDE_DISK2=${ZFS_DISK2}2 elif ( is_mpath_device $ZFS_DISK1 || is_loop_device $ZFS_DISK1 ); then ZFSSIDE_DISK1=${ZFS_DISK1}p1 - ZFSSIDE_DISK2=${ZFS_DISK2}p2 else log_fail "$ZFS_DISK1 not supported for partitioning." fi @@ -71,7 +69,6 @@ case "${#disk_array[*]}" in ZFS_DISK1=${disk_array[0]} ZFSSIDE_DISK1=${ZFS_DISK1}s0 ZFS_DISK2=${disk_array[0]} - ZFSSIDE_DISK2=${ZFS_DISK2}s1 fi ;; *) @@ -96,14 +93,6 @@ case "${#disk_array[*]}" in log_fail "$ZFS_DISK1 not supported for partitioning." fi ZFS_DISK2=${disk_array[1]} - if ( is_mpath_device $ZFS_DISK2 ) && [[ -z $(echo $ZFS_DISK2 | awk 'substr($1,18,1)\ - ~ /^[[:digit:]]+$/') ]] || ( is_real_device $ZFS_DISK2 ); then - ZFSSIDE_DISK2=${ZFS_DISK2}1 - elif ( is_mpath_device $ZFS_DISK2 || is_loop_device $ZFS_DISK2 ); then - ZFSSIDE_DISK2=${ZFS_DISK2}p1 - else - log_fail "$ZFS_DISK2 not supported for partitioning." 
- fi else export DEV_DSKDIR="/dev" PRIMARY_SLICE=2 @@ -111,15 +100,14 @@ case "${#disk_array[*]}" in ZFS_DISK1=${disk_array[0]} ZFSSIDE_DISK1=${ZFS_DISK1}s0 ZFS_DISK2=${disk_array[1]} - ZFSSIDE_DISK2=${ZFS_DISK2}s0 fi ;; esac -export DISK_COUNT ZFS_DISK1 ZFSSIDE_DISK1 ZFS_DISK2 ZFSSIDE_DISK2 +export DISK_COUNT ZFS_DISK1 ZFSSIDE_DISK1 ZFS_DISK2 -export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 16))m" -export FILE_SIZE="$(($MINVDEVSIZE / 2))" +export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 32))m" +export FILE_SIZE="$((MINVDEVSIZE))" export SLICE_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 2))m" export MAX_NUM=5 export GROUP_NUM=3 @@ -129,6 +117,12 @@ export DEVICE_FILE=disk export DEVICE_ARCHIVE=archive_import-test export MYTESTFILE=$STF_SUITE/include/libtest.shlib +export CPATH=/var/tmp/cachefile.$$ +export CPATHBKP=/var/tmp/cachefile.$$.bkp +export CPATHBKP2=/var/tmp/cachefile.$$.bkp2 +export MD5FILE=/var/tmp/md5sums.$$ +export MD5FILE2=/var/tmp/md5sums.$$.2 + typeset -i num=0 while (( num < $GROUP_NUM )); do DEVICE_FILES="$DEVICE_FILES ${DEVICE_DIR}/${DEVICE_FILE}$num" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib new file mode 100644 index 0000000000..bc89d8159c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -0,0 +1,376 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
+
+#
+# Prototype cleanup function for zpool_import tests.
+#
+# Destroys $TESTPOOL1, removes the cachefile/md5sum scratch files, then
+# empties $DEVICE_DIR and recreates $MAX_NUM device files of $FILE_SIZE.
+# On Linux it also sets the zfs_txg_history tunable to 0.
+#
+function cleanup
+{
+	destroy_pool $TESTPOOL1
+
+	log_must rm -f $CPATH $CPATHBKP $CPATHBKP2 $MD5FILE $MD5FILE2
+
+	# ${DEVICE_DIR:?} aborts instead of expanding to "/*" should the
+	# variable ever be unset or empty.
+	log_must rm -rf "${DEVICE_DIR:?}"/*
+	typeset i=0
+	while (( i < $MAX_NUM )); do
+		log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i
+		((i += 1))
+	done
+	is_linux && set_tunable32 "zfs_txg_history" 0
+}
+
+#
+# Write a bit of data and sync several times.
+# This function is intended to be used by zpool rewind tests.
+#
+# $1: pool name (files are written under /$1)
+# $2: upper bound of the pass counter, default 10. The range 0..$2 is
+#     inclusive, so $2 + 1 write-and-sync passes are performed.
+#
+# Always returns 0; dd failures are not checked.
+#
+function sync_some_data_a_few_times
+{
+	typeset pool=$1
+	typeset -i a_few_times=${2:-10}
+
+	typeset file="/$pool/tmpfile"
+	# Keep the loop counter local so a caller's own 'i' is not clobbered.
+	typeset i
+	for i in {0..$a_few_times}; do
+		dd if=/dev/urandom of=${file}_$i bs=128k count=10
+		sync_pool "$pool"
+	done
+
+	return 0
+}
+
+#
+# Just write a moderate amount of data to the pool.
+#
+# $1: pool name
+# $2: number of 10 MB files to write, default 10
+#
+# Creates the dataset $1/fillerds and fills it with random data.
+# Returns 1 if the dataset cannot be created or a write fails, else 0.
+#
+function write_some_data
+{
+	typeset pool=$1
+	typeset files10mb=${2:-10}
+
+	typeset ds="$pool/fillerds"
+	zfs create $ds
+	[[ $? -ne 0 ]] && return 1
+
+	# Each file is 80 x 128k = 10 MB, i.e. 100 MB with the default count.
+	typeset file="/$ds/fillerfile"
+	typeset i
+	for i in {1..$files10mb}; do
+		dd if=/dev/urandom of=$file.$i bs=128k count=80
+		[[ $? -ne 0 ]] && return 1
+	done
+
+	return 0
+}
+
+#
+# Create/overwrite a few datasets with files.
+# Apply md5sum on all the files and store checksums in a file.
+#
+# newdata: overwrite existing files if false.
+# md5file: file where to store md5sums
+# datasetname: base name for datasets
+#
+function _generate_data_common
+{
+	typeset pool=$1
+	typeset newdata=$2
+	typeset md5file=$3
+	typeset datasetname=$4
+
+	typeset -i datasets=3
+	typeset -i files=5
+	typeset -i blocks=10
+
+	# Loop counters are local so callers' variables are left alone.
+	typeset i j
+
+	# Start from an empty checksum file; entries are appended below.
+	[[ -n $md5file ]] && rm -f $md5file
+	for i in {1..$datasets}; do
+		# $newdata is the command 'true' or 'false', run for its status.
+		( $newdata ) && log_must zfs create "$pool/$datasetname$i"
+		for j in {1..$files}; do
+			typeset file="/$pool/$datasetname$i/file$j"
+			dd if=/dev/urandom of=$file bs=128k count=$blocks > /dev/null
+			[[ -n $md5file ]] && md5sum $file >> $md5file
+		done
+		( $newdata ) && sync_pool "$pool"
+	done
+
+	return 0
+}
+
+#
+# Create new datasets <pool>/<datasetname>N filled with random files and
+# sync the pool after each dataset.
+#
+# $1: pool name
+# $2: file where to store md5sums (may be empty to skip checksumming)
+# $3: base name for datasets, default "ds"
+#
+function generate_data
+{
+	typeset pool=$1
+	typeset md5file="$2"
+	typeset datasetname=${3:-ds}
+
+	_generate_data_common $pool true "$md5file" $datasetname
+}
+
+#
+# Rewrite the files of datasets previously made by generate_data. No dataset
+# is created and the pool is not explicitly synced.
+#
+# $1: pool name
+# $2: file where to store md5sums (may be empty to skip checksumming)
+# $3: base name for datasets, default "ds"
+#
+function overwrite_data
+{
+	typeset pool=$1
+	typeset md5file="$2"
+	typeset datasetname=${3:-ds}
+
+	_generate_data_common $pool false "$md5file" $datasetname
+}
+
+#
+# Verify md5sums of every file in md5sum file $1.
+#
+# Returns 1 if the checksum file is missing, otherwise the exit status of
+# 'md5sum -c'.
+#
+function verify_data_md5sums
+{
+	typeset md5file=$1
+
+	if [[ ! -f $md5file ]]; then
+		log_note "md5 sums file '$md5file' doesn't exist"
+		return 1
+	fi
+
+	md5sum -c --quiet $md5file
+	return $?
+}
+
+#
+# Set devices size in DEVICE_DIR to $1.
+#
+function increase_device_sizes
+{
+	typeset newfilesize=$1
+
+	typeset -i i=0
+	while (( i < $MAX_NUM )); do
+		log_must mkfile $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i
+		((i += 1))
+	done
+}
+
+#
+# Translate vdev names returned by zpool status into more generic names.
+#
+# eg: mirror-2 --> mirror
+#
+# Additionally "logs" becomes "log" and "raidz1" becomes "raidz", matching
+# the spelling used on the zpool create command line. Any other name is
+# echoed unchanged.
+#
+function _translate_vdev
+{
+	typeset vdev=$1
+
+	typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect"
+	typeset word
+	for word in $keywords; do
+		# grep -E replaces the obsolescent egrep.
+		if echo "$vdev" | grep -E "^${word}-[0-9]+\$" > /dev/null; then
+			vdev=$word
+			break
+		fi
+	done
+
+	[[ $vdev == "logs" ]] && echo "log" && return 0
+	[[ $vdev == "raidz1" ]] && echo "raidz" && return 0
+
+	echo $vdev
+	return 0
+}
+
+#
+# Check that pool configuration returned by zpool status matches expected
+# configuration. Format for the check string is same as the vdev arguments for
+# creating a pool
+# Add -q for quiet mode.
+#
+# eg: check_pool_config pool1 "mirror c0t0d0s0 c0t1d0s0 log c1t1d0s0"
+#
+# Returns 0 on a match, 1 on a mismatch or if zpool status fails.
+#
+function check_pool_config
+{
+	typeset logfailure=true
+	if [[ $1 == '-q' ]]; then
+		logfailure=false
+		shift
+	fi
+
+	typeset poolname=$1
+	typeset expected=$2
+
+	typeset status
+	status=$(zpool status $poolname 2>&1)
+	if [[ $? -ne 0 ]]; then
+		if ( $logfailure ); then
+			log_note "zpool status $poolname failed: $status"
+		fi
+		return 1
+	fi
+
+	typeset actual=""
+	typeset began=false
+	typeset vdev rest
+	#
+	# Feed the status text as data, never as the printf format: a '%' or
+	# backslash in the output must not be interpreted. Only the first
+	# column of each line is needed, so let read split it off.
+	#
+	# NOTE: 'actual' is read after the loop, so the loop has to run in the
+	# current shell; ksh does that for the last stage of a pipeline.
+	#
+	printf '%s\n' "$status" | while read -r vdev rest; do
+		# Skip everything up to and including the "NAME ..." header.
+		if ( ! $began ) && [[ $vdev == NAME ]]; then
+			began=true
+			continue
+		fi
+		# The first blank line after the header ends the config section.
+		( $began ) && [[ -z $vdev ]] && break;
+
+		if ( $began ); then
+			# First row is the pool itself; keep its name untranslated.
+			[[ -z $actual ]] && actual="$vdev" && continue
+			vdev=$(_translate_vdev $vdev)
+			actual="$actual $vdev"
+		fi
+	done
+
+	expected="$poolname $expected"
+
+	if [[ "$actual" != "$expected" ]]; then
+		if ( $logfailure ); then
+			log_note "expected pool vdevs:"
+			log_note "> '$expected'"
+			log_note "actual pool vdevs:"
+			log_note "> '$actual'"
+		fi
+		return 1
+	fi
+
+	return 0
+}
+
+#
+# Check that pool configuration returned by zpool status matches expected
+# configuration within a given timeout in seconds. See check_pool_config().
+#
+# eg: wait_for_pool_config pool1 "mirror c0t0d0s0 c0t1d0s0" 60
+#
+# Polls quietly every 3 seconds; once the deadline has passed a final,
+# non-quiet check is made and its status is returned.
+#
+function wait_for_pool_config
+{
+	typeset poolname=$1
+	typeset expectedconfig="$2"
+	typeset -i timeout=${3:-60}
+
+	# Turn the relative timeout into an absolute deadline (epoch seconds).
+	timeout=$(( $timeout + $(date +%s) ))
+
+	while (( $(date +%s) < $timeout )); do
+		check_pool_config -q $poolname "$expectedconfig"
+		[[ $? -eq 0 ]] && return 0
+		sleep 3
+	done
+
+	check_pool_config $poolname "$expectedconfig"
+	return $?
+}
+
+#
+# Check that pool status is ONLINE
+#
+# Returns 0 if it is, 1 otherwise or if zpool status fails.
+#
+function check_pool_healthy
+{
+	typeset pool=$1
+
+	typeset status
+	status=$(zpool status $pool 2>&1)
+	if [[ $? -ne 0 ]]; then
+		log_note "zpool status $pool failed: $status"
+		return 1
+	fi
+
+	# Take the state column of the row whose first column is exactly the
+	# pool name. A substring match could also pick up other lines that
+	# merely mention the name and yield a multi-line result.
+	status=$(echo "$status" | awk -v pool="$pool" '$1 == pool {print $2}')
+
+	if [[ $status != "ONLINE" ]]; then
+		log_note "Invalid zpool status for '$pool': '$status'" \
+		    "!= 'ONLINE'"
+		return 1
+	fi
+
+	return 0
+}
+
+#
+# Return 0 if a device is currently being replaced in the pool.
+#
+function pool_is_replacing
+{
+	typeset pool=$1
+
+	zpool status $pool | grep "replacing" | grep "ONLINE" > /dev/null
+
+	return $?
+}
+
+#
+# Thin wrappers that set (or read) a single named tunable.
+#
+function set_vdev_validate_skip
+{
+	set_tunable32 "vdev_validate_skip" "$1"
+}
+
+function get_zfs_txg_timeout
+{
+	get_tunable "zfs_txg_timeout"
+}
+
+function set_zfs_txg_timeout
+{
+	set_tunable32 "zfs_txg_timeout" "$1"
+}
+
+function set_spa_load_verify_metadata
+{
+	set_tunable32 "spa_load_verify_metadata" "$1"
+}
+
+function set_spa_load_verify_data
+{
+	set_tunable32 "spa_load_verify_data" "$1"
+}
+
+function set_zfs_max_missing_tvds
+{
+	set_tunable32 "zfs_max_missing_tvds" "$1"
+}
+
+#
+# Find the last txg that was synced in an active pool and print it.
+#
+# On Linux the tail of the pool's txgs kstat is scanned for the last entry
+# whose third column is "C"; 0 is printed if there is none. Elsewhere mdb is
+# used to read spa_ubsync.ub_txg from the pool's spa_t.
+#
+function get_last_txg_synced
+{
+	typeset pool=$1
+
+	if is_linux; then
+		# Local, so the caller's 'txg' (if any) is not overwritten.
+		typeset txg
+		txg=$(tail "/proc/spl/kstat/zfs/$pool/txgs" |
+		    awk '$3=="C" {print $1}' | tail -1)
+		[[ "$txg" ]] || txg=0
+		echo $txg
+		return 0
+	fi
+
+	typeset spas
+	spas=$(mdb -k -e "::spa")
+	[[ $? -ne 0 ]] && return 1
+
+	# Look up the spa_t address of the requested pool: the loop reads the
+	# address from column 1 and the pool name from column 3.
+	typeset spa=""
+	typeset line
+	print "$spas\n" | while read line; do
+		typeset poolname=$(echo "$line" | awk '{print $3}')
+		typeset addr=$(echo "$line" | awk '{print $1}')
+		if [[ $poolname == $pool ]]; then
+			spa=$addr
+			break
+		fi
+	done
+	if [[ -z $spa ]]; then
+		log_fail "Couldn't find pool '$pool'"
+		return 1
+	fi
+	typeset mdbcmd="$spa::print spa_t spa_ubsync.ub_txg | ::eval '.=E'"
+	typeset -i txg
+	txg=$(mdb -k -e "$mdbcmd")
+	[[ $? -ne 0 ]] && return 1
+
+	echo $txg
+	return 0
+}