From d87676a9fa0b434fe3073933c4125074b8a48325 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 5 Aug 2020 10:22:09 -0700 Subject: [PATCH] Fix i/o error handling of livelists and zap iteration Pool-wide metadata is stored in the MOS (Meta Object Set). This metadata is stored in triplicate, in addition to any pool-level reduncancy (e.g. RAIDZ). However, if all 3+ copies of this metadata are not available, we can still get EIO/ECKSUM when reading from the MOS. If we encounter such an error in syncing context, we have typically already committed to making a change that we now can't do because of the corrupt/missing metadata. We typically "handle" this with a `VERIFY()` or `zfs_panic_recover()`. This prevents the system from continuing on in an undefined state, while minimizing the amount of error-handling code. However, there are some code paths that ignore these i/o errors, or `ASSERT()` that they don't happen. Since assertions are disabled on non-debug builds, they effectively ignore them as well. This can lead to ZFS continuing on in an incorrect state, potentially leading to on-disk inconsistencies. This commit adds handling for these i/o errors on MOS metadata, typically with a `VERIFY()`: * Handle error return from `zap_cursor_retrieve()` in 4 places in `dsl_deadlist.c`. * Handle error return from `zap_contains()` in `dsl_dir_hold_obj()`. Turns out this call isn't necessary because we can always call `zap_lookup()`. * Handle error return from `zap_lookup()` in `dsl_fs_ss_limit_check()`. * Handle error return from `zap_remove()` in `dsl_dir_rename_sync()`. * Handle error return from `zap_lookup()` in `dsl_dir_remove_livelist()`. * Handle error return from `dsl_process_sub_livelist()` in `spa_livelist_delete_cb()`. Additionally: * Augment the internal history log message for `zfs destroy` to note which method is used (e.g. bptree, livelist, or, synchronous) and the mintxg. * Correct a comment in `dbuf_init()`. * Correct indentation in `dsl_dir_remove_livelist()`. Reviewed by: Sara Hartse Reviewed-by: George Wilson Signed-off-by: Matthew Ahrens Closes #10643 --- module/zfs/dsl_deadlist.c | 16 +++-- module/zfs/dsl_destroy.c | 21 +++++-- module/zfs/dsl_dir.c | 121 +++++++++++++++++++------------------- module/zfs/spa.c | 2 +- 4 files changed, 91 insertions(+), 69 deletions(-) diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 3d16f61f62..bad2d56eef 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -134,6 +134,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; + int error; ASSERT(MUTEX_HELD(&dl->dl_lock)); @@ -162,7 +163,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) sizeof (dsl_deadlist_entry_t), offsetof(dsl_deadlist_entry_t, dle_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); @@ -177,6 +178,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) avl_add(&dl->dl_tree, dle); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); @@ -206,6 +208,7 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; + int error; ASSERT(MUTEX_HELD(&dl->dl_lock)); @@ -219,7 +222,7 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) sizeof (dsl_deadlist_cache_entry_t), offsetof(dsl_deadlist_cache_entry_t, dlce_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if (za.za_first_integer == empty_bpobj) continue; @@ -236,6 +239,7 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) 0, 0, 0, ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); @@ -378,6 +382,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) dmu_object_info_t doi; zap_cursor_t zc; zap_attribute_t za; + int error; VERIFY0(dmu_object_info(os, dlobj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { @@ -386,7 +391,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) } for (zap_cursor_init(&zc, os, dlobj); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t obj = za.za_first_integer; if (obj == dmu_objset_pool(os)->dp_empty_bpobj) @@ -394,6 +399,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) else bpobj_free(os, obj, tx); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); VERIFY0(dmu_object_free(os, dlobj, tx)); } @@ -824,6 +830,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; + int error; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { @@ -836,12 +843,13 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) mutex_enter(&dl->dl_lock); for (zap_cursor_init(&zc, dl->dl_os, obj); - zap_cursor_retrieve(&zc, &za) == 0; + (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); } + VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 190dbdef5f..26fdf96341 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -738,6 +738,10 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) { struct killarg ka; + spa_history_log_internal_ds(ds, "destroy", tx, + "(synchronous, mintxg=%llu)", + (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg); + /* * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) @@ -902,6 +906,14 @@ dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) spa_t *spa = dmu_tx_pool(tx)->dp_spa; VERIFY0(dmu_objset_from_ds(ds, &os)); + uint64_t mintxg = 0; + dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist); + if (dle != NULL) + mintxg = dle->dle_mintxg; + + spa_history_log_internal_ds(ds, "destroy", tx, + "(livelist, mintxg=%llu)", (long long)mintxg); + /* Check that the clone is in a correct state to be deleted */ dsl_clone_destroy_assert(dd); @@ -922,7 +934,7 @@ dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) spa->spa_livelists_to_delete = zap_obj; } else if (error != 0) { zfs_panic_recover("zfs: error %d was returned while looking " - "up DMU_POOL_DELETED_CLONES in the zap"); + "up DMU_POOL_DELETED_CLONES in the zap", error); return; } VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx)); @@ -952,6 +964,10 @@ dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; + spa_history_log_internal_ds(ds, "destroy", tx, + "(bptree, mintxg=%llu)", + (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg); + zil_destroy_sync(dmu_objset_zil(os), tx); if (!spa_feature_is_active(dp->dp_spa, @@ -1003,9 +1019,6 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); - /* We need to log before removing it from the namespace. */ - spa_history_log_internal_ds(ds, "destroy", tx, " "); - dsl_dir_cancel_waiters(ds->ds_dir); rmorigin = (dsl_dir_is_clone(ds->ds_dir) && diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index af369d1c7d..73b7943b8f 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -197,25 +197,27 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_dbuf = dbuf; dd->dd_pool = dp; - if (dsl_dir_is_zapified(dd) && - zap_contains(dp->dp_meta_objset, ddobj, - DD_FIELD_CRYPTO_KEY_OBJ) == 0) { - VERIFY0(zap_lookup(dp->dp_meta_objset, - ddobj, DD_FIELD_CRYPTO_KEY_OBJ, - sizeof (uint64_t), 1, &dd->dd_crypto_obj)); - - /* check for on-disk format errata */ - if (dsl_dir_incompatible_encryption_version(dd)) { - dp->dp_spa->spa_errata = - ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; - } - } - mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); dsl_prop_init(dd); + if (dsl_dir_is_zapified(dd)) { + err = zap_lookup(dp->dp_meta_objset, + ddobj, DD_FIELD_CRYPTO_KEY_OBJ, + sizeof (uint64_t), 1, &dd->dd_crypto_obj); + if (err == 0) { + /* check for on-disk format errata */ + if (dsl_dir_incompatible_encryption_version( + dd)) { + dp->dp_spa->spa_errata = + ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + } + } else if (err != ENOENT) { + goto errout; + } + } + dsl_dir_snap_cmtime_update(dd); if (dsl_dir_phys(dd)->dd_parent_obj) { @@ -863,9 +865,14 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ - if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, - count_prop, sizeof (count), 1, &count) == ENOENT) + if (!dsl_dir_is_zapified(dd)) return (0); + err = zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); @@ -2047,7 +2054,6 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; - int error; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); @@ -2114,10 +2120,9 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ - error = zap_remove(mos, + VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, - dd->dd_myname, tx); - ASSERT0(error); + dd->dd_myname, tx)); (void) strlcpy(dd->dd_myname, mynewname, sizeof (dd->dd_myname)); @@ -2259,49 +2264,45 @@ dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL && (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { - /* - * We use zthr_wait_cycle_done instead of zthr_cancel - * because we don't want to destroy the zthr, just have - * it skip its current task. - */ - spa->spa_to_condense.cancelled = B_TRUE; - zthr_wait_cycle_done(ll_condense_thread); - /* - * If we've returned from zthr_wait_cycle_done without - * clearing the to_condense data structure it's either - * because the no-wait synctask has started (which is - * indicated by 'syncing' field of to_condense) and we - * can expect it to clear to_condense on its own. - * Otherwise, we returned before the zthr ran. The - * checkfunc will now fail as cancelled == B_TRUE so we - * can safely NULL out ds, allowing a different dir's - * livelist to be condensed. - * - * We can be sure that the to_condense struct will not - * be repopulated at this stage because both this - * function and dsl_livelist_try_condense execute in - * syncing context. - */ - if ((spa->spa_to_condense.ds != NULL) && - !spa->spa_to_condense.syncing) { - dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, - spa); - spa->spa_to_condense.ds = NULL; - } + /* + * We use zthr_wait_cycle_done instead of zthr_cancel + * because we don't want to destroy the zthr, just have + * it skip its current task. + */ + spa->spa_to_condense.cancelled = B_TRUE; + zthr_wait_cycle_done(ll_condense_thread); + /* + * If we've returned from zthr_wait_cycle_done without + * clearing the to_condense data structure it's either + * because the no-wait synctask has started (which is + * indicated by 'syncing' field of to_condense) and we + * can expect it to clear to_condense on its own. + * Otherwise, we returned before the zthr ran. The + * checkfunc will now fail as cancelled == B_TRUE so we + * can safely NULL out ds, allowing a different dir's + * livelist to be condensed. + * + * We can be sure that the to_condense struct will not + * be repopulated at this stage because both this + * function and dsl_livelist_try_condense execute in + * syncing context. + */ + if ((spa->spa_to_condense.ds != NULL) && + !spa->spa_to_condense.syncing) { + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, + spa); + spa->spa_to_condense.ds = NULL; + } } dsl_dir_livelist_close(dd); - int err = zap_lookup(dp->dp_meta_objset, dd->dd_object, - DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj); - if (err == 0) { - VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, - DD_FIELD_LIVELIST, tx)); - if (total) { - dsl_deadlist_free(dp->dp_meta_objset, obj, tx); - spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); - } - } else { - ASSERT3U(err, !=, ENOENT); + VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); + VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, tx)); + if (total) { + dsl_deadlist_free(dp->dp_meta_objset, obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); } } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 86e5d0125f..79b88e9111 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2527,7 +2527,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { - ASSERT(err == EINTR); + VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free);