From 018f26041d67447de28c293c35e1f664ce3c1abb Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 22 Dec 2022 11:48:49 -0800 Subject: [PATCH] deadlock between spa_errlog_lock and dp_config_rwlock There is a lock order inversion deadlock between `spa_errlog_lock` and `dp_config_rwlock`: A thread in `spa_delete_dataset_errlog()` is running from a sync task. It is holding the `dp_config_rwlock` for writer (see `dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`. A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock` (see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as reader). Note that this was introduced by #12812. This commit addresses this by defining the lock ordering to be dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock. spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this order, and then process_error_block() and get_head_and_birth_txg() can verify that the dp_config_rwlock is already held. Additionally, a buffer overrun in `spa_get_errlog()` is corrected. Many code paths didn't check if `*count` got to zero, instead continuing to overwrite past the beginning of the userspace buffer at `uaddr`. Tested by having some errors in the pool (via `zinject -t data /path/to/file`), one thread running `zpool iostat 0.001`, and another thread running `zfs destroy` (in a loop, although it hits the first time). This reproduces the problem easily without the fix, and works with the fix. 
Reviewed-by: Mark Maybee Reviewed-by: George Amanakis Reviewed-by: George Wilson Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #14239 Closes #14289 --- cmd/zpool/zpool_main.c | 32 +---- cmd/ztest.c | 2 +- include/sys/spa.h | 2 +- lib/libzfs/libzfs_pool.c | 46 +++---- lib/libzfs/libzfs_status.c | 2 +- module/zfs/dsl_scan.c | 8 +- module/zfs/spa.c | 2 +- module/zfs/spa_errlog.c | 267 ++++++++++++++----------------------- module/zfs/zfs_ioctl.c | 7 +- 9 files changed, 138 insertions(+), 230 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index af76d20f2d..04e80efbb3 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -8599,37 +8599,17 @@ status_callback(zpool_handle_t *zhp, void *data) if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { - nvlist_t *nverrlist = NULL; - - /* - * If the approximate error count is small, get a - * precise count by fetching the entire log and - * uniquifying the results. - */ - if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && - zpool_get_errlog(zhp, &nverrlist) == 0) { - nvpair_t *elem; - - elem = NULL; - nerr = 0; - while ((elem = nvlist_next_nvpair(nverrlist, - elem)) != NULL) { - nerr++; - } - } - nvlist_free(nverrlist); - (void) printf("\n"); - - if (nerr == 0) - (void) printf(gettext("errors: No known data " - "errors\n")); - else if (!cbp->cb_verbose) + if (nerr == 0) { + (void) printf(gettext( + "errors: No known data errors\n")); + } else if (!cbp->cb_verbose) { (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); - else + } else { print_error_log(zhp); + } } if (cbp->cb_dedup_stats) diff --git a/cmd/ztest.c b/cmd/ztest.c index 605ce5cea3..4a031dac21 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6313,7 +6313,7 @@ ztest_scrub_impl(spa_t *spa) while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); - if (spa_get_errlog_size(spa) > 0) + if (spa_approx_errlog_size(spa) > 0) return 
(ECKSUM); ztest_pool_scrubbed = B_TRUE; diff --git a/include/sys/spa.h b/include/sys/spa.h index b260dd3282..500eb3491a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1146,7 +1146,7 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); -extern uint64_t spa_get_errlog_size(spa_t *spa); +extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 7f7e19a090..2977986e1e 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -4133,33 +4133,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; - uint64_t count; - zbookmark_phys_t *zb = NULL; - int i; + zbookmark_phys_t *buf; + uint64_t buflen = 10000; /* approx. 1MB of RAM */ + + if (fnvlist_lookup_uint64(zhp->zpool_config, + ZPOOL_CONFIG_ERRCOUNT) == 0) + return (0); /* - * Retrieve the raw error list from the kernel. If the number of errors - * has increased, allocate more space and continue until we get the - * entire list. + * Retrieve the raw error list from the kernel. If it doesn't fit, + * allocate a larger buffer and retry. 
*/ - count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT); - if (count == 0) - return (0); - zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl, - count * sizeof (zbookmark_phys_t)); - zc.zc_nvlist_dst_size = count; (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { + buf = zfs_alloc(zhp->zpool_hdl, + buflen * sizeof (zbookmark_phys_t)); + zc.zc_nvlist_dst = (uintptr_t)buf; + zc.zc_nvlist_dst_size = buflen; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG, &zc) != 0) { - free((void *)(uintptr_t)zc.zc_nvlist_dst); + free(buf); if (errno == ENOMEM) { - void *dst; - - count = zc.zc_nvlist_dst_size; - dst = zfs_alloc(zhp->zpool_hdl, count * - sizeof (zbookmark_phys_t)); - zc.zc_nvlist_dst = (uintptr_t)dst; + buflen *= 2; } else { return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "errors: List of " @@ -4177,18 +4172,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. */ - zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) + - zc.zc_nvlist_dst_size; - count -= zc.zc_nvlist_dst_size; + zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size; + uint64_t zblen = buflen - zc.zc_nvlist_dst_size; - qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); + qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. 
*/ - for (i = 0; i < count; i++) { + for (uint64_t i = 0; i < zblen; i++) { nvlist_t *nv; /* ignoring zb_blkid and zb_level for now */ @@ -4215,11 +4209,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) nvlist_free(nv); } - free((void *)(uintptr_t)zc.zc_nvlist_dst); + free(buf); return (0); nomem: - free((void *)(uintptr_t)zc.zc_nvlist_dst); + free(buf); return (no_memory(zhp->zpool_hdl)); } diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c index 6999d9afc5..27bb4476d7 100644 --- a/lib/libzfs/libzfs_status.c +++ b/lib/libzfs/libzfs_status.c @@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport, { pool_scan_stat_t *ps = NULL; uint_t vsc, psc; - uint64_t nerr; uint64_t suspended; uint64_t hostid = 0; uint64_t errata = 0; @@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport, * Persistent data errors. */ if (!isimport) { + uint64_t nerr; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0 && nerr != 0) return (ZPOOL_STATUS_CORRUPT_DATA); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 03c2aa313a..c4b90991a4 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -944,13 +944,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, - "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; @@ -1013,7 +1013,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, 
dmu_tx_t *tx) vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, "starting deferred resilver", tx, "errors=%llu", - (u_longlong_t)spa_get_errlog_size(spa)); + (u_longlong_t)spa_approx_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index fe7051db27..67b3a03a95 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -5543,7 +5543,7 @@ spa_get_stats(const char *name, nvlist_t **config, fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)); + spa_approx_errlog_size(spa)); if (spa_suspended(spa)) { fnvlist_add_uint64(*config, diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 30e1249dd3..c6d97eed28 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -160,10 +160,8 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, dsl_dataset_t *ds; objset_t *os; - dsl_pool_config_enter(dp, FTAG); int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); if (error != 0) { - dsl_pool_config_exit(dp, FTAG); return (error); } ASSERT(head_dataset_id); @@ -172,7 +170,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, error = dmu_objset_from_ds(ds, &os); if (error != 0) { dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); return (error); } @@ -189,7 +186,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, ZFS_KEYSTATUS_UNAVAILABLE) { zep->zb_birth = 0; dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); return (0); } @@ -199,7 +195,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, error = dnode_hold(os, zep->zb_object, FTAG, &dn); if (error != 0) { dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); return (error); } @@ -225,7 +220,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); 
dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); return (error); } @@ -303,17 +297,31 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, } /* - * This function serves a double role. If only_count is true, it returns - * (in *count) how many times an error block belonging to this filesystem is - * referenced by snapshots or clones. If only_count is false, each time the - * error block is referenced by a snapshot or clone, it fills the userspace - * array at uaddr with the bookmarks of the error blocks. The array is filled - * from the back and *count is modified to be the number of unused entries at - * the beginning of the array. + * Copy the bookmark to the end of the user-space buffer which starts at + * uaddr and has *count unused entries, and decrement *count by 1. + */ +static int +copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) +{ + if (*count == 0) + return (SET_ERROR(ENOMEM)); + + *count -= 1; + if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); + return (0); +} + +/* + * Each time the error block is referenced by a snapshot or clone, add a + * zbookmark_phys_t entry to the userspace array at uaddr. The array is + * filled from the back and the in-out parameter *count is modified to be the + * number of unused entries at the beginning of the array. */ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - uint64_t *count, void *uaddr, boolean_t only_count) + void *uaddr, uint64_t *count) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; @@ -343,18 +351,12 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } if (zep->zb_birth == latest_txg) { /* Block neither free nor rewritten. 
*/ - if (!only_count) { - zbookmark_phys_t zb; - zep_to_zb(head_ds, zep, &zb); - if (copyout(&zb, (char *)uaddr + (*count - 1) - * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - dsl_dataset_rele(ds, FTAG); - return (SET_ERROR(EFAULT)); - } - (*count)--; - } else { - (*count)++; + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); } check_snapshot = B_FALSE; } else { @@ -407,19 +409,12 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, snap_obj_array[aff_snap_count] = snap_obj; aff_snap_count++; - if (!only_count) { - zbookmark_phys_t zb; - zep_to_zb(snap_obj, zep, &zb); - if (copyout(&zb, (char *)uaddr + (*count - 1) * - sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - dsl_dataset_rele(ds, FTAG); - error = SET_ERROR(EFAULT); - goto out; - } - (*count)--; - } else { - (*count)++; + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + goto out; } /* @@ -433,8 +428,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { error = check_filesystem(spa, - za.za_first_integer, zep, - count, uaddr, only_count); + za.za_first_integer, zep, uaddr, count); if (error != 0) { zap_cursor_fini(&zc); @@ -477,11 +471,8 @@ find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, static int process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - uint64_t *count, void *uaddr, boolean_t only_count) + void *uaddr, uint64_t *count) { - dsl_pool_t *dp = spa->spa_dsl_pool; - uint64_t top_affected_fs; - /* * If the zb_birth is 0 it means we failed to retrieve the birth txg * of the block pointer. 
This happens when an encrypted filesystem is @@ -489,95 +480,24 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, * check_filesystem(), instead do the accounting here. */ if (zep->zb_birth == 0) { - if (!only_count) { - zbookmark_phys_t zb; - zep_to_zb(head_ds, zep, &zb); - if (copyout(&zb, (char *)uaddr + (*count - 1) - * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - return (SET_ERROR(EFAULT)); - } - (*count)--; - } else { - (*count)++; + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + int error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + return (error); } return (0); } - dsl_pool_config_enter(dp, FTAG); + uint64_t top_affected_fs; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); - if (error == 0) - error = check_filesystem(spa, top_affected_fs, zep, count, - uaddr, only_count); + if (error == 0) { + error = check_filesystem(spa, top_affected_fs, zep, + uaddr, count); + } - dsl_pool_config_exit(dp, FTAG); return (error); } - -static uint64_t -get_errlog_size(spa_t *spa, uint64_t spa_err_obj) -{ - if (spa_err_obj == 0) - return (0); - uint64_t total = 0; - - zap_cursor_t zc; - zap_attribute_t za; - for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); - zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { - - zap_cursor_t head_ds_cursor; - zap_attribute_t head_ds_attr; - zbookmark_err_phys_t head_ds_block; - - uint64_t head_ds; - name_to_object(za.za_name, &head_ds); - - for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, - za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor, - &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { - - name_to_errphys(head_ds_attr.za_name, &head_ds_block); - (void) process_error_block(spa, head_ds, &head_ds_block, - &total, NULL, B_TRUE); - } - zap_cursor_fini(&head_ds_cursor); - } - zap_cursor_fini(&zc); - return (total); -} - -static uint64_t -get_errlist_size(spa_t *spa, avl_tree_t *tree) -{ - if 
(avl_numnodes(tree) == 0) - return (0); - uint64_t total = 0; - - spa_error_entry_t *se; - for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) { - zbookmark_err_phys_t zep; - zep.zb_object = se->se_bookmark.zb_object; - zep.zb_level = se->se_bookmark.zb_level; - zep.zb_blkid = se->se_bookmark.zb_blkid; - zep.zb_birth = 0; - - /* - * If we cannot find out the head dataset and birth txg of - * the present error block, we opt not to error out. In the - * next pool sync this information will be retrieved by - * sync_error_list() and written to the on-disk error log. - */ - uint64_t head_ds_obj; - int error = get_head_and_birth_txg(spa, &zep, - se->se_bookmark.zb_objset, &head_ds_obj); - - if (!error) - (void) process_error_block(spa, head_ds_obj, &zep, - &total, NULL, B_TRUE); - } - return (total); -} #endif /* @@ -677,13 +597,33 @@ spa_remove_error(spa_t *spa, zbookmark_phys_t *zb) spa_add_healed_error(spa, spa->spa_errlog_scrub, zb); } +static uint64_t +approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj) +{ + if (spa_err_obj == 0) + return (0); + uint64_t total = 0; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t count; + if (zap_count(spa->spa_meta_objset, za.za_first_integer, + &count) == 0) + total += count; + } + zap_cursor_fini(&zc); + return (total); +} + /* - * Return the number of errors currently in the error log. This is actually the - * sum of both the last log and the current log, since we don't know the union - * of these logs until we reach userland. + * Return the approximate number of errors currently in the error log. This + * will be nonzero if there are some errors, but otherwise it may be more + * or less than the number of entries returned by spa_get_errlog(). 
*/ uint64_t -spa_get_errlog_size(spa_t *spa) +spa_approx_errlog_size(spa_t *spa) { uint64_t total = 0; @@ -701,23 +641,16 @@ spa_get_errlog_size(spa_t *spa) total += count; mutex_exit(&spa->spa_errlog_lock); - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); } else { -#ifdef _KERNEL mutex_enter(&spa->spa_errlog_lock); - total += get_errlog_size(spa, spa->spa_errlog_last); - total += get_errlog_size(spa, spa->spa_errlog_scrub); + total += approx_errlog_size_impl(spa, spa->spa_errlog_last); + total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub); mutex_exit(&spa->spa_errlog_lock); - - mutex_enter(&spa->spa_errlist_lock); - total += get_errlist_size(spa, &spa->spa_errlist_last); - total += get_errlist_size(spa, &spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); -#endif } + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); return (total); } @@ -860,8 +793,7 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) #ifdef _KERNEL /* - * If an error block is shared by two datasets it will be counted twice. For - * detailed message see spa_get_errlog_size() above. + * If an error block is shared by two datasets it will be counted twice. 
*/ static int process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) @@ -884,14 +816,11 @@ process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) zbookmark_phys_t zb; name_to_bookmark(za.za_name, &zb); - if (copyout(&zb, (char *)uaddr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { + int error = copyout_entry(&zb, uaddr, count); + if (error != 0) { zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); + return (error); } - *count -= 1; - } zap_cursor_fini(&zc); return (0); @@ -914,7 +843,7 @@ process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) zbookmark_err_phys_t head_ds_block; name_to_errphys(head_ds_attr.za_name, &head_ds_block); int error = process_error_block(spa, head_ds, - &head_ds_block, count, uaddr, B_FALSE); + &head_ds_block, uaddr, count); if (error != 0) { zap_cursor_fini(&head_ds_cursor); @@ -936,16 +865,11 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { - - if (*count == 0) - return (SET_ERROR(ENOMEM)); - - if (copyout(&se->se_bookmark, (char *)uaddr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); - - *count -= 1; + int error = + copyout_entry(&se->se_bookmark, uaddr, count); + if (error != 0) { + return (error); + } } return (0); } @@ -963,7 +887,7 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) if (!error) error = process_error_block(spa, head_ds_obj, &zep, - count, uaddr, B_FALSE); + uaddr, count); if (error) return (error); } @@ -988,6 +912,12 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) int ret = 0; #ifdef _KERNEL + /* + * The pool config lock is needed to hold a dataset_t via (among other + * places) process_error_list() -> get_head_and_birth_txg(), and lock + * ordering requires that 
we get it before the spa_errlog_lock. + */ + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); @@ -1006,6 +936,7 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) mutex_exit(&spa->spa_errlist_lock); mutex_exit(&spa->spa_errlog_lock); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); #else (void) spa, (void) uaddr, (void) count; #endif @@ -1174,6 +1105,13 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) spa->spa_scrub_finished = B_FALSE; mutex_exit(&spa->spa_errlist_lock); + + /* + * The pool config lock is needed to hold a dataset_t via + * sync_error_list() -> get_head_and_birth_txg(), and lock ordering + * requires that we get it before the spa_errlog_lock. + */ + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1218,6 +1156,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) dmu_tx_commit(tx); mutex_exit(&spa->spa_errlog_lock); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); } static void @@ -1354,7 +1293,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); -EXPORT_SYMBOL(spa_get_errlog_size); +EXPORT_SYMBOL(spa_approx_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index a55d9f88ab..f913c3509b 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5695,17 +5695,12 @@ zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; - uint64_t count = zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, - &count); - if (error == 0) - zc->zc_nvlist_dst_size = count; - else - zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + 
&zc->zc_nvlist_dst_size); spa_close(spa, FTAG);