deadlock between spa_errlog_lock and dp_config_rwlock
There is a lock order inversion deadlock between `spa_errlog_lock` and `dp_config_rwlock`: A thread in `spa_delete_dataset_errlog()` is running from a sync task. It is holding the `dp_config_rwlock` for writer (see `dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`. A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock` (see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as reader). Note that this was introduced by #12812. This commit address this by defining the lock ordering to be dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock. spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this order, and then process_error_block() and get_head_and_birth_txg() can verify that the dp_config_rwlock is already held. Additionally, a buffer overrun in `spa_get_errlog()` is corrected. Many code paths didn't check if `*count` got to zero, instead continuing to overwrite past the beginning of the userspace buffer at `uaddr`. Tested by having some errors in the pool (via `zinject -t data /path/to/file`), one thread running `zpool iostat 0.001`, and another thread runs `zfs destroy` (in a loop, although it hits the first time). This reproduces the problem easily without the fix, and works with the fix. Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Reviewed-by: George Amanakis <gamanakis@gmail.com> Reviewed-by: George Wilson <gwilson@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #14239 Closes #14289
This commit is contained in:
parent
29e1b089c1
commit
018f26041d
|
@ -8599,38 +8599,18 @@ status_callback(zpool_handle_t *zhp, void *data)
|
||||||
|
|
||||||
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
|
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
|
||||||
&nerr) == 0) {
|
&nerr) == 0) {
|
||||||
nvlist_t *nverrlist = NULL;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If the approximate error count is small, get a
|
|
||||||
* precise count by fetching the entire log and
|
|
||||||
* uniquifying the results.
|
|
||||||
*/
|
|
||||||
if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
|
|
||||||
zpool_get_errlog(zhp, &nverrlist) == 0) {
|
|
||||||
nvpair_t *elem;
|
|
||||||
|
|
||||||
elem = NULL;
|
|
||||||
nerr = 0;
|
|
||||||
while ((elem = nvlist_next_nvpair(nverrlist,
|
|
||||||
elem)) != NULL) {
|
|
||||||
nerr++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
nvlist_free(nverrlist);
|
|
||||||
|
|
||||||
(void) printf("\n");
|
(void) printf("\n");
|
||||||
|
if (nerr == 0) {
|
||||||
if (nerr == 0)
|
(void) printf(gettext(
|
||||||
(void) printf(gettext("errors: No known data "
|
"errors: No known data errors\n"));
|
||||||
"errors\n"));
|
} else if (!cbp->cb_verbose) {
|
||||||
else if (!cbp->cb_verbose)
|
|
||||||
(void) printf(gettext("errors: %llu data "
|
(void) printf(gettext("errors: %llu data "
|
||||||
"errors, use '-v' for a list\n"),
|
"errors, use '-v' for a list\n"),
|
||||||
(u_longlong_t)nerr);
|
(u_longlong_t)nerr);
|
||||||
else
|
} else {
|
||||||
print_error_log(zhp);
|
print_error_log(zhp);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (cbp->cb_dedup_stats)
|
if (cbp->cb_dedup_stats)
|
||||||
print_dedup_stats(config);
|
print_dedup_stats(config);
|
||||||
|
|
|
@ -6313,7 +6313,7 @@ ztest_scrub_impl(spa_t *spa)
|
||||||
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
|
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
|
||||||
txg_wait_synced(spa_get_dsl(spa), 0);
|
txg_wait_synced(spa_get_dsl(spa), 0);
|
||||||
|
|
||||||
if (spa_get_errlog_size(spa) > 0)
|
if (spa_approx_errlog_size(spa) > 0)
|
||||||
return (ECKSUM);
|
return (ECKSUM);
|
||||||
|
|
||||||
ztest_pool_scrubbed = B_TRUE;
|
ztest_pool_scrubbed = B_TRUE;
|
||||||
|
|
|
@ -1146,7 +1146,7 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
|
||||||
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
|
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
|
||||||
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
|
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
|
||||||
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
|
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
|
||||||
extern uint64_t spa_get_errlog_size(spa_t *spa);
|
extern uint64_t spa_approx_errlog_size(spa_t *spa);
|
||||||
extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
|
extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
|
||||||
extern void spa_errlog_rotate(spa_t *spa);
|
extern void spa_errlog_rotate(spa_t *spa);
|
||||||
extern void spa_errlog_drain(spa_t *spa);
|
extern void spa_errlog_drain(spa_t *spa);
|
||||||
|
|
|
@ -4133,33 +4133,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
||||||
{
|
{
|
||||||
zfs_cmd_t zc = {"\0"};
|
zfs_cmd_t zc = {"\0"};
|
||||||
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
libzfs_handle_t *hdl = zhp->zpool_hdl;
|
||||||
uint64_t count;
|
zbookmark_phys_t *buf;
|
||||||
zbookmark_phys_t *zb = NULL;
|
uint64_t buflen = 10000; /* approx. 1MB of RAM */
|
||||||
int i;
|
|
||||||
|
if (fnvlist_lookup_uint64(zhp->zpool_config,
|
||||||
|
ZPOOL_CONFIG_ERRCOUNT) == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Retrieve the raw error list from the kernel. If the number of errors
|
* Retrieve the raw error list from the kernel. If it doesn't fit,
|
||||||
* has increased, allocate more space and continue until we get the
|
* allocate a larger buffer and retry.
|
||||||
* entire list.
|
|
||||||
*/
|
*/
|
||||||
count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT);
|
|
||||||
if (count == 0)
|
|
||||||
return (0);
|
|
||||||
zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
|
|
||||||
count * sizeof (zbookmark_phys_t));
|
|
||||||
zc.zc_nvlist_dst_size = count;
|
|
||||||
(void) strcpy(zc.zc_name, zhp->zpool_name);
|
(void) strcpy(zc.zc_name, zhp->zpool_name);
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
buf = zfs_alloc(zhp->zpool_hdl,
|
||||||
|
buflen * sizeof (zbookmark_phys_t));
|
||||||
|
zc.zc_nvlist_dst = (uintptr_t)buf;
|
||||||
|
zc.zc_nvlist_dst_size = buflen;
|
||||||
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
|
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
|
||||||
&zc) != 0) {
|
&zc) != 0) {
|
||||||
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
free(buf);
|
||||||
if (errno == ENOMEM) {
|
if (errno == ENOMEM) {
|
||||||
void *dst;
|
buflen *= 2;
|
||||||
|
|
||||||
count = zc.zc_nvlist_dst_size;
|
|
||||||
dst = zfs_alloc(zhp->zpool_hdl, count *
|
|
||||||
sizeof (zbookmark_phys_t));
|
|
||||||
zc.zc_nvlist_dst = (uintptr_t)dst;
|
|
||||||
} else {
|
} else {
|
||||||
return (zpool_standard_error_fmt(hdl, errno,
|
return (zpool_standard_error_fmt(hdl, errno,
|
||||||
dgettext(TEXT_DOMAIN, "errors: List of "
|
dgettext(TEXT_DOMAIN, "errors: List of "
|
||||||
|
@ -4177,18 +4172,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
||||||
* _not_ copied as part of the process. So we point the start of our
|
* _not_ copied as part of the process. So we point the start of our
|
||||||
* array appropriate and decrement the total number of elements.
|
* array appropriate and decrement the total number of elements.
|
||||||
*/
|
*/
|
||||||
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
|
zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size;
|
||||||
zc.zc_nvlist_dst_size;
|
uint64_t zblen = buflen - zc.zc_nvlist_dst_size;
|
||||||
count -= zc.zc_nvlist_dst_size;
|
|
||||||
|
|
||||||
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
|
qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
|
||||||
|
|
||||||
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
|
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
|
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < count; i++) {
|
for (uint64_t i = 0; i < zblen; i++) {
|
||||||
nvlist_t *nv;
|
nvlist_t *nv;
|
||||||
|
|
||||||
/* ignoring zb_blkid and zb_level for now */
|
/* ignoring zb_blkid and zb_level for now */
|
||||||
|
@ -4215,11 +4209,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
|
||||||
nvlist_free(nv);
|
nvlist_free(nv);
|
||||||
}
|
}
|
||||||
|
|
||||||
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
free(buf);
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
nomem:
|
nomem:
|
||||||
free((void *)(uintptr_t)zc.zc_nvlist_dst);
|
free(buf);
|
||||||
return (no_memory(zhp->zpool_hdl));
|
return (no_memory(zhp->zpool_hdl));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport,
|
||||||
{
|
{
|
||||||
pool_scan_stat_t *ps = NULL;
|
pool_scan_stat_t *ps = NULL;
|
||||||
uint_t vsc, psc;
|
uint_t vsc, psc;
|
||||||
uint64_t nerr;
|
|
||||||
uint64_t suspended;
|
uint64_t suspended;
|
||||||
uint64_t hostid = 0;
|
uint64_t hostid = 0;
|
||||||
uint64_t errata = 0;
|
uint64_t errata = 0;
|
||||||
|
@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport,
|
||||||
* Persistent data errors.
|
* Persistent data errors.
|
||||||
*/
|
*/
|
||||||
if (!isimport) {
|
if (!isimport) {
|
||||||
|
uint64_t nerr;
|
||||||
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
|
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
|
||||||
&nerr) == 0 && nerr != 0)
|
&nerr) == 0 && nerr != 0)
|
||||||
return (ZPOOL_STATUS_CORRUPT_DATA);
|
return (ZPOOL_STATUS_CORRUPT_DATA);
|
||||||
|
|
|
@ -944,13 +944,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
|
||||||
|
|
||||||
if (dsl_scan_restarting(scn, tx))
|
if (dsl_scan_restarting(scn, tx))
|
||||||
spa_history_log_internal(spa, "scan aborted, restarting", tx,
|
spa_history_log_internal(spa, "scan aborted, restarting", tx,
|
||||||
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
|
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
|
||||||
else if (!complete)
|
else if (!complete)
|
||||||
spa_history_log_internal(spa, "scan cancelled", tx,
|
spa_history_log_internal(spa, "scan cancelled", tx,
|
||||||
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
|
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
|
||||||
else
|
else
|
||||||
spa_history_log_internal(spa, "scan done", tx,
|
spa_history_log_internal(spa, "scan done", tx,
|
||||||
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
|
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
|
||||||
|
|
||||||
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
|
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
|
||||||
spa->spa_scrub_active = B_FALSE;
|
spa->spa_scrub_active = B_FALSE;
|
||||||
|
@ -1013,7 +1013,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
|
||||||
vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
|
vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
|
||||||
spa_history_log_internal(spa,
|
spa_history_log_internal(spa,
|
||||||
"starting deferred resilver", tx, "errors=%llu",
|
"starting deferred resilver", tx, "errors=%llu",
|
||||||
(u_longlong_t)spa_get_errlog_size(spa));
|
(u_longlong_t)spa_approx_errlog_size(spa));
|
||||||
spa_async_request(spa, SPA_ASYNC_RESILVER);
|
spa_async_request(spa, SPA_ASYNC_RESILVER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5543,7 +5543,7 @@ spa_get_stats(const char *name, nvlist_t **config,
|
||||||
|
|
||||||
fnvlist_add_uint64(*config,
|
fnvlist_add_uint64(*config,
|
||||||
ZPOOL_CONFIG_ERRCOUNT,
|
ZPOOL_CONFIG_ERRCOUNT,
|
||||||
spa_get_errlog_size(spa));
|
spa_approx_errlog_size(spa));
|
||||||
|
|
||||||
if (spa_suspended(spa)) {
|
if (spa_suspended(spa)) {
|
||||||
fnvlist_add_uint64(*config,
|
fnvlist_add_uint64(*config,
|
||||||
|
|
|
@ -160,10 +160,8 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
|
||||||
dsl_dataset_t *ds;
|
dsl_dataset_t *ds;
|
||||||
objset_t *os;
|
objset_t *os;
|
||||||
|
|
||||||
dsl_pool_config_enter(dp, FTAG);
|
|
||||||
int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
|
int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
dsl_pool_config_exit(dp, FTAG);
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
ASSERT(head_dataset_id);
|
ASSERT(head_dataset_id);
|
||||||
|
@ -172,7 +170,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
|
||||||
error = dmu_objset_from_ds(ds, &os);
|
error = dmu_objset_from_ds(ds, &os);
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
dsl_dataset_rele(ds, FTAG);
|
dsl_dataset_rele(ds, FTAG);
|
||||||
dsl_pool_config_exit(dp, FTAG);
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -189,7 +186,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
|
||||||
ZFS_KEYSTATUS_UNAVAILABLE) {
|
ZFS_KEYSTATUS_UNAVAILABLE) {
|
||||||
zep->zb_birth = 0;
|
zep->zb_birth = 0;
|
||||||
dsl_dataset_rele(ds, FTAG);
|
dsl_dataset_rele(ds, FTAG);
|
||||||
dsl_pool_config_exit(dp, FTAG);
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -199,7 +195,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
|
||||||
error = dnode_hold(os, zep->zb_object, FTAG, &dn);
|
error = dnode_hold(os, zep->zb_object, FTAG, &dn);
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
dsl_dataset_rele(ds, FTAG);
|
dsl_dataset_rele(ds, FTAG);
|
||||||
dsl_pool_config_exit(dp, FTAG);
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -225,7 +220,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
|
||||||
rw_exit(&dn->dn_struct_rwlock);
|
rw_exit(&dn->dn_struct_rwlock);
|
||||||
dnode_rele(dn, FTAG);
|
dnode_rele(dn, FTAG);
|
||||||
dsl_dataset_rele(ds, FTAG);
|
dsl_dataset_rele(ds, FTAG);
|
||||||
dsl_pool_config_exit(dp, FTAG);
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -303,17 +297,31 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function serves a double role. If only_count is true, it returns
|
* Copy the bookmark to the end of the user-space buffer which starts at
|
||||||
* (in *count) how many times an error block belonging to this filesystem is
|
* uaddr and has *count unused entries, and decrement *count by 1.
|
||||||
* referenced by snapshots or clones. If only_count is false, each time the
|
*/
|
||||||
* error block is referenced by a snapshot or clone, it fills the userspace
|
static int
|
||||||
* array at uaddr with the bookmarks of the error blocks. The array is filled
|
copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
|
||||||
* from the back and *count is modified to be the number of unused entries at
|
{
|
||||||
* the beginning of the array.
|
if (*count == 0)
|
||||||
|
return (SET_ERROR(ENOMEM));
|
||||||
|
|
||||||
|
*count -= 1;
|
||||||
|
if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t),
|
||||||
|
sizeof (zbookmark_phys_t)) != 0)
|
||||||
|
return (SET_ERROR(EFAULT));
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Each time the error block is referenced by a snapshot or clone, add a
|
||||||
|
* zbookmark_phys_t entry to the userspace array at uaddr. The array is
|
||||||
|
* filled from the back and the in-out parameter *count is modified to be the
|
||||||
|
* number of unused entries at the beginning of the array.
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
uint64_t *count, void *uaddr, boolean_t only_count)
|
void *uaddr, uint64_t *count)
|
||||||
{
|
{
|
||||||
dsl_dataset_t *ds;
|
dsl_dataset_t *ds;
|
||||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
dsl_pool_t *dp = spa->spa_dsl_pool;
|
||||||
|
@ -343,18 +351,12 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
}
|
}
|
||||||
if (zep->zb_birth == latest_txg) {
|
if (zep->zb_birth == latest_txg) {
|
||||||
/* Block neither free nor rewritten. */
|
/* Block neither free nor rewritten. */
|
||||||
if (!only_count) {
|
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
zep_to_zb(head_ds, zep, &zb);
|
zep_to_zb(head_ds, zep, &zb);
|
||||||
if (copyout(&zb, (char *)uaddr + (*count - 1)
|
error = copyout_entry(&zb, uaddr, count);
|
||||||
* sizeof (zbookmark_phys_t),
|
if (error != 0) {
|
||||||
sizeof (zbookmark_phys_t)) != 0) {
|
|
||||||
dsl_dataset_rele(ds, FTAG);
|
dsl_dataset_rele(ds, FTAG);
|
||||||
return (SET_ERROR(EFAULT));
|
return (error);
|
||||||
}
|
|
||||||
(*count)--;
|
|
||||||
} else {
|
|
||||||
(*count)++;
|
|
||||||
}
|
}
|
||||||
check_snapshot = B_FALSE;
|
check_snapshot = B_FALSE;
|
||||||
} else {
|
} else {
|
||||||
|
@ -407,20 +409,13 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
snap_obj_array[aff_snap_count] = snap_obj;
|
snap_obj_array[aff_snap_count] = snap_obj;
|
||||||
aff_snap_count++;
|
aff_snap_count++;
|
||||||
|
|
||||||
if (!only_count) {
|
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
zep_to_zb(snap_obj, zep, &zb);
|
zep_to_zb(snap_obj, zep, &zb);
|
||||||
if (copyout(&zb, (char *)uaddr + (*count - 1) *
|
error = copyout_entry(&zb, uaddr, count);
|
||||||
sizeof (zbookmark_phys_t),
|
if (error != 0) {
|
||||||
sizeof (zbookmark_phys_t)) != 0) {
|
|
||||||
dsl_dataset_rele(ds, FTAG);
|
dsl_dataset_rele(ds, FTAG);
|
||||||
error = SET_ERROR(EFAULT);
|
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
(*count)--;
|
|
||||||
} else {
|
|
||||||
(*count)++;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Only clones whose origins were affected could also
|
* Only clones whose origins were affected could also
|
||||||
|
@ -433,8 +428,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
zap_cursor_retrieve(&zc, &za) == 0;
|
zap_cursor_retrieve(&zc, &za) == 0;
|
||||||
zap_cursor_advance(&zc)) {
|
zap_cursor_advance(&zc)) {
|
||||||
error = check_filesystem(spa,
|
error = check_filesystem(spa,
|
||||||
za.za_first_integer, zep,
|
za.za_first_integer, zep, uaddr, count);
|
||||||
count, uaddr, only_count);
|
|
||||||
|
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
zap_cursor_fini(&zc);
|
zap_cursor_fini(&zc);
|
||||||
|
@ -477,11 +471,8 @@ find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
|
|
||||||
static int
|
static int
|
||||||
process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
uint64_t *count, void *uaddr, boolean_t only_count)
|
void *uaddr, uint64_t *count)
|
||||||
{
|
{
|
||||||
dsl_pool_t *dp = spa->spa_dsl_pool;
|
|
||||||
uint64_t top_affected_fs;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the zb_birth is 0 it means we failed to retrieve the birth txg
|
* If the zb_birth is 0 it means we failed to retrieve the birth txg
|
||||||
* of the block pointer. This happens when an encrypted filesystem is
|
* of the block pointer. This happens when an encrypted filesystem is
|
||||||
|
@ -489,94 +480,23 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
|
||||||
* check_filesystem(), instead do the accounting here.
|
* check_filesystem(), instead do the accounting here.
|
||||||
*/
|
*/
|
||||||
if (zep->zb_birth == 0) {
|
if (zep->zb_birth == 0) {
|
||||||
if (!only_count) {
|
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
zep_to_zb(head_ds, zep, &zb);
|
zep_to_zb(head_ds, zep, &zb);
|
||||||
if (copyout(&zb, (char *)uaddr + (*count - 1)
|
int error = copyout_entry(&zb, uaddr, count);
|
||||||
* sizeof (zbookmark_phys_t),
|
if (error != 0) {
|
||||||
sizeof (zbookmark_phys_t)) != 0) {
|
|
||||||
return (SET_ERROR(EFAULT));
|
|
||||||
}
|
|
||||||
(*count)--;
|
|
||||||
} else {
|
|
||||||
(*count)++;
|
|
||||||
}
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
dsl_pool_config_enter(dp, FTAG);
|
|
||||||
int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
|
|
||||||
if (error == 0)
|
|
||||||
error = check_filesystem(spa, top_affected_fs, zep, count,
|
|
||||||
uaddr, only_count);
|
|
||||||
|
|
||||||
dsl_pool_config_exit(dp, FTAG);
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t
|
|
||||||
get_errlog_size(spa_t *spa, uint64_t spa_err_obj)
|
|
||||||
{
|
|
||||||
if (spa_err_obj == 0)
|
|
||||||
return (0);
|
return (0);
|
||||||
uint64_t total = 0;
|
|
||||||
|
|
||||||
zap_cursor_t zc;
|
|
||||||
zap_attribute_t za;
|
|
||||||
for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
|
|
||||||
zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
|
|
||||||
|
|
||||||
zap_cursor_t head_ds_cursor;
|
|
||||||
zap_attribute_t head_ds_attr;
|
|
||||||
zbookmark_err_phys_t head_ds_block;
|
|
||||||
|
|
||||||
uint64_t head_ds;
|
|
||||||
name_to_object(za.za_name, &head_ds);
|
|
||||||
|
|
||||||
for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
|
|
||||||
za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor,
|
|
||||||
&head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
|
|
||||||
|
|
||||||
name_to_errphys(head_ds_attr.za_name, &head_ds_block);
|
|
||||||
(void) process_error_block(spa, head_ds, &head_ds_block,
|
|
||||||
&total, NULL, B_TRUE);
|
|
||||||
}
|
|
||||||
zap_cursor_fini(&head_ds_cursor);
|
|
||||||
}
|
|
||||||
zap_cursor_fini(&zc);
|
|
||||||
return (total);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t
|
uint64_t top_affected_fs;
|
||||||
get_errlist_size(spa_t *spa, avl_tree_t *tree)
|
int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
|
||||||
{
|
if (error == 0) {
|
||||||
if (avl_numnodes(tree) == 0)
|
error = check_filesystem(spa, top_affected_fs, zep,
|
||||||
return (0);
|
uaddr, count);
|
||||||
uint64_t total = 0;
|
|
||||||
|
|
||||||
spa_error_entry_t *se;
|
|
||||||
for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) {
|
|
||||||
zbookmark_err_phys_t zep;
|
|
||||||
zep.zb_object = se->se_bookmark.zb_object;
|
|
||||||
zep.zb_level = se->se_bookmark.zb_level;
|
|
||||||
zep.zb_blkid = se->se_bookmark.zb_blkid;
|
|
||||||
zep.zb_birth = 0;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we cannot find out the head dataset and birth txg of
|
|
||||||
* the present error block, we opt not to error out. In the
|
|
||||||
* next pool sync this information will be retrieved by
|
|
||||||
* sync_error_list() and written to the on-disk error log.
|
|
||||||
*/
|
|
||||||
uint64_t head_ds_obj;
|
|
||||||
int error = get_head_and_birth_txg(spa, &zep,
|
|
||||||
se->se_bookmark.zb_objset, &head_ds_obj);
|
|
||||||
|
|
||||||
if (!error)
|
|
||||||
(void) process_error_block(spa, head_ds_obj, &zep,
|
|
||||||
&total, NULL, B_TRUE);
|
|
||||||
}
|
}
|
||||||
return (total);
|
|
||||||
|
return (error);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -677,13 +597,33 @@ spa_remove_error(spa_t *spa, zbookmark_phys_t *zb)
|
||||||
spa_add_healed_error(spa, spa->spa_errlog_scrub, zb);
|
spa_add_healed_error(spa, spa->spa_errlog_scrub, zb);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static uint64_t
|
||||||
|
approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj)
|
||||||
|
{
|
||||||
|
if (spa_err_obj == 0)
|
||||||
|
return (0);
|
||||||
|
uint64_t total = 0;
|
||||||
|
|
||||||
|
zap_cursor_t zc;
|
||||||
|
zap_attribute_t za;
|
||||||
|
for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
|
||||||
|
zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
|
||||||
|
uint64_t count;
|
||||||
|
if (zap_count(spa->spa_meta_objset, za.za_first_integer,
|
||||||
|
&count) == 0)
|
||||||
|
total += count;
|
||||||
|
}
|
||||||
|
zap_cursor_fini(&zc);
|
||||||
|
return (total);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return the number of errors currently in the error log. This is actually the
|
* Return the approximate number of errors currently in the error log. This
|
||||||
* sum of both the last log and the current log, since we don't know the union
|
* will be nonzero if there are some errors, but otherwise it may be more
|
||||||
* of these logs until we reach userland.
|
* or less than the number of entries returned by spa_get_errlog().
|
||||||
*/
|
*/
|
||||||
uint64_t
|
uint64_t
|
||||||
spa_get_errlog_size(spa_t *spa)
|
spa_approx_errlog_size(spa_t *spa)
|
||||||
{
|
{
|
||||||
uint64_t total = 0;
|
uint64_t total = 0;
|
||||||
|
|
||||||
|
@ -701,23 +641,16 @@ spa_get_errlog_size(spa_t *spa)
|
||||||
total += count;
|
total += count;
|
||||||
mutex_exit(&spa->spa_errlog_lock);
|
mutex_exit(&spa->spa_errlog_lock);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
mutex_enter(&spa->spa_errlog_lock);
|
||||||
|
total += approx_errlog_size_impl(spa, spa->spa_errlog_last);
|
||||||
|
total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub);
|
||||||
|
mutex_exit(&spa->spa_errlog_lock);
|
||||||
|
}
|
||||||
mutex_enter(&spa->spa_errlist_lock);
|
mutex_enter(&spa->spa_errlist_lock);
|
||||||
total += avl_numnodes(&spa->spa_errlist_last);
|
total += avl_numnodes(&spa->spa_errlist_last);
|
||||||
total += avl_numnodes(&spa->spa_errlist_scrub);
|
total += avl_numnodes(&spa->spa_errlist_scrub);
|
||||||
mutex_exit(&spa->spa_errlist_lock);
|
mutex_exit(&spa->spa_errlist_lock);
|
||||||
} else {
|
|
||||||
#ifdef _KERNEL
|
|
||||||
mutex_enter(&spa->spa_errlog_lock);
|
|
||||||
total += get_errlog_size(spa, spa->spa_errlog_last);
|
|
||||||
total += get_errlog_size(spa, spa->spa_errlog_scrub);
|
|
||||||
mutex_exit(&spa->spa_errlog_lock);
|
|
||||||
|
|
||||||
mutex_enter(&spa->spa_errlist_lock);
|
|
||||||
total += get_errlist_size(spa, &spa->spa_errlist_last);
|
|
||||||
total += get_errlist_size(spa, &spa->spa_errlist_scrub);
|
|
||||||
mutex_exit(&spa->spa_errlist_lock);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
return (total);
|
return (total);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -860,8 +793,7 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
|
||||||
|
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
/*
|
/*
|
||||||
* If an error block is shared by two datasets it will be counted twice. For
|
* If an error block is shared by two datasets it will be counted twice.
|
||||||
* detailed message see spa_get_errlog_size() above.
|
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
|
process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
|
||||||
|
@ -884,14 +816,11 @@ process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
name_to_bookmark(za.za_name, &zb);
|
name_to_bookmark(za.za_name, &zb);
|
||||||
|
|
||||||
if (copyout(&zb, (char *)uaddr +
|
int error = copyout_entry(&zb, uaddr, count);
|
||||||
(*count - 1) * sizeof (zbookmark_phys_t),
|
if (error != 0) {
|
||||||
sizeof (zbookmark_phys_t)) != 0) {
|
|
||||||
zap_cursor_fini(&zc);
|
zap_cursor_fini(&zc);
|
||||||
return (SET_ERROR(EFAULT));
|
return (error);
|
||||||
}
|
}
|
||||||
*count -= 1;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
zap_cursor_fini(&zc);
|
zap_cursor_fini(&zc);
|
||||||
return (0);
|
return (0);
|
||||||
|
@ -914,7 +843,7 @@ process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
|
||||||
zbookmark_err_phys_t head_ds_block;
|
zbookmark_err_phys_t head_ds_block;
|
||||||
name_to_errphys(head_ds_attr.za_name, &head_ds_block);
|
name_to_errphys(head_ds_attr.za_name, &head_ds_block);
|
||||||
int error = process_error_block(spa, head_ds,
|
int error = process_error_block(spa, head_ds,
|
||||||
&head_ds_block, count, uaddr, B_FALSE);
|
&head_ds_block, uaddr, count);
|
||||||
|
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
zap_cursor_fini(&head_ds_cursor);
|
zap_cursor_fini(&head_ds_cursor);
|
||||||
|
@ -936,16 +865,11 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
|
||||||
if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
|
if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
|
||||||
for (se = avl_first(list); se != NULL;
|
for (se = avl_first(list); se != NULL;
|
||||||
se = AVL_NEXT(list, se)) {
|
se = AVL_NEXT(list, se)) {
|
||||||
|
int error =
|
||||||
if (*count == 0)
|
copyout_entry(&se->se_bookmark, uaddr, count);
|
||||||
return (SET_ERROR(ENOMEM));
|
if (error != 0) {
|
||||||
|
return (error);
|
||||||
if (copyout(&se->se_bookmark, (char *)uaddr +
|
}
|
||||||
(*count - 1) * sizeof (zbookmark_phys_t),
|
|
||||||
sizeof (zbookmark_phys_t)) != 0)
|
|
||||||
return (SET_ERROR(EFAULT));
|
|
||||||
|
|
||||||
*count -= 1;
|
|
||||||
}
|
}
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
@ -963,7 +887,7 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
|
||||||
|
|
||||||
if (!error)
|
if (!error)
|
||||||
error = process_error_block(spa, head_ds_obj, &zep,
|
error = process_error_block(spa, head_ds_obj, &zep,
|
||||||
count, uaddr, B_FALSE);
|
uaddr, count);
|
||||||
if (error)
|
if (error)
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
@ -988,6 +912,12 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
|
/*
|
||||||
|
* The pool config lock is needed to hold a dataset_t via (among other
|
||||||
|
* places) process_error_list() -> get_head_and_birth_txg(), and lock
|
||||||
|
* ordering requires that we get it before the spa_errlog_lock.
|
||||||
|
*/
|
||||||
|
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
|
||||||
mutex_enter(&spa->spa_errlog_lock);
|
mutex_enter(&spa->spa_errlog_lock);
|
||||||
|
|
||||||
ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
|
ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
|
||||||
|
@ -1006,6 +936,7 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
|
||||||
mutex_exit(&spa->spa_errlist_lock);
|
mutex_exit(&spa->spa_errlist_lock);
|
||||||
|
|
||||||
mutex_exit(&spa->spa_errlog_lock);
|
mutex_exit(&spa->spa_errlog_lock);
|
||||||
|
dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
|
||||||
#else
|
#else
|
||||||
(void) spa, (void) uaddr, (void) count;
|
(void) spa, (void) uaddr, (void) count;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1174,6 +1105,13 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
|
||||||
spa->spa_scrub_finished = B_FALSE;
|
spa->spa_scrub_finished = B_FALSE;
|
||||||
|
|
||||||
mutex_exit(&spa->spa_errlist_lock);
|
mutex_exit(&spa->spa_errlist_lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The pool config lock is needed to hold a dataset_t via
|
||||||
|
* sync_error_list() -> get_head_and_birth_txg(), and lock ordering
|
||||||
|
* requires that we get it before the spa_errlog_lock.
|
||||||
|
*/
|
||||||
|
dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
|
||||||
mutex_enter(&spa->spa_errlog_lock);
|
mutex_enter(&spa->spa_errlog_lock);
|
||||||
|
|
||||||
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
|
||||||
|
@ -1218,6 +1156,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
|
||||||
dmu_tx_commit(tx);
|
dmu_tx_commit(tx);
|
||||||
|
|
||||||
mutex_exit(&spa->spa_errlog_lock);
|
mutex_exit(&spa->spa_errlog_lock);
|
||||||
|
dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -1354,7 +1293,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
|
||||||
#if defined(_KERNEL)
|
#if defined(_KERNEL)
|
||||||
/* error handling */
|
/* error handling */
|
||||||
EXPORT_SYMBOL(spa_log_error);
|
EXPORT_SYMBOL(spa_log_error);
|
||||||
EXPORT_SYMBOL(spa_get_errlog_size);
|
EXPORT_SYMBOL(spa_approx_errlog_size);
|
||||||
EXPORT_SYMBOL(spa_get_errlog);
|
EXPORT_SYMBOL(spa_get_errlog);
|
||||||
EXPORT_SYMBOL(spa_errlog_rotate);
|
EXPORT_SYMBOL(spa_errlog_rotate);
|
||||||
EXPORT_SYMBOL(spa_errlog_drain);
|
EXPORT_SYMBOL(spa_errlog_drain);
|
||||||
|
|
|
@ -5695,17 +5695,12 @@ zfs_ioc_error_log(zfs_cmd_t *zc)
|
||||||
{
|
{
|
||||||
spa_t *spa;
|
spa_t *spa;
|
||||||
int error;
|
int error;
|
||||||
uint64_t count = zc->zc_nvlist_dst_size;
|
|
||||||
|
|
||||||
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
|
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
|
||||||
return (error);
|
return (error);
|
||||||
|
|
||||||
error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
|
error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
|
||||||
&count);
|
&zc->zc_nvlist_dst_size);
|
||||||
if (error == 0)
|
|
||||||
zc->zc_nvlist_dst_size = count;
|
|
||||||
else
|
|
||||||
zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
|
|
||||||
|
|
||||||
spa_close(spa, FTAG);
|
spa_close(spa, FTAG);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue