Fix 'zpool remap' freeing race
The dmu_objset_remap_indirects_impl() logic depends on dnode_hold() returning ENOENT for dnodes which will be freed and should therefore be skipped.

This behavior can only be relied upon when taking a new hold while the caller has an open transaction. That ensures the open txg cannot advance and that a concurrent free will end up in the same txg (which is critical). Relying on an existing hold will not prevent dnode_free() from succeeding.

The solution is to take an additional dnode_hold() after assigning the transaction. This ensures the remap will never dirty the dnode if it was freed while we were waiting in dmu_tx_assign(tx, TXG_WAIT).

Randomly set zfs_object_remap_one_indirect_delay_ms in ztest. This increases the likelihood of an operation racing with the remap. The delay tunable was also converted from ticks to milliseconds.

Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Tom Caputi <tcaputi@datto.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8215
parent 06f3fc2a4b
commit 65ca2c1eb9
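In outline, the fix is the hold-after-assign pattern shown below. The pre-existing hold on the dnode keeps its in-memory structure valid, but only a hold taken after dmu_tx_assign(tx, TXG_WAIT) returns can confirm the object was not freed while waiting for the open txg. The following is a minimal sketch paraphrasing the dmu_object_remap_one_indirect() change in the diff below; the function name remap_dirty_l1_sketch is hypothetical, and error handling and the surrounding locking of the real code are omitted.

static int
remap_dirty_l1_sketch(objset_t *os, dnode_t *dn, dmu_buf_impl_t *dbuf)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	dnode_t *dn_tx;
	int err;

	dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
	err = dmu_tx_assign(tx, TXG_WAIT);	/* may wait for a new open txg */
	if (err == 0) {
		/*
		 * Take a fresh hold now that the tx is assigned.  If the
		 * dnode was freed while we waited, dnode_hold() returns
		 * ENOENT and the dbuf is never dirtied; the pre-existing
		 * hold on 'dn' alone would not have prevented the free.
		 */
		err = dnode_hold(os, dn->dn_object, FTAG, &dn_tx);
		if (err == 0) {
			(void) dbuf_dirty(dbuf, tx);
			dnode_rele(dn_tx, FTAG);
		}
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

	return (err);
}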
cmd/ztest/ztest.c

@@ -215,6 +215,8 @@ extern int dmu_object_alloc_chunk_shift;
 extern boolean_t zfs_force_some_double_word_sm_entries;
 extern unsigned long zio_decompress_fail_fraction;
 extern unsigned long zfs_reconstruct_indirect_damage_fraction;
+extern int zfs_object_remap_one_indirect_delay_ms;
+
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -6526,6 +6528,12 @@ ztest_resume_thread(void *arg)
 		 */
 		if (ztest_random(10) == 0)
 			zfs_abd_scatter_enabled = ztest_random(2);
+
+		/*
+		 * Periodically inject remapping delays (10% of the time).
+		 */
+		zfs_object_remap_one_indirect_delay_ms =
+		    ztest_random(10) == 0 ? ztest_random(1000) + 1 : 0;
 	}
 
 	thread_exit();
module/zfs/dmu.c

@@ -76,9 +76,9 @@ int zfs_dmu_offset_next_sync = 0;
 /*
  * This can be used for testing, to ensure that certain actions happen
  * while in the middle of a remap (which might otherwise complete too
- * quickly).
+ * quickly).  Used by ztest(8).
  */
-int zfs_object_remap_one_indirect_delay_ticks = 0;
+int zfs_object_remap_one_indirect_delay_ms = 0;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
@@ -1075,6 +1075,7 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
     uint64_t last_removal_txg, uint64_t offset)
 {
 	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+	dnode_t *dn_tx;
 	int err = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -1093,7 +1094,9 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
 
 	/*
 	 * If this L1 was already written after the last removal, then we've
-	 * already tried to remap it.
+	 * already tried to remap it.  An additional hold is taken after the
+	 * dmu_tx_assign() to handle the case where the dnode is freed while
+	 * waiting for the next open txg.
 	 */
 	if (birth <= last_removal_txg &&
 	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
@@ -1102,7 +1105,11 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
 		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err == 0) {
-			(void) dbuf_dirty(dbuf, tx);
+			err = dnode_hold(os, dn->dn_object, FTAG, &dn_tx);
+			if (err == 0) {
+				(void) dbuf_dirty(dbuf, tx);
+				dnode_rele(dn_tx, FTAG);
+			}
 			dmu_tx_commit(tx);
 		} else {
 			dmu_tx_abort(tx);
@@ -1111,7 +1118,7 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
 
 	dbuf_rele(dbuf, FTAG);
 
-	delay(zfs_object_remap_one_indirect_delay_ticks);
+	delay(MSEC_TO_TICK(zfs_object_remap_one_indirect_delay_ms));
 
 	return (err);
 }
@@ -1133,7 +1140,7 @@ dmu_object_remap_indirects(objset_t *os, uint64_t object,
 {
 	uint64_t offset, l1span;
 	int err;
-	dnode_t *dn;
+	dnode_t *dn, *dn_tx;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0) {
@@ -1148,13 +1155,20 @@ dmu_object_remap_indirects(objset_t *os, uint64_t object,
 		/*
 		 * If the dnode has no indirect blocks, we cannot dirty them.
 		 * We still want to remap the blkptr(s) in the dnode if
-		 * appropriate, so mark it as dirty.
+		 * appropriate, so mark it as dirty.  An additional hold is
+		 * taken after the dmu_tx_assign() to handle the case where
+		 * the dnode is freed while waiting for the next open txg.
 		 */
 		if (err == 0 && dnode_needs_remap(dn)) {
 			dmu_tx_t *tx = dmu_tx_create(os);
-			dmu_tx_hold_bonus(tx, dn->dn_object);
-			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
-				dnode_setdirty(dn, tx);
+			dmu_tx_hold_bonus(tx, object);
+			err = dmu_tx_assign(tx, TXG_WAIT);
+			if (err == 0) {
+				err = dnode_hold(os, object, FTAG, &dn_tx);
+				if (err == 0) {
+					dnode_setdirty(dn_tx, tx);
+					dnode_rele(dn_tx, FTAG);
+				}
 				dmu_tx_commit(tx);
 			} else {
 				dmu_tx_abort(tx);