Illumos 3693 - restore_object uses at least two transactions to restore an object
Restore_object should not use two transactions to restore an object: * one transaction is used for dmu_object_claim * another transaction is used to set compression, checksum and most importantly bonus data * furthermore dmu_object_reclaim internally uses multiple transactions * dmu_free_long_range frees chunks in separate transactions * dnode_reallocate is executed in a distinct transaction The fact the dnode_allocate/dnode_reallocate are executed in one transaction and bonus (re-)population is executed in a different transaction may lead to violation of ZFS consistency assertions if the transactions are assigned to different transaction groups. Also, if the first transaction group is successfully written to a permanent storage, but the second transaction is lost, then an invalid dnode may be created on the stable storage. 3693 restore_object uses at least two transactions to restore an object Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Andriy Gapon <andriy.gapon@hybridcluster.com> Approved by: Robert Mustacchi <rm@joyent.com> Original authors: Matthew Ahrens and Andriy Gapon References: https://www.illumos.org/issues/3693 https://github.com/illumos/illumos-gate/commit/e77d42e Ported by: Turbo Fredriksson <turbo@bayour.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2689
This commit is contained in:
parent
356d9ed4c8
commit
6c59307a3c
|
@ -23,6 +23,7 @@
|
||||||
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
|
||||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||||
|
* Copyright 2014 HybridCluster. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Portions Copyright 2010 Robert Milkowski */
|
/* Portions Copyright 2010 Robert Milkowski */
|
||||||
|
@ -336,7 +337,7 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
|
||||||
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
||||||
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
|
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
|
||||||
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
||||||
int blocksize, dmu_object_type_t bonustype, int bonuslen);
|
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Free an object from this objset.
|
* Free an object from this objset.
|
||||||
|
@ -652,7 +653,8 @@ typedef struct dmu_object_info {
|
||||||
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
|
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
|
||||||
uint8_t doi_checksum;
|
uint8_t doi_checksum;
|
||||||
uint8_t doi_compress;
|
uint8_t doi_compress;
|
||||||
uint8_t doi_pad[5];
|
uint8_t doi_nblkptr;
|
||||||
|
uint8_t doi_pad[4];
|
||||||
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
|
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
|
||||||
uint64_t doi_max_offset;
|
uint64_t doi_max_offset;
|
||||||
uint64_t doi_fill_count; /* number of non-empty blocks */
|
uint64_t doi_fill_count; /* number of non-empty blocks */
|
||||||
|
|
|
@ -1894,6 +1894,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
|
||||||
doi->doi_indirection = dn->dn_nlevels;
|
doi->doi_indirection = dn->dn_nlevels;
|
||||||
doi->doi_checksum = dn->dn_checksum;
|
doi->doi_checksum = dn->dn_checksum;
|
||||||
doi->doi_compress = dn->dn_compress;
|
doi->doi_compress = dn->dn_compress;
|
||||||
|
doi->doi_nblkptr = dn->dn_nblkptr;
|
||||||
doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
|
doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
|
||||||
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
|
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
|
||||||
doi->doi_fill_count = 0;
|
doi->doi_fill_count = 0;
|
||||||
|
|
|
@ -20,7 +20,8 @@
|
||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2013 by Delphix. All rights reserved.
|
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
|
||||||
|
* Copyright 2014 HybridCluster. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sys/dmu.h>
|
#include <sys/dmu.h>
|
||||||
|
@ -107,11 +108,9 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
||||||
|
|
||||||
int
|
int
|
||||||
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
||||||
int blocksize, dmu_object_type_t bonustype, int bonuslen)
|
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
|
||||||
{
|
{
|
||||||
dnode_t *dn;
|
dnode_t *dn;
|
||||||
dmu_tx_t *tx;
|
|
||||||
int nblkptr;
|
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
if (object == DMU_META_DNODE_OBJECT)
|
if (object == DMU_META_DNODE_OBJECT)
|
||||||
|
@ -122,44 +121,9 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
|
||||||
if (err)
|
if (err)
|
||||||
return (err);
|
return (err);
|
||||||
|
|
||||||
if (dn->dn_type == ot && dn->dn_datablksz == blocksize &&
|
|
||||||
dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) {
|
|
||||||
/* nothing is changing, this is a noop */
|
|
||||||
dnode_rele(dn, FTAG);
|
|
||||||
return (0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bonustype == DMU_OT_SA) {
|
|
||||||
nblkptr = 1;
|
|
||||||
} else {
|
|
||||||
nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If we are losing blkptrs or changing the block size this must
|
|
||||||
* be a new file instance. We must clear out the previous file
|
|
||||||
* contents before we can change this type of metadata in the dnode.
|
|
||||||
*/
|
|
||||||
if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) {
|
|
||||||
err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
|
|
||||||
if (err)
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
tx = dmu_tx_create(os);
|
|
||||||
dmu_tx_hold_bonus(tx, object);
|
|
||||||
err = dmu_tx_assign(tx, TXG_WAIT);
|
|
||||||
if (err) {
|
|
||||||
dmu_tx_abort(tx);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
|
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
|
||||||
|
|
||||||
dmu_tx_commit(tx);
|
|
||||||
out:
|
|
||||||
dnode_rele(dn, FTAG);
|
dnode_rele(dn, FTAG);
|
||||||
|
|
||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1328,12 +1328,25 @@ backup_byteswap(dmu_replay_record_t *drr)
|
||||||
#undef DO32
|
#undef DO32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline uint8_t
|
||||||
|
deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
|
||||||
|
{
|
||||||
|
if (bonus_type == DMU_OT_SA) {
|
||||||
|
return (1);
|
||||||
|
} else {
|
||||||
|
return (1 +
|
||||||
|
((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
noinline static int
|
noinline static int
|
||||||
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
|
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
|
||||||
{
|
{
|
||||||
int err;
|
dmu_object_info_t doi;
|
||||||
dmu_tx_t *tx;
|
dmu_tx_t *tx;
|
||||||
void *data = NULL;
|
void *data = NULL;
|
||||||
|
uint64_t object;
|
||||||
|
int err;
|
||||||
|
|
||||||
if (drro->drr_type == DMU_OT_NONE ||
|
if (drro->drr_type == DMU_OT_NONE ||
|
||||||
!DMU_OT_IS_VALID(drro->drr_type) ||
|
!DMU_OT_IS_VALID(drro->drr_type) ||
|
||||||
|
@ -1347,10 +1360,11 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
|
||||||
return (SET_ERROR(EINVAL));
|
return (SET_ERROR(EINVAL));
|
||||||
}
|
}
|
||||||
|
|
||||||
err = dmu_object_info(os, drro->drr_object, NULL);
|
err = dmu_object_info(os, drro->drr_object, &doi);
|
||||||
|
|
||||||
if (err != 0 && err != ENOENT)
|
if (err != 0 && err != ENOENT)
|
||||||
return (SET_ERROR(EINVAL));
|
return (SET_ERROR(EINVAL));
|
||||||
|
object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
|
||||||
|
|
||||||
if (drro->drr_bonuslen) {
|
if (drro->drr_bonuslen) {
|
||||||
data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
|
data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
|
||||||
|
@ -1358,37 +1372,53 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
|
||||||
return (ra->err);
|
return (ra->err);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (err == ENOENT) {
|
/*
|
||||||
/* currently free, want to be allocated */
|
* If we are losing blkptrs or changing the block size this must
|
||||||
tx = dmu_tx_create(os);
|
* be a new file instance. We must clear out the previous file
|
||||||
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
|
* contents before we can change this type of metadata in the dnode.
|
||||||
err = dmu_tx_assign(tx, TXG_WAIT);
|
*/
|
||||||
if (err != 0) {
|
if (err == 0) {
|
||||||
dmu_tx_abort(tx);
|
int nblkptr;
|
||||||
return (err);
|
|
||||||
}
|
nblkptr = deduce_nblkptr(drro->drr_bonustype,
|
||||||
err = dmu_object_claim(os, drro->drr_object,
|
drro->drr_bonuslen);
|
||||||
drro->drr_type, drro->drr_blksz,
|
|
||||||
drro->drr_bonustype, drro->drr_bonuslen, tx);
|
if (drro->drr_blksz != doi.doi_data_block_size ||
|
||||||
dmu_tx_commit(tx);
|
nblkptr < doi.doi_nblkptr) {
|
||||||
} else {
|
err = dmu_free_long_range(os, drro->drr_object,
|
||||||
/* currently allocated, want to be allocated */
|
0, DMU_OBJECT_END);
|
||||||
err = dmu_object_reclaim(os, drro->drr_object,
|
if (err != 0)
|
||||||
drro->drr_type, drro->drr_blksz,
|
|
||||||
drro->drr_bonustype, drro->drr_bonuslen);
|
|
||||||
}
|
|
||||||
if (err != 0) {
|
|
||||||
return (SET_ERROR(EINVAL));
|
return (SET_ERROR(EINVAL));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
tx = dmu_tx_create(os);
|
tx = dmu_tx_create(os);
|
||||||
dmu_tx_hold_bonus(tx, drro->drr_object);
|
dmu_tx_hold_bonus(tx, object);
|
||||||
err = dmu_tx_assign(tx, TXG_WAIT);
|
err = dmu_tx_assign(tx, TXG_WAIT);
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
dmu_tx_abort(tx);
|
dmu_tx_abort(tx);
|
||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (object == DMU_NEW_OBJECT) {
|
||||||
|
/* currently free, want to be allocated */
|
||||||
|
err = dmu_object_claim(os, drro->drr_object,
|
||||||
|
drro->drr_type, drro->drr_blksz,
|
||||||
|
drro->drr_bonustype, drro->drr_bonuslen, tx);
|
||||||
|
} else if (drro->drr_type != doi.doi_type ||
|
||||||
|
drro->drr_blksz != doi.doi_data_block_size ||
|
||||||
|
drro->drr_bonustype != doi.doi_bonus_type ||
|
||||||
|
drro->drr_bonuslen != doi.doi_bonus_size) {
|
||||||
|
/* currently allocated, but with different properties */
|
||||||
|
err = dmu_object_reclaim(os, drro->drr_object,
|
||||||
|
drro->drr_type, drro->drr_blksz,
|
||||||
|
drro->drr_bonustype, drro->drr_bonuslen, tx);
|
||||||
|
}
|
||||||
|
if (err != 0) {
|
||||||
|
dmu_tx_commit(tx);
|
||||||
|
return (SET_ERROR(EINVAL));
|
||||||
|
}
|
||||||
|
|
||||||
dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
|
dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
|
||||||
tx);
|
tx);
|
||||||
dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
|
dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
|
||||||
|
|
Loading…
Reference in New Issue