Fix handling of maxblkid for raw sends
Currently, the receive code can create an unreadable dataset from a correct raw send stream. This is because it is currently impossible to set maxblkid to a lower value without freeing the associated object. This means truncating files on the send side to a non-0 size could result in corruption. This patch solves this issue by adding a new 'force' flag to dnode_new_blkid() which will allow the raw receive code to force the DMU to accept the provided maxblkid even if it is a lower value than the existing one. For testing purposes the send_encrypted_files.ksh test has been extended to include a variety of truncated files and multiple snapshots. It also now leverages the xattrtest command to help ensure raw receives correctly handle xattrs. Reviewed-by: Paul Dagnelie <pcd@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Signed-off-by: Tom Caputi <tcaputi@datto.com> Closes #8168 Closes #8487
This commit is contained in:
parent
146bdc414c
commit
369aa501d1
|
@ -477,7 +477,8 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
|
|||
|
||||
/*
|
||||
* Manually set the maxblkid on a dnode. This will adjust nlevels accordingly
|
||||
* to accommodate the change.
|
||||
* to accommodate the change. When calling this function, the caller must
|
||||
* ensure that the object's nlevels can sufficiently support the new maxblkid.
|
||||
*/
|
||||
int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
|
||||
dmu_tx_t *tx);
|
||||
|
|
|
@ -371,6 +371,12 @@ struct dnode {
|
|||
struct zfetch dn_zfetch;
|
||||
};
|
||||
|
||||
/*
|
||||
* We use this (otherwise unused) bit to indicate if the value of
|
||||
* dn_next_maxblkid[txgoff] is valid to use in dnode_sync().
|
||||
*/
|
||||
#define DMU_NEXT_MAXBLKID_SET (1ULL << 63)
|
||||
|
||||
/*
|
||||
* Adds a level of indirection between the dbuf and the dnode to avoid
|
||||
* iterating descendent dbufs in dnode_move(). Handles are not allocated
|
||||
|
@ -423,7 +429,8 @@ int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx);
|
|||
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
|
||||
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
|
||||
void dnode_diduse_space(dnode_t *dn, int64_t space);
|
||||
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
|
||||
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx,
|
||||
boolean_t have_read, boolean_t force);
|
||||
uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
|
||||
void dnode_init(void);
|
||||
void dnode_fini(void);
|
||||
|
|
|
@ -2116,7 +2116,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|||
if (db->db_level == 0) {
|
||||
ASSERT(!db->db_objset->os_raw_receive ||
|
||||
dn->dn_maxblkid >= db->db_blkid);
|
||||
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
|
||||
dnode_new_blkid(dn, db->db_blkid, tx,
|
||||
drop_struct_lock, B_FALSE);
|
||||
ASSERT(dn->dn_maxblkid >= db->db_blkid);
|
||||
}
|
||||
|
||||
|
|
|
@ -2155,7 +2155,7 @@ dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
|
|||
if (err)
|
||||
return (err);
|
||||
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
|
||||
dnode_new_blkid(dn, maxblkid, tx, B_FALSE);
|
||||
dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
|
||||
rw_exit(&dn->dn_struct_rwlock);
|
||||
dnode_rele(dn, FTAG);
|
||||
return (0);
|
||||
|
|
|
@ -1179,10 +1179,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
|
|||
|
||||
object = drro->drr_object;
|
||||
|
||||
/* nblkptr will be bounded by the bonus size and type */
|
||||
/* nblkptr should be bounded by the bonus size and type */
|
||||
if (rwa->raw && nblkptr != drro->drr_nblkptr)
|
||||
return (SET_ERROR(EINVAL));
|
||||
|
||||
/*
|
||||
* Check for indicators that the object was freed and
|
||||
* reallocated. For all sends, these indicators are:
|
||||
* - A changed block size
|
||||
* - A smaller nblkptr
|
||||
* - A changed dnode size
|
||||
* For raw sends we also check a few other fields to
|
||||
* ensure we are preserving the objset structure exactly
|
||||
* as it was on the receive side:
|
||||
* - A changed indirect block size
|
||||
* - A smaller nlevels
|
||||
*/
|
||||
if (drro->drr_blksz != doi.doi_data_block_size ||
|
||||
nblkptr < doi.doi_nblkptr ||
|
||||
dn_slots != doi.doi_dnodesize >> DNODE_SHIFT ||
|
||||
|
@ -1197,13 +1209,14 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
|
|||
|
||||
/*
|
||||
* The dmu does not currently support decreasing nlevels
|
||||
* on an object. For non-raw sends, this does not matter
|
||||
* and the new object can just use the previous one's nlevels.
|
||||
* For raw sends, however, the structure of the received dnode
|
||||
* (including nlevels) must match that of the send side.
|
||||
* Therefore, instead of using dmu_object_reclaim(), we must
|
||||
* free the object completely and call dmu_object_claim_dnsize()
|
||||
* instead.
|
||||
* or changing the number of dnode slots on an object. For
|
||||
* non-raw sends, this does not matter and the new object
|
||||
* can just use the previous one's nlevels. For raw sends,
|
||||
* however, the structure of the received dnode (including
|
||||
* nlevels and dnode slots) must match that of the send
|
||||
* side. Therefore, instead of using dmu_object_reclaim(),
|
||||
* we must free the object completely and call
|
||||
* dmu_object_claim_dnsize() instead.
|
||||
*/
|
||||
if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
|
||||
dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
|
||||
|
@ -1214,6 +1227,23 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
|
|||
txg_wait_synced(dmu_objset_pool(rwa->os), 0);
|
||||
object = DMU_NEW_OBJECT;
|
||||
}
|
||||
|
||||
/*
|
||||
* For raw receives, free everything beyond the new incoming
|
||||
* maxblkid. Normally this would be done with a DRR_FREE
|
||||
* record that would come after this DRR_OBJECT record is
|
||||
* processed. However, for raw receives we manually set the
|
||||
* maxblkid from the drr_maxblkid and so we must first free
|
||||
* everything above that blkid to ensure the DMU is always
|
||||
* consistent with itself.
|
||||
*/
|
||||
if (rwa->raw) {
|
||||
err = dmu_free_long_range(rwa->os, drro->drr_object,
|
||||
(drro->drr_maxblkid + 1) * drro->drr_blksz,
|
||||
DMU_OBJECT_END);
|
||||
if (err != 0)
|
||||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
} else if (err == EEXIST) {
|
||||
/*
|
||||
* The object requested is currently an interior slot of a
|
||||
|
@ -1333,14 +1363,24 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
|
|||
/* handle more restrictive dnode structuring for raw recvs */
|
||||
if (rwa->raw) {
|
||||
/*
|
||||
* Set the indirect block shift and nlevels. This will not fail
|
||||
* because we ensured all of the blocks were free earlier if
|
||||
* this is a new object.
|
||||
* Set the indirect block size, block shift, nlevels.
|
||||
* This will not fail because we ensured all of the
|
||||
* blocks were freed earlier if this is a new object.
|
||||
* For non-new objects block size and indirect block
|
||||
* shift cannot change and nlevels can only increase.
|
||||
*/
|
||||
VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
|
||||
drro->drr_blksz, drro->drr_indblkshift, tx));
|
||||
VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
|
||||
drro->drr_nlevels, tx));
|
||||
|
||||
/*
|
||||
* Set the maxblkid. We will never free the first block of
|
||||
* an object here because a maxblkid of 0 could indicate
|
||||
* an object with a single block or one with no blocks.
|
||||
* This will always succeed because we freed all blocks
|
||||
* beyond the new maxblkid above.
|
||||
*/
|
||||
VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
|
||||
drro->drr_maxblkid, tx));
|
||||
}
|
||||
|
|
|
@ -1828,7 +1828,8 @@ out:
|
|||
|
||||
/* read-holding callers must not rely on the lock being continuously held */
|
||||
void
|
||||
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
|
||||
dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
|
||||
boolean_t force)
|
||||
{
|
||||
int epbs, new_nlevels;
|
||||
uint64_t sz;
|
||||
|
@ -1853,14 +1854,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
|
|||
}
|
||||
}
|
||||
|
||||
if (blkid <= dn->dn_maxblkid)
|
||||
/*
|
||||
* Raw sends (indicated by the force flag) require that we take the
|
||||
* given blkid even if the value is lower than the current value.
|
||||
*/
|
||||
if (!force && blkid <= dn->dn_maxblkid)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
|
||||
* to indicate that this field is set. This allows us to set the
|
||||
* maxblkid to 0 on an existing object in dnode_sync().
|
||||
*/
|
||||
dn->dn_maxblkid = blkid;
|
||||
dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid;
|
||||
dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
|
||||
blkid | DMU_NEXT_MAXBLKID_SET;
|
||||
|
||||
/*
|
||||
* Compute the number of levels necessary to support the new maxblkid.
|
||||
* Raw sends will ensure nlevels is set correctly for us.
|
||||
*/
|
||||
new_nlevels = 1;
|
||||
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
|
||||
|
@ -1870,8 +1882,12 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
|
|||
|
||||
ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
|
||||
|
||||
if (new_nlevels > dn->dn_nlevels)
|
||||
dnode_set_nlevels_impl(dn, new_nlevels, tx);
|
||||
if (!force) {
|
||||
if (new_nlevels > dn->dn_nlevels)
|
||||
dnode_set_nlevels_impl(dn, new_nlevels, tx);
|
||||
} else {
|
||||
ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
|
||||
}
|
||||
|
||||
out:
|
||||
if (have_read)
|
||||
|
|
|
@ -384,12 +384,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not truncate the maxblkid if we are performing a raw
|
||||
* receive. The raw receive sets the mablkid manually and
|
||||
* must not be overriden.
|
||||
*/
|
||||
if (trunc && !dn->dn_objset->os_raw_receive) {
|
||||
if (trunc) {
|
||||
ASSERTV(uint64_t off);
|
||||
dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
|
||||
|
||||
|
@ -765,11 +760,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
|
|||
|
||||
/*
|
||||
* This must be done after dnode_sync_free_range()
|
||||
* and dnode_increase_indirection().
|
||||
* and dnode_increase_indirection(). See dnode_new_blkid()
|
||||
* for an explanation of the high bit being set.
|
||||
*/
|
||||
if (dn->dn_next_maxblkid[txgoff]) {
|
||||
mutex_enter(&dn->dn_mtx);
|
||||
dnp->dn_maxblkid = dn->dn_next_maxblkid[txgoff];
|
||||
dnp->dn_maxblkid =
|
||||
dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
|
||||
dn->dn_next_maxblkid[txgoff] = 0;
|
||||
mutex_exit(&dn->dn_mtx);
|
||||
}
|
||||
|
|
|
@ -2099,7 +2099,7 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
|
|||
mdn->dn_checksum = checksum;
|
||||
|
||||
rw_enter(&mdn->dn_struct_rwlock, RW_WRITER);
|
||||
dnode_new_blkid(mdn, maxblkid, tx, B_FALSE);
|
||||
dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE);
|
||||
rw_exit(&mdn->dn_struct_rwlock);
|
||||
|
||||
/*
|
||||
|
|
|
@ -144,11 +144,6 @@ parse_args(int argc, char **argv)
|
|||
break;
|
||||
case 'y':
|
||||
verify = 1;
|
||||
if (phase != PHASE_ALL) {
|
||||
fprintf(stderr,
|
||||
"Error: -y and -o are incompatible.\n");
|
||||
rc = 1;
|
||||
}
|
||||
break;
|
||||
case 'n':
|
||||
nth = strtol(optarg, NULL, 0);
|
||||
|
@ -201,11 +196,6 @@ parse_args(int argc, char **argv)
|
|||
PHASE_ALL, PHASE_INVAL);
|
||||
rc = 1;
|
||||
}
|
||||
if (verify == 1) {
|
||||
fprintf(stderr,
|
||||
"Error: -y and -o are incompatible.\n");
|
||||
rc = 1;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
rc = 1;
|
||||
|
|
|
@ -30,12 +30,18 @@
|
|||
# 3. Add a small 512 byte file to the filesystem
|
||||
# 4. Add a larger 32M file to the filesystem
|
||||
# 5. Add a large sparse file to the filesystem
|
||||
# 6. Add a file truncated to 4M to the filesystem
|
||||
# 7. Add a sparse file with metadata compression disabled to the filesystem
|
||||
# 8. Add and remove 1000 empty files to the filesystem
|
||||
# 9. Add a file with a large xattr value
|
||||
# 10. Snapshot the filesystem
|
||||
# 11. Send and receive the filesystem, ensuring that it can be mounted
|
||||
# 6. Add a 3 files that are to be truncated later
|
||||
# 7. Add 1000 empty files to the filesystem
|
||||
# 8. Add a file with a large xattr value
|
||||
# 9. Use xattrtest to create files with random xattrs (with and without xattrs=on)
|
||||
# 10. Take a snapshot of the filesystem
|
||||
# 11. Truncate one of the files from 32M to 128k
|
||||
# 12. Truncate one of the files from 512k to 384k
|
||||
# 13. Truncate one of the files from 512k to 0 to 384k
|
||||
# 14. Remove the 1000 empty files to the filesystem
|
||||
# 15. Take another snapshot of the filesystem
|
||||
# 16. Send and receive both snapshots
|
||||
# 17. Mount the filesystem and check the contents
|
||||
#
|
||||
|
||||
verify_runnable "both"
|
||||
|
@ -51,46 +57,89 @@ function cleanup
|
|||
}
|
||||
log_onexit cleanup
|
||||
|
||||
function recursive_cksum
|
||||
{
|
||||
find $1 -type f -exec sha256sum {} \; | \
|
||||
sort -k 2 | awk '{ print $1 }' | sha256sum
|
||||
}
|
||||
|
||||
log_assert "Verify 'zfs send -w' works with many different file layouts"
|
||||
|
||||
typeset keyfile=/$TESTPOOL/pkey
|
||||
typeset sendfile=/$TESTPOOL/sendfile
|
||||
typeset sendfile2=/$TESTPOOL/sendfile2
|
||||
|
||||
# Create an encrypted dataset
|
||||
log_must eval "echo 'password' > $keyfile"
|
||||
log_must zfs create -o encryption=on -o keyformat=passphrase \
|
||||
-o keylocation=file://$keyfile $TESTPOOL/$TESTFS2
|
||||
|
||||
# Create files with vaired layouts on disk
|
||||
log_must touch /$TESTPOOL/$TESTFS2/empty
|
||||
log_must mkfile 512 /$TESTPOOL/$TESTFS2/small
|
||||
log_must mkfile 32M /$TESTPOOL/$TESTFS2/full
|
||||
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/sparse \
|
||||
bs=512 count=1 seek=10G >/dev/null 2>&1
|
||||
bs=512 count=1 seek=1048576 >/dev/null 2>&1
|
||||
log_must mkfile 32M /$TESTPOOL/$TESTFS2/truncated
|
||||
log_must truncate -s 4M /$TESTPOOL/$TESTFS2/truncated
|
||||
sync
|
||||
log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated2
|
||||
log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated3
|
||||
|
||||
log_must mkdir -p /$TESTPOOL/$TESTFS2/dir
|
||||
for i in {1..1000}; do
|
||||
log_must mkfile 512 /$TESTPOOL/$TESTFS2/dir/file-$i
|
||||
done
|
||||
sync
|
||||
|
||||
log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrondir
|
||||
log_must zfs set xattr=on $TESTPOOL/$TESTFS2
|
||||
log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrondir
|
||||
log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrsadir
|
||||
log_must zfs set xattr=sa $TESTPOOL/$TESTFS2
|
||||
log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrsadir
|
||||
|
||||
# ZoL issue #7432
|
||||
log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2
|
||||
log_must touch /$TESTPOOL/$TESTFS2/attrs
|
||||
log_must eval "python -c 'print \"a\" * 4096' | \
|
||||
attr -s bigval /$TESTPOOL/$TESTFS2/attrs"
|
||||
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS2@snap1
|
||||
|
||||
#
|
||||
# Truncate files created in the first snapshot. The first tests
|
||||
# truncating a large file to a single block. The second tests
|
||||
# truncating one block off the end of a file without changing
|
||||
# the required nlevels to hold it. The last tests handling
|
||||
# of a maxblkid that is dropped and then raised again.
|
||||
#
|
||||
log_must truncate -s 131072 /$TESTPOOL/$TESTFS2/truncated
|
||||
log_must truncate -s 393216 /$TESTPOOL/$TESTFS2/truncated2
|
||||
log_must truncate -s 0 /$TESTPOOL/$TESTFS2/truncated3
|
||||
log_must zpool sync $TESTPOOL
|
||||
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/truncated3 \
|
||||
bs=128k count=3 iflag=fullblock
|
||||
|
||||
# Remove the empty files created in the first snapshot
|
||||
for i in {1..1000}; do
|
||||
log_must rm /$TESTPOOL/$TESTFS2/dir/file-$i
|
||||
done
|
||||
sync
|
||||
|
||||
# ZoL issue #7432
|
||||
log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2
|
||||
log_must touch /$TESTPOOL/$TESTFS2/attrs
|
||||
log_must eval "python -c 'print \"a\" * 4096' | attr -s bigval /$TESTPOOL/$TESTFS2/attrs"
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS2@snap2
|
||||
expected_cksum=$(recursive_cksum /$TESTPOOL/$TESTFS2)
|
||||
|
||||
log_must zfs snapshot $TESTPOOL/$TESTFS2@now
|
||||
log_must eval "zfs send -wR $TESTPOOL/$TESTFS2@now > $sendfile"
|
||||
log_must eval "zfs send -wp $TESTPOOL/$TESTFS2@snap1 > $sendfile"
|
||||
log_must eval "zfs send -wp -i @snap1 $TESTPOOL/$TESTFS2@snap2 > $sendfile2"
|
||||
|
||||
log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile"
|
||||
log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile2"
|
||||
log_must zfs load-key $TESTPOOL/recv
|
||||
|
||||
log_must zfs mount -a
|
||||
actual_cksum=$(recursive_cksum /$TESTPOOL/recv)
|
||||
[[ "$expected_cksum" != "$actual_cksum" ]] && \
|
||||
log_fail "Recursive checksums differ ($expected_cksum != $actual_cksum)"
|
||||
|
||||
log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrondir
|
||||
log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrsadir
|
||||
|
||||
log_pass "Verified 'zfs send -w' works with many different file layouts"
|
||||
|
|
Loading…
Reference in New Issue