From 369aa501d11f4d21d4732b58d749259ad811a10a Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Wed, 13 Mar 2019 13:52:01 -0400 Subject: [PATCH] Fix handling of maxblkid for raw sends Currently, the receive code can create an unreadable dataset from a correct raw send stream. This is because it is currently impossible to set maxblkid to a lower value without freeing the associated object. This means truncating files on the send side to a non-0 size could result in corruption. This patch solves this issue by adding a new 'force' flag to dnode_new_blkid() which will allow the raw receive code to force the DMU to accept the provided maxblkid even if it is a lower value than the existing one. For testing purposes the send_encrypted_files.ksh test has been extended to include a variety of truncated files and multiple snapshots. It also now leverages the xattrtest command to help ensure raw receives correctly handle xattrs. Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Reviewed-by: Matt Ahrens Signed-off-by: Tom Caputi Closes #8168 Closes #8487 --- include/sys/dmu.h | 3 +- include/sys/dnode.h | 9 ++- module/zfs/dbuf.c | 3 +- module/zfs/dmu.c | 2 +- module/zfs/dmu_recv.c | 62 +++++++++++--- module/zfs/dnode.c | 26 ++++-- module/zfs/dnode_sync.c | 13 ++- module/zfs/dsl_crypt.c | 2 +- tests/zfs-tests/cmd/xattrtest/xattrtest.c | 10 --- .../functional/rsend/send_encrypted_files.ksh | 81 +++++++++++++++---- 10 files changed, 156 insertions(+), 55 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index e4c2ebc2f2..93d05aac42 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -477,7 +477,8 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, /* * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly - * to accommodate the change. + * to accommodate the change. When calling this function, the caller must + * ensure that the object's nlevels can sufficiently support the new maxblkid. */ int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx); diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 48ef927d4a..6355aaa502 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -371,6 +371,12 @@ struct dnode { struct zfetch dn_zfetch; }; +/* + * We use this (otherwise unused) bit to indicate if the value of + * dn_next_maxblkid[txgoff] is valid to use in dnode_sync(). + */ +#define DMU_NEXT_MAXBLKID_SET (1ULL << 63) + /* * Adds a level of indirection between the dbuf and the dnode to avoid * iterating descendent dbufs in dnode_move(). Handles are not allocated @@ -423,7 +429,8 @@ int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, + boolean_t have_read, boolean_t force); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 28ff5fc7e4..e6ce2bca52 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2116,7 +2116,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); - dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + dnode_new_blkid(dn, db->db_blkid, tx, + drop_struct_lock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 36fc8815cd..1621772840 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2155,7 +2155,7 @@ dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - dnode_new_blkid(dn, maxblkid, tx, B_FALSE); + dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index e05b5ad821..8794603185 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1179,10 +1179,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, object = drro->drr_object; - /* nblkptr will be bounded by the bonus size and type */ + /* nblkptr should be bounded by the bonus size and type */ if (rwa->raw && nblkptr != drro->drr_nblkptr) return (SET_ERROR(EINVAL)); + /* + * Check for indicators that the object was freed and + * reallocated. For all sends, these indicators are: + * - A changed block size + * - A smaller nblkptr + * - A changed dnode size + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr || dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || @@ -1197,13 +1209,14 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, /* * The dmu does not currently support decreasing nlevels - * on an object. For non-raw sends, this does not matter - * and the new object can just use the previous one's nlevels. - * For raw sends, however, the structure of the received dnode - * (including nlevels) must match that of the send side. - * Therefore, instead of using dmu_object_reclaim(), we must - * free the object completely and call dmu_object_claim_dnsize() - * instead. + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. */ if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { @@ -1214,6 +1227,23 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, txg_wait_synced(dmu_objset_pool(rwa->os), 0); object = DMU_NEW_OBJECT; } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. + */ + if (rwa->raw) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + (drro->drr_maxblkid + 1) * drro->drr_blksz, + DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1333,14 +1363,24 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, /* handle more restrictive dnode structuring for raw recvs */ if (rwa->raw) { /* - * Set the indirect block shift and nlevels. This will not fail - * because we ensured all of the blocks were free earlier if - * this is a new object. + * Set the indirect block size, block shift, nlevels. + * This will not fail because we ensured all of the + * blocks were freed earlier if this is a new object. + * For non-new objects block size and indirect block + * shift cannot change and nlevels can only increase. */ VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, drro->drr_nlevels, tx)); + + /* + * Set the maxblkid. We will never free the first block of + * an object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. + * This will always succeed because we freed all blocks + * beyond the new maxblkid above. + */ VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, drro->drr_maxblkid, tx)); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 9a19ddabba..35aefa7cb9 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1828,7 +1828,8 @@ out: /* read-holding callers must not rely on the lock being continuously held */ void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, + boolean_t force) { int epbs, new_nlevels; uint64_t sz; @@ -1853,14 +1854,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) } } - if (blkid <= dn->dn_maxblkid) + /* + * Raw sends (indicated by the force flag) require that we take the + * given blkid even if the value is lower than the current value. + */ + if (!force && blkid <= dn->dn_maxblkid) goto out; + /* + * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] + * to indicate that this field is set. This allows us to set the + * maxblkid to 0 on an existing object in dnode_sync(). + */ dn->dn_maxblkid = blkid; - dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid; + dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = + blkid | DMU_NEXT_MAXBLKID_SET; /* * Compute the number of levels necessary to support the new maxblkid. + * Raw sends will ensure nlevels is set correctly for us. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -1870,8 +1882,12 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS); - if (new_nlevels > dn->dn_nlevels) - dnode_set_nlevels_impl(dn, new_nlevels, tx); + if (!force) { + if (new_nlevels > dn->dn_nlevels) + dnode_set_nlevels_impl(dn, new_nlevels, tx); + } else { + ASSERT3U(dn->dn_nlevels, >=, new_nlevels); + } out: if (have_read) diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index c4062beb31..581f812a14 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -384,12 +384,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, } } - /* - * Do not truncate the maxblkid if we are performing a raw - * receive. The raw receive sets the mablkid manually and - * must not be overriden. - */ - if (trunc && !dn->dn_objset->os_raw_receive) { + if (trunc) { ASSERTV(uint64_t off); dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; @@ -765,11 +760,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) /* * This must be done after dnode_sync_free_range() - * and dnode_increase_indirection(). + * and dnode_increase_indirection(). See dnode_new_blkid() + * for an explanation of the high bit being set. */ if (dn->dn_next_maxblkid[txgoff]) { mutex_enter(&dn->dn_mtx); - dnp->dn_maxblkid = dn->dn_next_maxblkid[txgoff]; + dnp->dn_maxblkid = + dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET; dn->dn_next_maxblkid[txgoff] = 0; mutex_exit(&dn->dn_mtx); } diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index da2a126f2e..9271128b96 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -2099,7 +2099,7 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, mdn->dn_checksum = checksum; rw_enter(&mdn->dn_struct_rwlock, RW_WRITER); - dnode_new_blkid(mdn, maxblkid, tx, B_FALSE); + dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&mdn->dn_struct_rwlock); /* diff --git a/tests/zfs-tests/cmd/xattrtest/xattrtest.c b/tests/zfs-tests/cmd/xattrtest/xattrtest.c index 32a6b1d95b..42c510ed08 100644 --- a/tests/zfs-tests/cmd/xattrtest/xattrtest.c +++ b/tests/zfs-tests/cmd/xattrtest/xattrtest.c @@ -144,11 +144,6 @@ parse_args(int argc, char **argv) break; case 'y': verify = 1; - if (phase != PHASE_ALL) { - fprintf(stderr, - "Error: -y and -o are incompatible.\n"); - rc = 1; - } break; case 'n': nth = strtol(optarg, NULL, 0); @@ -201,11 +196,6 @@ parse_args(int argc, char **argv) PHASE_ALL, PHASE_INVAL); rc = 1; } - if (verify == 1) { - fprintf(stderr, - "Error: -y and -o are incompatible.\n"); - rc = 1; - } break; default: rc = 1; diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh index 5bf25e2773..d981aa3fd2 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh @@ -30,12 +30,18 @@ # 3. Add a small 512 byte file to the filesystem # 4. Add a larger 32M file to the filesystem # 5. Add a large sparse file to the filesystem -# 6. Add a file truncated to 4M to the filesystem -# 7. Add a sparse file with metadata compression disabled to the filesystem -# 8. Add and remove 1000 empty files to the filesystem -# 9. Add a file with a large xattr value -# 10. Snapshot the filesystem -# 11. Send and receive the filesystem, ensuring that it can be mounted +# 6. Add a 3 files that are to be truncated later +# 7. Add 1000 empty files to the filesystem +# 8. Add a file with a large xattr value +# 9. Use xattrtest to create files with random xattrs (with and without xattrs=on) +# 10. Take a snapshot of the filesystem +# 11. Truncate one of the files from 32M to 128k +# 12. Truncate one of the files from 512k to 384k +# 13. Truncate one of the files from 512k to 0 to 384k +# 14. Remove the 1000 empty files to the filesystem +# 15. Take another snapshot of the filesystem +# 16. Send and receive both snapshots +# 17. Mount the filesystem and check the contents # verify_runnable "both" @@ -51,46 +57,89 @@ function cleanup } log_onexit cleanup +function recursive_cksum +{ + find $1 -type f -exec sha256sum {} \; | \ + sort -k 2 | awk '{ print $1 }' | sha256sum +} + log_assert "Verify 'zfs send -w' works with many different file layouts" typeset keyfile=/$TESTPOOL/pkey typeset sendfile=/$TESTPOOL/sendfile +typeset sendfile2=/$TESTPOOL/sendfile2 +# Create an encrypted dataset log_must eval "echo 'password' > $keyfile" log_must zfs create -o encryption=on -o keyformat=passphrase \ -o keylocation=file://$keyfile $TESTPOOL/$TESTFS2 +# Create files with vaired layouts on disk log_must touch /$TESTPOOL/$TESTFS2/empty log_must mkfile 512 /$TESTPOOL/$TESTFS2/small log_must mkfile 32M /$TESTPOOL/$TESTFS2/full log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/sparse \ - bs=512 count=1 seek=10G >/dev/null 2>&1 + bs=512 count=1 seek=1048576 >/dev/null 2>&1 log_must mkfile 32M /$TESTPOOL/$TESTFS2/truncated -log_must truncate -s 4M /$TESTPOOL/$TESTFS2/truncated -sync +log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated2 +log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated3 log_must mkdir -p /$TESTPOOL/$TESTFS2/dir for i in {1..1000}; do log_must mkfile 512 /$TESTPOOL/$TESTFS2/dir/file-$i done -sync +log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrondir +log_must zfs set xattr=on $TESTPOOL/$TESTFS2 +log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrondir +log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrsadir +log_must zfs set xattr=sa $TESTPOOL/$TESTFS2 +log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrsadir + +# ZoL issue #7432 +log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2 +log_must touch /$TESTPOOL/$TESTFS2/attrs +log_must eval "python -c 'print \"a\" * 4096' | \ + attr -s bigval /$TESTPOOL/$TESTFS2/attrs" + +log_must zfs snapshot $TESTPOOL/$TESTFS2@snap1 + +# +# Truncate files created in the first snapshot. The first tests +# truncating a large file to a single block. The second tests +# truncating one block off the end of a file without changing +# the required nlevels to hold it. The last tests handling +# of a maxblkid that is dropped and then raised again. +# +log_must truncate -s 131072 /$TESTPOOL/$TESTFS2/truncated +log_must truncate -s 393216 /$TESTPOOL/$TESTFS2/truncated2 +log_must truncate -s 0 /$TESTPOOL/$TESTFS2/truncated3 +log_must zpool sync $TESTPOOL +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/truncated3 \ + bs=128k count=3 iflag=fullblock + +# Remove the empty files created in the first snapshot for i in {1..1000}; do log_must rm /$TESTPOOL/$TESTFS2/dir/file-$i done sync -# ZoL issue #7432 -log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2 -log_must touch /$TESTPOOL/$TESTFS2/attrs -log_must eval "python -c 'print \"a\" * 4096' | attr -s bigval /$TESTPOOL/$TESTFS2/attrs" +log_must zfs snapshot $TESTPOOL/$TESTFS2@snap2 +expected_cksum=$(recursive_cksum /$TESTPOOL/$TESTFS2) -log_must zfs snapshot $TESTPOOL/$TESTFS2@now -log_must eval "zfs send -wR $TESTPOOL/$TESTFS2@now > $sendfile" +log_must eval "zfs send -wp $TESTPOOL/$TESTFS2@snap1 > $sendfile" +log_must eval "zfs send -wp -i @snap1 $TESTPOOL/$TESTFS2@snap2 > $sendfile2" log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile" +log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile2" log_must zfs load-key $TESTPOOL/recv log_must zfs mount -a +actual_cksum=$(recursive_cksum /$TESTPOOL/recv) +[[ "$expected_cksum" != "$actual_cksum" ]] && \ + log_fail "Recursive checksums differ ($expected_cksum != $actual_cksum)" + +log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrondir +log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrsadir log_pass "Verified 'zfs send -w' works with many different file layouts"