diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index bb974133d1..4d958fe436 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -103,8 +103,8 @@ fatal(spa_t *spa, void *tag, const char *fmt, ...) /* ARGSUSED */ static int -space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) +space_delta_cb(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) { /* * Is it a valid type of object to track? diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 139f3cbdfe..5174bdc452 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1013,10 +1013,17 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); -typedef int objset_used_cb_t(dmu_object_type_t bonustype, - void *bonus, uint64_t *userp, uint64_t *groupp, uint64_t *projectp); +typedef struct zfs_file_info { + uint64_t zfi_user; + uint64_t zfi_group; + uint64_t zfi_project; + uint64_t zfi_generation; +} zfs_file_info_t; + +typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, + struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, - objset_used_cb_t *cb); + file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 9b6614e98b..a77131ef1c 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -254,6 +254,8 @@ boolean_t dmu_objset_projectquota_enabled(objset_t *os); boolean_t dmu_objset_projectquota_present(objset_t *os); boolean_t dmu_objset_projectquota_upgradable(objset_t *os); void dmu_objset_id_quota_upgrade(objset_t *os); +int dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, + const void *data, zfs_file_info_t *zfi); int dmu_fsname(const char *snapname, char *buf); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index ecdfd42d01..575a4af514 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1336,6 +1336,7 @@ typedef enum { ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, + ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, } zfs_errno_t; /* diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index d4ffe70bbe..78d33dedaf 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -107,6 +107,22 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_RAW (1 << 24) /* flag #25 is reserved for the ZSTD compression feature */ #define DMU_BACKUP_FEATURE_HOLDS (1 << 26) +/* + * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive + * incremental LARGE_BLOCKS streams (those with WRITE records of >128KB) even + * if the previous send did not use LARGE_BLOCKS, and thus its large blocks + * were split into multiple 128KB WRITE records. (See + * flush_write_batch_impl() and receive_object()). Older software that does + * not support this flag may encounter a bug when switching to large blocks, + * which causes files to incorrectly be zeroed. + * + * This flag is currently not set on any send streams. In the future, we + * intend for incremental send streams of snapshots that have large blocks to + * use LARGE_BLOCKS by default, and these streams will also have the + * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the + * default use of "zfs send" won't encounter the bug mentioned above. + */ +#define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) /* * Mask of all supported backup features @@ -116,7 +132,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ - DMU_BACKUP_FEATURE_REDACTED) + DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) diff --git a/include/sys/zfs_quota.h b/include/sys/zfs_quota.h index ec4dc8f163..b215b8dd00 100644 --- a/include/sys/zfs_quota.h +++ b/include/sys/zfs_quota.h @@ -24,23 +24,22 @@ #include #include -#include -extern int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp); +struct zfsvfs; +struct zfs_file_info_t; -extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valuep); -extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); -extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota); +extern int zpl_get_file_info(dmu_object_type_t, + const void *, struct zfs_file_info *); -extern boolean_t zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern boolean_t zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern boolean_t zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); +extern int zfs_userspace_one(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t *); +extern int zfs_userspace_many(struct zfsvfs *, zfs_userquota_prop_t, + uint64_t *, void *, uint64_t *); +extern int zfs_set_userquota(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t); + +extern boolean_t zfs_id_overobjquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overblockquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overquota(struct zfsvfs *, uint64_t, uint64_t); #endif diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 39b5c6df10..3fffc426c0 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -4802,6 +4802,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; + case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental send stream requires -L " + "(--large-block), to match previous receive.")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 3c37d3faa9..317773e444 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -2202,7 +2202,7 @@ zfs_init(void) */ zfs_vnodes_adjust(); - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); } diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 28ea34a00c..ea5971b0c5 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -2131,7 +2131,7 @@ zfs_init(void) { zfsctl_init(); zfs_znode_init(); - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 16b93a4fe4..d305cee409 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1728,19 +1728,29 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg) return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); } -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES]; void -dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb) { - used_cbs[ost] = cb; + file_cbs[ost] = cb; +} + +int +dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zfi) +{ + file_info_cb_t *cb = file_cbs[os->os_phys->os_type]; + if (cb == NULL) + return (EINVAL); + return (cb(bonustype, data, zfi)); } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] != NULL && + file_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } @@ -1754,7 +1764,7 @@ dmu_objset_userobjused_enabled(objset_t *os) boolean_t dmu_objset_projectquota_enabled(objset_t *os) { - return (used_cbs[os->os_phys->os_type] != NULL && + return (file_cbs[os->os_phys->os_type] != NULL && DMU_PROJECTUSED_DNODE(os) != NULL && spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA)); } @@ -2089,9 +2099,6 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; - uint64_t *user = NULL; - uint64_t *group = NULL; - uint64_t *project = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; @@ -2145,23 +2152,23 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) return; } - if (before) { - ASSERT(data); - user = &dn->dn_olduid; - group = &dn->dn_oldgid; - project = &dn->dn_oldprojid; - } else if (data) { - user = &dn->dn_newuid; - group = &dn->dn_newgid; - project = &dn->dn_newprojid; - } - /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ - error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, - user, group, project); + zfs_file_info_t zfi; + error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi); + + if (before) { + ASSERT(data); + dn->dn_olduid = zfi.zfi_user; + dn->dn_oldgid = zfi.zfi_group; + dn->dn_oldprojid = zfi.zfi_project; + } else if (data) { + dn->dn_newuid = zfi.zfi_user; + dn->dn_newgid = zfi.zfi_group; + dn->dn_newprojid = zfi.zfi_project; + } /* * Preserve existing uid/gid when the callback can't determine diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 29fbe854d7..17ebeb682d 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -104,6 +104,7 @@ struct receive_writer_arg { boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ + boolean_t full; /* this is a full send stream */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ @@ -333,6 +334,21 @@ redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) return (ret); } +/* + * If we previously received a stream with --large-block, we don't support + * receiving an incremental on top of it without --large-block. This avoids + * forcing a read-modify-write or trying to re-aggregate a string of WRITE + * records. + */ +static int +recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) +{ + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && + !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); + return (0); +} + static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) @@ -445,6 +461,12 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, return (SET_ERROR(EINVAL)); } + error = recv_check_large_blocks(snap, featureflags); + if (error != 0) { + dsl_dataset_rele(snap, FTAG); + return (error); + } + dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ @@ -479,7 +501,6 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, } return (0); - } /* @@ -725,6 +746,13 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) } } + error = recv_check_large_blocks(ds, featureflags); + if (error != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + dsl_dataset_rele_flags(origin, dsflags, FTAG); } @@ -1050,6 +1078,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) } } } + + error = recv_check_large_blocks(ds, drc->drc_featureflags); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } @@ -1289,14 +1324,251 @@ save_resume_state(struct receive_writer_arg *rwa, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } +static int +receive_object_is_same_generation(objset_t *os, uint64_t object, + dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, + const void *new_bonus, boolean_t *samegenp) +{ + zfs_file_info_t zoi; + int err; + + dmu_buf_t *old_bonus_dbuf; + err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); + if (err != 0) + return (err); + err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, + &zoi); + dmu_buf_rele(old_bonus_dbuf, FTAG); + if (err != 0) + return (err); + uint64_t old_gen = zoi.zfi_generation; + + err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); + if (err != 0) + return (err); + uint64_t new_gen = zoi.zfi_generation; + + *samegenp = (old_gen == new_gen); + return (0); +} + +static int +receive_handle_existing_object(const struct receive_writer_arg *rwa, + const struct drr_object *drro, const dmu_object_info_t *doi, + const void *bonus_data, + uint64_t *object_to_hold, uint32_t *new_blksz) +{ + uint32_t indblksz = drro->drr_indblkshift ? + 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + uint8_t dn_slots = drro->drr_dn_slots != 0 ? + drro->drr_dn_slots : DNODE_MIN_SLOTS; + boolean_t do_free_range = B_FALSE; + int err; + + *object_to_hold = drro->drr_object; + + /* nblkptr should be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + + /* + * After the previous send stream, the sending system may + * have freed this object, and then happened to re-allocate + * this object number in a later txg. In this case, we are + * receiving a different logical file, and the block size may + * appear to be different. i.e. we may have a different + * block size for this object than what the send stream says. + * In this case we need to remove the object's contents, + * so that its structure can be changed and then its contents + * entirely replaced by subsequent WRITE records. + * + * If this is a -L (--large-block) incremental stream, and + * the previous stream was not -L, the block size may appear + * to increase. i.e. we may have a smaller block size for + * this object than what the send stream says. In this case + * we need to keep the object's contents and block size + * intact, so that we don't lose parts of the object's + * contents that are not changed by this incremental send + * stream. + * + * We can distinguish between the two above cases by using + * the ZPL's generation number (see + * receive_object_is_same_generation()). However, we only + * want to rely on the generation number when absolutely + * necessary, because with raw receives, the generation is + * encrypted. We also want to minimize dependence on the + * ZPL, so that other types of datasets can also be received + * (e.g. ZVOLs, although note that ZVOLS currently do not + * reallocate their objects or change their structure). + * Therefore, we check a number of different cases where we + * know it is safe to discard the object's contents, before + * using the ZPL's generation number to make the above + * distinction. + */ + if (drro->drr_blksz != doi->doi_data_block_size) { + if (rwa->raw) { + /* + * RAW streams always have large blocks, so + * we are sure that the data is not needed + * due to changing --large-block to be on. + * Which is fortunate since the bonus buffer + * (which contains the ZPL generation) is + * encrypted, and the key might not be + * loaded. + */ + do_free_range = B_TRUE; + } else if (rwa->full) { + /* + * This is a full send stream, so it always + * replaces what we have. Even if the + * generation numbers happen to match, this + * can not actually be the same logical file. + * This is relevant when receiving a full + * send as a clone. + */ + do_free_range = B_TRUE; + } else if (drro->drr_type != + DMU_OT_PLAIN_FILE_CONTENTS || + doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { + /* + * PLAIN_FILE_CONTENTS are the only type of + * objects that have ever been stored with + * large blocks, so we don't need the special + * logic below. ZAP blocks can shrink (when + * there's only one block), so we don't want + * to hit the error below about block size + * only increasing. + */ + do_free_range = B_TRUE; + } else if (doi->doi_max_offset <= + doi->doi_data_block_size) { + /* + * There is only one block. We can free it, + * because its contents will be replaced by a + * WRITE record. This can not be the no-L -> + * -L case, because the no-L case would have + * resulted in multiple blocks. If we + * supported -L -> no-L, it would not be safe + * to free the file's contents. Fortunately, + * that is not allowed (see + * recv_check_large_blocks()). + */ + do_free_range = B_TRUE; + } else { + boolean_t is_same_gen; + err = receive_object_is_same_generation(rwa->os, + drro->drr_object, doi->doi_bonus_type, + drro->drr_bonustype, bonus_data, &is_same_gen); + if (err != 0) + return (SET_ERROR(EINVAL)); + + if (is_same_gen) { + /* + * This is the same logical file, and + * the block size must be increasing. + * It could only decrease if + * --large-block was changed to be + * off, which is checked in + * recv_check_large_blocks(). + */ + if (drro->drr_blksz <= + doi->doi_data_block_size) + return (SET_ERROR(EINVAL)); + /* + * We keep the existing blocksize and + * contents. + */ + *new_blksz = + doi->doi_data_block_size; + } else { + do_free_range = B_TRUE; + } + } + } + + /* nblkptr can only decrease if the object was reallocated */ + if (nblkptr < doi->doi_nblkptr) + do_free_range = B_TRUE; + + /* number of slots can only change on reallocation */ + if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) + do_free_range = B_TRUE; + + /* + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ + if (rwa->raw) { + if (indblksz != doi->doi_metadata_block_size) + do_free_range = B_TRUE; + if (drro->drr_nlevels < doi->doi_indirection) + do_free_range = B_TRUE; + } + + if (do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The dmu does not currently support decreasing nlevels + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. + */ + if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + *object_to_hold = DMU_NEW_OBJECT; + } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. We will never free the first block + * of the object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. This + * free may be skipped when dmu_free_long_range() was called + * above since it covers the entire object's contents. + */ + if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + (drro->drr_maxblkid + 1) * doi->doi_data_block_size, + DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + return (0); +} + noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; - uint64_t object; int err; + uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; @@ -1360,86 +1632,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ + uint64_t object_to_hold; if (err == 0) { - uint32_t indblksz = drro->drr_indblkshift ? - 1ULL << drro->drr_indblkshift : 0; - int nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - boolean_t did_free = B_FALSE; - - object = drro->drr_object; - - /* nblkptr should be bounded by the bonus size and type */ - if (rwa->raw && nblkptr != drro->drr_nblkptr) - return (SET_ERROR(EINVAL)); - - /* - * Check for indicators that the object was freed and - * reallocated. For all sends, these indicators are: - * - A changed block size - * - A smaller nblkptr - * - A changed dnode size - * For raw sends we also check a few other fields to - * ensure we are preserving the objset structure exactly - * as it was on the receive side: - * - A changed indirect block size - * - A smaller nlevels - */ - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || - (rwa->raw && - (indblksz != doi.doi_metadata_block_size || - drro->drr_nlevels < doi.doi_indirection))) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - else - did_free = B_TRUE; - } - - /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. - */ - if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_object(rwa->os, drro->drr_object); - if (err != 0) - return (SET_ERROR(EINVAL)); - - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = DMU_NEW_OBJECT; - } - - /* - * For raw receives, free everything beyond the new incoming - * maxblkid. Normally this would be done with a DRR_FREE - * record that would come after this DRR_OBJECT record is - * processed. However, for raw receives we manually set the - * maxblkid from the drr_maxblkid and so we must first free - * everything above that blkid to ensure the DMU is always - * consistent with itself. We will never free the first block - * of the object here because a maxblkid of 0 could indicate - * an object with a single block or one with no blocks. This - * free may be skipped when dmu_free_long_range() was called - * above since it covers the entire object's contents. - */ - if (rwa->raw && object != DMU_NEW_OBJECT && !did_free) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - (drro->drr_maxblkid + 1) * doi.doi_data_block_size, - DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } + err = receive_handle_existing_object(rwa, drro, &doi, data, + &object_to_hold, &new_blksz); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1454,10 +1650,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } else { /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } /* @@ -1492,27 +1688,27 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_write(tx, object, 0, 0); + dmu_tx_hold_bonus(tx, object_to_hold); + dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - if (object == DMU_NEW_OBJECT) { + if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || + new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); @@ -1578,6 +1774,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. */ + ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, @@ -1707,6 +1904,40 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) DRR_WRITE_PAYLOAD_SIZE(drrw)); } + /* + * If we are receiving an incremental large-block stream into + * a dataset that previously did a non-large-block receive, + * the WRITE record may be larger than the object's block + * size. dmu_assign_arcbuf_by_dnode() handles this as long + * as the arcbuf is not compressed, so decompress it here if + * necessary. + */ + if (drrw->drr_logical_size != dn->dn_datablksz && + arc_get_compression(abuf) != ZIO_COMPRESS_OFF) { + ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(rwa->os), + .zb_object = rwa->last_object, + .zb_level = 0, + .zb_blkid = + drrw->drr_offset >> dn->dn_datablkshift, + }; + + /* + * The size of loaned arc bufs is counted in + * arc_loaned_bytes. When we untransform + * (decompress) the buf, its size increases. To + * ensure that arc_loaned_bytes remains accurate, we + * need to return (un-loan) the buf (with its + * compressed size) and then re-loan it (with its + * new, uncompressed size). + */ + arc_return_buf(abuf, FTAG); + VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os), + &zb, B_FALSE)); + arc_loan_inuse_buf(abuf, FTAG); + } + err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); if (err != 0) { @@ -2710,6 +2941,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; + rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c index 6c83f79c97..e61db5c7ab 100644 --- a/module/zfs/zfs_quota.c +++ b/module/zfs/zfs_quota.c @@ -39,21 +39,17 @@ #include int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) +zpl_get_file_info(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) { - sa_hdr_phys_t sa; - sa_hdr_phys_t *sap = data; - uint64_t flags; - int hdrsize; - boolean_t swap = B_FALSE; - /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (SET_ERROR(ENOENT)); + zoi->zfi_project = ZFS_DEFAULT_PROJID; + /* * If we have a NULL data pointer * then assume the id's aren't changing and @@ -64,52 +60,55 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, return (SET_ERROR(EEXIST)); if (bonustype == DMU_OT_ZNODE) { - znode_phys_t *znp = data; - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - *projectp = ZFS_DEFAULT_PROJID; + const znode_phys_t *znp = data; + zoi->zfi_user = znp->zp_uid; + zoi->zfi_group = znp->zp_gid; + zoi->zfi_generation = znp->zp_gen; return (0); } + const sa_hdr_phys_t *sap = data; if (sap->sa_magic == 0) { /* * This should only happen for newly created files * that haven't had the znode data filled in yet. */ - *userp = 0; - *groupp = 0; - *projectp = ZFS_DEFAULT_PROJID; + zoi->zfi_user = 0; + zoi->zfi_group = 0; + zoi->zfi_generation = 0; return (0); } - sa = *sap; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; - } else { - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); - hdrsize = sa_hdrsize(&sa); + int hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); - flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); + uintptr_t data_after_hdr = (uintptr_t)data + hdrsize; + zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET)); + zoi->zfi_group = *((uint64_t *)(data_after_hdr + SA_GID_OFFSET)); + zoi->zfi_generation = *((uint64_t *)(data_after_hdr + SA_GEN_OFFSET)); + uint64_t flags = *((uint64_t *)(data_after_hdr + SA_FLAGS_OFFSET)); if (swap) flags = BSWAP_64(flags); - if (flags & ZFS_PROJID) - *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_PROJID_OFFSET)); - else - *projectp = ZFS_DEFAULT_PROJID; + if (flags & ZFS_PROJID) { + zoi->zfi_project = + *((uint64_t *)(data_after_hdr + SA_PROJID_OFFSET)); + } if (swap) { - *userp = BSWAP_64(*userp); - *groupp = BSWAP_64(*groupp); - *projectp = BSWAP_64(*projectp); + zoi->zfi_user = BSWAP_64(zoi->zfi_user); + zoi->zfi_group = BSWAP_64(zoi->zfi_group); + zoi->zfi_project = BSWAP_64(zoi->zfi_project); + zoi->zfi_generation = BSWAP_64(zoi->zfi_generation); } return (0); } @@ -468,7 +467,7 @@ zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) zfs_id_overobjquota(zfsvfs, usedobj, id)); } -EXPORT_SYMBOL(zfs_space_delta_cb); +EXPORT_SYMBOL(zpl_get_file_info); EXPORT_SYMBOL(zfs_userspace_one); EXPORT_SYMBOL(zfs_userspace_many); EXPORT_SYMBOL(zfs_set_userquota); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index cbad90ad14..d8c109eb7d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -781,7 +781,7 @@ tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', - 'send-c_recv_dedup', 'send_encrypted_hierarchy', + 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am index 7728a6481d..ab3a1c6c3c 100644 --- a/tests/zfs-tests/tests/functional/rsend/Makefile.am +++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am @@ -41,6 +41,7 @@ dist_pkgdata_SCRIPTS = \ send-c_volume.ksh \ send-c_zstreamdump.ksh \ send-cpL_varied_recsize.ksh \ + send-L_toggle.ksh \ send_freeobjects.ksh \ send_partial_dataset.ksh \ send_realloc_dnode_size.ksh \ diff --git a/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh b/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh new file mode 100755 index 0000000000..483efcc605 --- /dev/null +++ b/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh @@ -0,0 +1,65 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/tests/functional/rsend/rsend.kshlib + +# +# Description: +# Verify that send -L can be changed to on in an incremental. +# Verify that send -L can not be turned off in an incremental. +# + +function cleanup +{ + log_must_busy zfs destroy -r $TESTPOOL/fs + log_must_busy zfs destroy -r $TESTPOOL/recv +} + +verify_runnable "both" + +log_assert "Verify toggling send -L works as expected" +log_onexit cleanup + +log_must zfs create -o compression=on -o recordsize=1m $TESTPOOL/fs + +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file bs=1024 count=1500 + +log_must zfs snapshot $TESTPOOL/fs@snap + +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file bs=1024 count=1500 conv=notrunc seek=2048 + +log_must zfs snapshot $TESTPOOL/fs@snap2 + +log_must zfs create $TESTPOOL/recv + +log_must zfs send -c $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/noL-noL +log_must zfs send -c -i @snap $TESTPOOL/fs@snap2| zfs recv $TESTPOOL/recv/noL-noL +log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/noL-noL/file + +log_must zfs send -c -L $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/L-L +log_must zfs send -c -L -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/L-L +log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/L-L/file + +log_must zfs send -c $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/noL-L +log_must zfs send -c -L -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/noL-L +log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/noL-L/file + +log_must zfs send -c -L $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/L-noL +log_mustnot zfs send -c -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/L-noL +log_must diff /$TESTPOOL/fs/.zfs/snapshot/snap/file /$TESTPOOL/recv/L-noL/file + +log_pass "Verify toggling send -L works as expected"