diff --git a/META b/META index 5f834d5cc7..f17be8f36e 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.2.0 -Release: rc1 +Release: rc2 Release-Tags: relext License: CDDL Author: OpenZFS diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index f83ae09259..a0e377a4a0 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -416,6 +416,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) return; + if (vdev_guid == 0) { + fmd_hdl_debug(hdl, "Got a zero GUID"); + return; + } + if (spare) { int nspares = find_and_remove_spares(zhdl, vdev_guid); fmd_hdl_debug(hdl, "%d spares removed", nspares); diff --git a/config/Shellcheck.am b/config/Shellcheck.am index 1cff81e04b..1ab1351606 100644 --- a/config/Shellcheck.am +++ b/config/Shellcheck.am @@ -4,6 +4,7 @@ # Not following: a was not specified as input (see shellcheck -x). [SC1091] # Prefer putting braces around variable references even when not strictly required. [SC2250] # Consider invoking this command separately to avoid masking its return value (or use '|| true' to ignore). [SC2312] +# Command appears to be unreachable. Check usage (or ignore if invoked indirectly). [SC2317] # In POSIX sh, 'local' is undefined. [SC2039] # older ShellCheck versions # In POSIX sh, 'local' is undefined. [SC3043] # newer ShellCheck versions @@ -18,7 +19,7 @@ PHONY += shellcheck _STGT = $(subst ^,/,$(subst shellcheck-here-,,$@)) shellcheck-here-%: if HAVE_SHELLCHECK - shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" + shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" else @echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed" endif diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 28e5364581..887acee670 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -103,6 +103,33 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ ]) ]) +dnl # +dnl # 6.5.x API change +dnl # disk_check_media_change() was added +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ + ZFS_LINUX_TEST_SRC([disk_check_media_change], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + bool error; + + error = disk_check_media_change(bdev->bd_disk); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ + AC_MSG_CHECKING([whether disk_check_media_change() exists]) + ZFS_LINUX_TEST_RESULT([disk_check_media_change], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1, + [disk_check_media_change() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # bdev_kobj() is introduced from 5.12 dnl # @@ -443,6 +470,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ ]) ]) +dnl # +dnl # 6.5.x API change +dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [ + ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [ + #include + ],[ + blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ + AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined]) + ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [ + AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ]) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT @@ -458,6 +508,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV + ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -476,4 +528,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV + ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT ]) diff --git a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in index 68fdcb1f32..12d8ac703e 100644 --- a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in @@ -2,7 +2,7 @@ Description=Rollback bootfs just before it is mounted Requisite=zfs-import.target After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service -Before=dracut-mount.service +Before=dracut-mount.service sysroot.mount DefaultDependencies=no ConditionKernelCommandLine=bootfs.rollback ConditionEnvironment=BOOTFS diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h index ab1727dca0..0779e58e49 100644 --- a/include/os/freebsd/spl/sys/vnode.h +++ b/include/os/freebsd/spl/sys/vnode.h @@ -36,7 +36,11 @@ struct xucred; typedef struct flock flock64_t; typedef struct vnode vnode_t; typedef struct vattr vattr_t; +#if __FreeBSD_version < 1400093 typedef enum vtype vtype_t; +#else +#define vtype_t __enum_uint8(vtype) +#endif #include #include diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index c5c6385be6..e0f20ba320 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -181,7 +181,11 @@ bi_status_to_errno(blk_status_t status) return (ENOLINK); case BLK_STS_TARGET: return (EREMOTEIO); +#ifdef HAVE_BLK_STS_RESV_CONFLICT + case BLK_STS_RESV_CONFLICT: +#else case BLK_STS_NEXUS: +#endif return (EBADE); case BLK_STS_MEDIUM: return (ENODATA); @@ -215,7 +219,11 @@ errno_to_bi_status(int error) case EREMOTEIO: return (BLK_STS_TARGET); case EBADE: +#ifdef HAVE_BLK_STS_RESV_CONFLICT + return (BLK_STS_RESV_CONFLICT); +#else return (BLK_STS_NEXUS); +#endif case ENODATA: return (BLK_STS_MEDIUM); case EILSEQ: @@ -337,6 +345,8 @@ zfs_check_media_change(struct block_device *bdev) return (0); } #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) +#elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE) +#define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk) #else /* * This is encountered if check_disk_change() and bdev_check_media_change() diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index a7666187ec..d89a91c36f 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -38,7 +38,7 @@ typedef unsigned long ulong_t; typedef unsigned long long u_longlong_t; typedef long long longlong_t; -typedef unsigned long intptr_t; +typedef long intptr_t; typedef unsigned long long rlim64_t; typedef struct task_struct kthread_t; diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index f3384f5264..81bc0fe210 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -60,7 +60,7 @@ typedef struct bpobj { kmutex_t bpo_lock; objset_t *bpo_os; uint64_t bpo_object; - int bpo_epb; + uint32_t bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; uint8_t bpo_havefreed; diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h index 0fbc3bacff..f00e13cf03 100644 --- a/include/sys/dmu_zfetch.h +++ b/include/sys/dmu_zfetch.h @@ -36,8 +36,6 @@ extern "C" { #endif -extern uint64_t zfetch_array_rd_sz; - struct dnode; /* so we can reference dnode */ typedef struct zfetch { diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index b9bac7e252..fb9e864922 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -102,8 +102,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" -#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" @@ -112,8 +110,6 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" -#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name" #define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name" #define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name" diff --git a/include/sys/spa.h b/include/sys/spa.h index 1fa2044008..b908556874 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -723,16 +723,10 @@ typedef enum spa_mode { * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes - * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, -#ifdef IN_FREEBSD_BASE - SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, -#else - SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, -#endif } spa_autotrim_t; /* diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 44afa76328..588c72f6e4 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -250,6 +250,7 @@ struct spa { uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 2b22b973ba..5f4e82ad86 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -420,6 +420,7 @@ struct vdev { boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ + boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 9fb79ab4a5..37fd65b7cb 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -94,8 +94,6 @@ typedef const struct zio_checksum_info { } zio_checksum_info_t; typedef struct zio_bad_cksum { - zio_cksum_t zbc_expected; - zio_cksum_t zbc_actual; const char *zbc_checksum_name; uint8_t zbc_byteswapped; uint8_t zbc_injected; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 271b02b6ee..3843419731 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -15,7 +15,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd January 10, 2023 +.Dd July 21, 2023 .Dt ZFS 4 .Os . @@ -239,6 +239,11 @@ relative to the pool. Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . +.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint +For blocks that could be forced to be a gang block (due to +.Sy metaslab_force_ganging ) , +force this many of them to be gang blocks. +. .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created @@ -519,9 +524,6 @@ However, this is limited by Maximum micro ZAP size. A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size. . -.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 -If prefetching is enabled, disable prefetching for reads larger than this size. -. .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 341f902fe6..e1436f6ded 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd July 11, 2023 .Dt ZPOOL-EVENTS 8 .Os . @@ -305,10 +305,6 @@ The time when a given I/O request was submitted. The time required to service a given I/O request. .It Sy prev_state The previous state of the vdev. -.It Sy cksum_expected -The expected checksum value for the block. -.It Sy cksum_actual -The actual checksum value for an errant block. .It Sy cksum_algorithm Checksum algorithm used. See @@ -362,23 +358,6 @@ Like but contains .Pq Ar good data No & ~( Ns Ar bad data ) ; that is, the bits set in the good data which are cleared in the bad data. -.It Sy bad_set_histogram -If this field exists, it is an array of counters. -Each entry counts bits set in a particular bit of a big-endian uint64 type. -The first entry counts bits -set in the high-order bit of the first byte, the 9th byte, etc, and the last -entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc. -This information is useful for observing a stuck bit in a parallel data path, -such as IDE or parallel SCSI. -.It Sy bad_cleared_histogram -If this field exists, it is an array of counters. -Each entry counts bit clears in a particular bit of a big-endian uint64 type. -The first entry counts bits -clears of the high-order bit of the first byte, the 9th byte, etc, and the -last entry counts clears of the low-order bit of the 8th byte, the 16th byte, -etc. -This information is useful for observing a stuck bit in a parallel data -path, such as IDE or parallel SCSI. .El . .Sh I/O STAGES diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index d29f00a0cb..7692200ab2 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6263,7 +6263,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) goto bad_write_fallback; } } else { -#if __FreeBSD_version >= 1400086 +#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \ + __FreeBSD_version >= 1400086 vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false, LK_EXCLUSIVE); #else diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 94184732f9..f0075a52f4 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1680,6 +1680,7 @@ zfs_umount(struct super_block *sb) } zfsvfs_free(zfsvfs); + sb->s_fs_info = NULL; return (0); } diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index 8bdc20eaa0..a375e6b98c 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -280,8 +280,6 @@ zpl_test_super(struct super_block *s, void *data) { zfsvfs_t *zfsvfs = s->s_fs_info; objset_t *os = data; - int match; - /* * If the os doesn't match the z_os in the super_block, assume it is * not a match. Matching would imply a multimount of a dataset. It is @@ -289,19 +287,7 @@ zpl_test_super(struct super_block *s, void *data) * that changes the z_os, e.g., rollback, where the match will be * missed, but in that case the user will get an EBUSY. */ - if (zfsvfs == NULL || os != zfsvfs->z_os) - return (0); - - /* - * If they do match, recheck with the lock held to prevent mounting the - * wrong dataset since z_os can be stale when the teardown lock is held. - */ - if (zpl_enter(zfsvfs, FTAG) != 0) - return (0); - match = (os == zfsvfs->z_os); - zpl_exit(zfsvfs, FTAG); - - return (match); + return (zfsvfs != NULL && os == zfsvfs->z_os); } static struct super_block * @@ -327,12 +313,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); + /* + * Recheck with the lock held to prevent mounting the wrong dataset + * since z_os can be stale when the teardown lock is held. + * + * We can't do this in zpl_test_super in since it's under spinlock and + * also s_umount lock is not held there so it would race with + * zfs_umount and zfsvfs can be freed. + */ + if (!IS_ERR(s) && s->s_fs_info != NULL) { + zfsvfs_t *zfsvfs = s->s_fs_info; + if (zpl_enter(zfsvfs, FTAG) == 0) { + if (os != zfsvfs->z_os) + err = -SET_ERROR(EBUSY); + zpl_exit(zfsvfs, FTAG); + } else { + err = -SET_ERROR(EBUSY); + } + } dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); if (IS_ERR(s)) return (ERR_CAST(s)); + if (err) { + deactivate_locked_super(s); + return (ERR_PTR(err)); + } + if (s->s_root == NULL) { err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0); if (err) { diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 459ff62fc9..c4aca04a96 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -160,7 +160,7 @@ zpool_prop_init(void) "wait | continue | panic", "FAILMODE", failuremode_table, sfeatures); zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim", - SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL, + SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "AUTOTRIM", boolean_table, sfeatures); /* hidden properties */ diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 211bab5651..e772caead2 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; - for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; + uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) * + sizeof (blkptr_t); + uint64_t ps = start * sizeof (blkptr_t); + uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0, + ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb, + ZIO_PRIORITY_ASYNC_READ); + } + for (; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); @@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, - offset, FTAG, &dbuf, 0); + offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH); if (err) break; + pe = pb; + pb = MAX((dbuf->db_offset > dmu_prefetch_max) ? + dbuf->db_offset - dmu_prefetch_max : 0, ps); + if (pe > pb) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + pb, pe - pb, ZIO_PRIORITY_ASYNC_READ); + } } ASSERT3U(offset, >=, dbuf->db_offset); @@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); - uint64_t obj_from_sublist; + uint64_t subobj; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - offset, sizeof (uint64_t), &obj_from_sublist, - DMU_READ_PREFETCH); + offset, sizeof (uint64_t), &subobj, + DMU_READ_NO_PREFETCH); if (err) break; - bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), + + bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t), KM_SLEEP); - - err = bpobj_open(sublist, bpo->bpo_os, - obj_from_sublist); - if (err) + err = bpobj_open(subbpo, bpo->bpo_os, subobj); + if (err) { + kmem_free(subbpo, sizeof (bpobj_t)); break; + } - list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); - mutex_enter(&sublist->bpo_lock); + if (subbpo->bpo_havesubobj && + subbpo->bpo_phys->bpo_subobjs != 0) { + dmu_prefetch(subbpo->bpo_os, + subbpo->bpo_phys->bpo_subobjs, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); + } + + list_insert_head(&stack, bpi_alloc(subbpo, bpi, i)); + mutex_enter(&subbpo->bpo_lock); bpi->bpi_unprocessed_subobjs--; } } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index dda869287c..3a4560cec2 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -89,7 +89,11 @@ static int zfs_dmu_offset_next_sync = 1; * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ +#ifdef _ILP32 +uint_t dmu_prefetch_max = 8 * 1024 * 1024; +#else uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +#endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, @@ -552,8 +556,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); - if ((flags & DMU_READ_NO_PREFETCH) == 0 && - length <= zfetch_array_rd_sz) { + if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2fdd7c1ece..05ca91717c 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1795,17 +1795,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa, } /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. + * The dmu does not currently support decreasing nlevels or changing + * indirect block size if there is already one, same as changing the + * number of of dnode slots on an object. For non-raw sends this + * does not matter and the new object can just use the previous one's + * parameters. For raw sends, however, the structure of the received + * dnode (including indirects and dnode slots) must match that of the + * send side. Therefore, instead of using dmu_object_reclaim(), we + * must free the object completely and call dmu_object_claim_dnsize() + * instead. */ - if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + if ((rwa->raw && ((doi->doi_indirection > 1 && + indblksz != doi->doi_metadata_block_size) || + drro->drr_nlevels < doi->doi_indirection)) || dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { err = dmu_free_long_object(rwa->os, drro->drr_object); if (err != 0) diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index b70459380c..d0acaf5020 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -52,14 +52,19 @@ static unsigned int zfetch_max_streams = 8; static unsigned int zfetch_min_sec_reap = 1; /* max time before stream delete */ static unsigned int zfetch_max_sec_reap = 2; +#ifdef _ILP32 +/* min bytes to prefetch per stream (default 2MB) */ +static unsigned int zfetch_min_distance = 2 * 1024 * 1024; +/* max bytes to prefetch per stream (default 8MB) */ +unsigned int zfetch_max_distance = 8 * 1024 * 1024; +#else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ unsigned int zfetch_max_distance = 64 * 1024 * 1024; +#endif /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; -/* max number of bytes in an array_read in which we allow prefetching (1MB) */ -uint64_t zfetch_array_rd_sz = 1024 * 1024; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; @@ -580,6 +585,3 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); - -ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW, - "Number of bytes in a array_read"); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index d15268cd7b..7cf03264dc 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1882,7 +1882,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs == dn->dn_indblkshift) ibs = 0; - if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + if (size == dn->dn_datablksz && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); @@ -1905,24 +1905,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) if (ibs && dn->dn_nlevels != 1) goto fail; - /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) { - dbuf_new_size(db, size, tx); - } else if (err != ENOENT) { - goto fail; - } - - dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (size != dn->dn_datablksz) { + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) { + dbuf_new_size(db, size, tx); + } else if (err != ENOENT) { + goto fail; + } + + dnode_setdblksz(dn, size); + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size; + if (db) + dbuf_rele(db, FTAG); + } if (ibs) { dn->dn_indblkshift = ibs; - dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; } - /* release after we have fixed the blocksize in the dnode */ - if (db) - dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 181efd8ed6..47c234f76c 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -892,9 +892,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) for (zap_cursor_init(&zc, dl->dl_os, obj); (error = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { - uint64_t mintxg = zfs_strtonum(za->za_name, NULL); - dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx); - VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + dsl_deadlist_insert_bpobj(dl, za->za_first_integer, + zfs_strtonum(za->za_name, NULL), tx); + VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx)); if (perror == 0) { dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, zfs_strtonum(pza->za_name, NULL)); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 50428bff3e..34012db82d 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2015,6 +2015,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_projectused_dnode, zb->zb_objset, + DMU_PROJECTUSED_OBJECT); + } dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); @@ -2075,10 +2080,16 @@ dsl_scan_prefetch_thread(void *arg) zio_flags |= ZIO_FLAG_RAW; } + /* We don't need data L1 buffer since we do not prefetch L0. */ + blkptr_t *bp = &spic->spic_bp; + if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ARC_FLAG_NO_BUF; + /* issue the prefetch asynchronously */ - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, - &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + (void) arc_read(scn->scn_zio_root, spa, bp, + dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, + zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 176247d63b..9991e1a22c 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024; */ uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; +/* + * Of blocks of size >= metaslab_force_ganging, actually gang them this often. + */ +uint_t metaslab_force_ganging_pct = 3; + /* * In pools where the log space map feature is not enabled we touch * multiple metaslabs (and their respective space maps) with each @@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * damage can result in extremely long reconstruction times. This * will also test spilling from special to normal. */ - if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) { + if (psize >= metaslab_force_ganging && + metaslab_force_ganging_pct > 0 && + (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) { metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, allocator); return (SET_ERROR(ENOSPC)); @@ -6266,7 +6273,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, - "Blocks larger than this size are forced to be gang blocks"); + "Blocks larger than this size are sometimes forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW, + "Percentage of large blocks that will be forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, "Max distance (bytes) to search forward before using size tree"); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 06f6407690..3b355e0deb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -772,6 +772,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 30551feb63..f3812b843e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -889,9 +889,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_not_present); /* - * Get the alignment requirement. + * Get the alignment requirement. Ignore pool ashift for vdev + * attach case. */ - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + if (alloctype != VDEV_ALLOC_ATTACH) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, + &vd->vdev_ashift); + } else { + vd->vdev_attaching = B_TRUE; + } /* * Retrieve the vdev creation time. @@ -1393,6 +1399,36 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } +/* + * Choose GCD for spa_gcd_alloc. + */ +static uint64_t +vdev_gcd(uint64_t a, uint64_t b) +{ + while (b != 0) { + uint64_t t = b; + b = a % b; + a = t; + } + return (a); +} + +/* + * Set spa_min_alloc and spa_gcd_alloc. + */ +static void +vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) +{ + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + if (spa->spa_gcd_alloc == INT_MAX) { + spa->spa_gcd_alloc = min_alloc; + } else { + spa->spa_gcd_alloc = vdev_gcd(min_alloc, + spa->spa_gcd_alloc); + } +} + void vdev_metaslab_group_create(vdev_t *vd) { @@ -1445,8 +1481,7 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } } } @@ -2144,9 +2179,9 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EDOM)); } - if (vd->vdev_top == vd) { + if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); - } + vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { @@ -2207,8 +2242,7 @@ vdev_open(vdev_t *vd) if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } /* @@ -5688,6 +5722,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; + uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); @@ -5698,31 +5733,28 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) if (vd == NULL) return; + /* + * Set vdev property values in the vdev props mos object. + */ + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { + objid = vd->vdev_top_zap; + } else if (vd->vdev_leaf_zap != 0) { + objid = vd->vdev_leaf_zap; + } else { + panic("unexpected vdev type"); + } + mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { - uint64_t intval, objid = 0; + uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; - /* - * Set vdev property values in the vdev props mos object. - */ - if (vd->vdev_root_zap != 0) { - objid = vd->vdev_root_zap; - } else if (vd->vdev_top_zap != 0) { - objid = vd->vdev_top_zap; - } else if (vd->vdev_leaf_zap != 0) { - objid = vd->vdev_leaf_zap; - } else { - /* - * XXX: implement vdev_props_set_check() - */ - panic("vdev not root/top/leaf"); - } - switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { @@ -5791,6 +5823,12 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ASSERT(vd != NULL); + /* Check that vdev has a zap we can use */ + if (vd->vdev_root_zap == 0 && + vd->vdev_top_zap == 0 && + vd->vdev_leaf_zap == 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 8966758534..acb7256966 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1398,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio, vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zio_bad_cksum_t zbc = {{{ 0 }}}; + zio_bad_cksum_t zbc = { 0 }; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 14b98a76b8..3445fa9d35 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1785,7 +1785,7 @@ vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) static int raidz_checksum_verify(zio_t *zio) { - zio_bad_cksum_t zbc = {{{0}}}; + zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; int ret = zio_checksum_error(zio, &zbc); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index c42ef048dd..c4eb74e873 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -754,10 +754,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, #define MAX_RANGES 16 typedef struct zfs_ecksum_info { - /* histograms of set and cleared bits by bit number in a 64-bit word */ - uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; - /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; @@ -781,7 +777,7 @@ typedef struct zfs_ecksum_info { } zfs_ecksum_info_t; static void -update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) +update_bad_bits(uint64_t value_arg, uint32_t *count) { size_t i; size_t bits = 0; @@ -789,10 +785,8 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { - if (value & (1ull << i)) { - hist[63 - i]++; + if (value & (1ull << i)) ++bits; - } } /* update the count of bits changed */ *count += bits; @@ -920,14 +914,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_expected) / sizeof (uint64_t), - (uint64_t *)&info->zbc_expected, - FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, - DATA_TYPE_UINT64_ARRAY, - sizeof (info->zbc_actual) / sizeof (uint64_t), - (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, @@ -1010,10 +996,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, offset++; } - update_histogram(set, eip->zei_histogram_set, - &eip->zei_range_sets[range]); - update_histogram(cleared, eip->zei_histogram_cleared, - &eip->zei_range_clears[range]); + update_bad_bits(set, &eip->zei_range_sets[range]); + update_bad_bits(cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ @@ -1049,15 +1033,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); - } else { - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT8_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_set, - FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT8_ARRAY, - NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, - NULL); } return (eip); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 10279fde89..3f5e6a08d8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1596,6 +1596,19 @@ zio_shrink(zio_t *zio, uint64_t size) } } +/* + * Round provided allocation size up to a value that can be allocated + * by at least some vdev(s) in the pool with minimum or no additional + * padding and without extra space usage on others + */ +static uint64_t +zio_roundup_alloc_size(spa_t *spa, uint64_t size) +{ + if (size > spa->spa_min_alloc) + return (roundup(size, spa->spa_gcd_alloc)); + return (spa->spa_min_alloc); +} + /* * ========================================================================== * Prepare to read and write logical blocks @@ -1802,9 +1815,8 @@ zio_write_compress(zio_t *zio) * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)roundup(psize, - spa->spa_min_alloc); + size_t rounded = (size_t)zio_roundup_alloc_size(spa, + psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1847,8 +1859,8 @@ zio_write_compress(zio_t *zio) * take this codepath because it will change the on-disk block * and decryption will fail. */ - size_t rounded = MIN((size_t)roundup(psize, - spa->spa_min_alloc), lsize); + size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), + lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 6090959c5b..9de515e876 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -515,8 +515,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, } if (info != NULL) { - info->zbc_expected = expected_cksum; - info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh index 6ccec6abd6..574cb7654d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh @@ -35,7 +35,7 @@ # # STRATEGY: # 1. Create various pools with different ashift values. -# 2. Verify 'attach -o ashift=' works only with allowed values. +# 2. Verify 'attach' works. # verify_runnable "global" @@ -66,26 +66,14 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do - for cmdval in ${ashifts[@]} - do - log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 - log_must verify_ashift $disk1 $ashift - - # ashift_of(attached_disk) <= ashift_of(existing_vdev) - if [[ $cmdval -le $ashift ]] - then - log_must zpool attach -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - log_must verify_ashift $disk2 $ashift - else - log_mustnot zpool attach -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - fi - # clean things for the next run - log_must zpool destroy $TESTPOOL1 - log_must zpool labelclear $disk1 - log_must zpool labelclear $disk2 - done + log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 + log_must verify_ashift $disk1 $ashift + log_must zpool attach $TESTPOOL1 $disk1 $disk2 + log_must verify_ashift $disk2 $ashift + # clean things for the next run + log_must zpool destroy $TESTPOOL1 + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh index 37ed0062e6..9595e51241 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh @@ -35,7 +35,7 @@ # # STRATEGY: # 1. Create various pools with different ashift values. -# 2. Verify 'replace -o ashift=' works only with allowed values. +# 2. Verify 'replace' works. # verify_runnable "global" @@ -66,26 +66,16 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do - for cmdval in ${ashifts[@]} - do - log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 - log_must verify_ashift $disk1 $ashift - # ashift_of(replacing_disk) <= ashift_of(existing_vdev) - if [[ $cmdval -le $ashift ]] - then - log_must zpool replace -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - log_must verify_ashift $disk2 $ashift - wait_replacing $TESTPOOL1 - else - log_mustnot zpool replace -o ashift=$cmdval $TESTPOOL1 \ - $disk1 $disk2 - fi - # clean things for the next run - log_must zpool destroy $TESTPOOL1 - log_must zpool labelclear $disk1 - log_must zpool labelclear $disk2 - done + log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 + log_must verify_ashift $disk1 $ashift + # ashift_of(replacing_disk) <= ashift_of(existing_vdev) + log_must zpool replace $TESTPOOL1 $disk1 $disk2 + log_must verify_ashift $disk2 $ashift + wait_replacing $TESTPOOL1 + # clean things for the next run + log_must zpool destroy $TESTPOOL1 + log_must zpool labelclear $disk1 + log_must zpool labelclear $disk2 done typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-") diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh index ffdaf91a28..b4ac18e5ea 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh @@ -34,10 +34,8 @@ # # STRATEGY: # 1. Create a pool with default values. -# 2. Verify 'zpool replace' uses the ashift pool property value when -# replacing an existing device. -# 3. Verify the default ashift value can still be overridden by manually -# specifying '-o ashift=' from the command line. +# 2. Override the pool ashift property. +# 3. Verify 'zpool replace' works. # verify_runnable "global" @@ -72,21 +70,9 @@ do do log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1 log_must zpool set ashift=$pprop $TESTPOOL1 - # ashift_of(replacing_disk) <= ashift_of(existing_vdev) - if [[ $pprop -le $ashift ]] - then - log_must zpool replace $TESTPOOL1 $disk1 $disk2 - wait_replacing $TESTPOOL1 - log_must verify_ashift $disk2 $ashift - else - # cannot replace if pool prop ashift > vdev ashift - log_mustnot zpool replace $TESTPOOL1 $disk1 $disk2 - # verify we can override the pool prop value manually - log_must zpool replace -o ashift=$ashift $TESTPOOL1 \ - $disk1 $disk2 - wait_replacing $TESTPOOL1 - log_must verify_ashift $disk2 $ashift - fi + log_must zpool replace $TESTPOOL1 $disk1 $disk2 + wait_replacing $TESTPOOL1 + log_must verify_ashift $disk2 $ashift # clean things for the next run log_must zpool destroy $TESTPOOL1 log_must zpool labelclear $disk1