Tag 2.2.0-rc2

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
shellcheck: disable "unreachable command" check [SC2317]
2023-07-21 16:36:34 -07:00 · 2023-07-21 16:35:12 -07:00 · 2023-07-21 16:35:12 -07:00 · 2023-07-21 16:35:12 -07:00 · 2023-07-21 16:35:12 -07:00 · 2023-07-21 16:35:12 -07:00
39 changed files with 334 additions and 238 deletions
--- a/2
+++ b/2
@ -2,7 +2,7 @@ Meta:          1
 Name:          zfs
 Branch:        1.0
 Version:       2.2.0
-Release:       rc1
+Release:       rc2
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@ -416,6 +416,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
 			return;

+		if (vdev_guid == 0) {
+			fmd_hdl_debug(hdl, "Got a zero GUID");
+			return;
+		}
+
 		if (spare) {
 			int nspares = find_and_remove_spares(zhdl, vdev_guid);
 			fmd_hdl_debug(hdl, "%d spares removed", nspares);
--- a/config/Shellcheck.am
+++ b/config/Shellcheck.am
@ -4,6 +4,7 @@
 # Not following: a was not specified as input (see shellcheck -x). [SC1091]
 # Prefer putting braces around variable references even when not strictly required. [SC2250]
 # Consider invoking this command separately to avoid masking its return value (or use '|| true' to ignore). [SC2312]
+# Command appears to be unreachable. Check usage (or ignore if invoked indirectly). [SC2317]
 # In POSIX sh, 'local' is undefined. [SC2039] # older ShellCheck versions
 # In POSIX sh, 'local' is undefined. [SC3043] # newer ShellCheck versions

@ -18,7 +19,7 @@ PHONY += shellcheck
 _STGT = $(subst ^,/,$(subst shellcheck-here-,,$@))
 shellcheck-here-%:
 if HAVE_SHELLCHECK
-	shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)"
+	shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)"
 else
 	@echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed"
 endif
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@ -103,6 +103,33 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [
 	])
 ])

+dnl #
+dnl # 6.5.x API change
+dnl # disk_check_media_change() was added
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
+	ZFS_LINUX_TEST_SRC([disk_check_media_change], [
+		#include <linux/fs.h>
+		#include <linux/blkdev.h>
+	], [
+		struct block_device *bdev = NULL;
+		bool error;
+
+		error = disk_check_media_change(bdev->bd_disk);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
+	AC_MSG_CHECKING([whether disk_check_media_change() exists])
+	ZFS_LINUX_TEST_RESULT([disk_check_media_change], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1,
+		    [disk_check_media_change() exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # bdev_kobj() is introduced from 5.12
 dnl #
@ -443,6 +470,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
 	])
 ])

+dnl #
+dnl # 6.5.x API change
+dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [
+	ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [
+		#include <linux/blkdev.h>
+	],[
+		blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [
+	AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined])
+		ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [
+			AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined])
+			AC_MSG_RESULT(yes)
+		], [
+			AC_MSG_RESULT(no)
+		])
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
 	ZFS_AC_KERNEL_SRC_BLKDEV_PUT
@ -458,6 +508,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
+	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
 ])

 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
@ -476,4 +528,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
 	ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
 	ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
+	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
+	ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
 ])
--- a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
+++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
@ -2,7 +2,7 @@
 Description=Rollback bootfs just before it is mounted
 Requisite=zfs-import.target
 After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service
-Before=dracut-mount.service
+Before=dracut-mount.service sysroot.mount
 DefaultDependencies=no
 ConditionKernelCommandLine=bootfs.rollback
 ConditionEnvironment=BOOTFS
--- a/include/os/freebsd/spl/sys/vnode.h
+++ b/include/os/freebsd/spl/sys/vnode.h
@ -36,7 +36,11 @@ struct xucred;
 typedef struct flock	flock64_t;
 typedef	struct vnode	vnode_t;
 typedef	struct vattr	vattr_t;
+#if __FreeBSD_version < 1400093
 typedef enum vtype vtype_t;
+#else
+#define	vtype_t __enum_uint8(vtype)
+#endif

 #include <sys/types.h>
 #include <sys/queue.h>
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@ -181,7 +181,11 @@ bi_status_to_errno(blk_status_t status)
 		return (ENOLINK);
 	case BLK_STS_TARGET:
 		return (EREMOTEIO);
+#ifdef HAVE_BLK_STS_RESV_CONFLICT
+	case BLK_STS_RESV_CONFLICT:
+#else
 	case BLK_STS_NEXUS:
+#endif
 		return (EBADE);
 	case BLK_STS_MEDIUM:
 		return (ENODATA);
@ -215,7 +219,11 @@ errno_to_bi_status(int error)
 	case EREMOTEIO:
 		return (BLK_STS_TARGET);
 	case EBADE:
+#ifdef HAVE_BLK_STS_RESV_CONFLICT
+		return (BLK_STS_RESV_CONFLICT);
+#else
 		return (BLK_STS_NEXUS);
+#endif
 	case ENODATA:
 		return (BLK_STS_MEDIUM);
 	case EILSEQ:
@ -337,6 +345,8 @@ zfs_check_media_change(struct block_device *bdev)
 	return (0);
 }
 #define	vdev_bdev_reread_part(bdev)	zfs_check_media_change(bdev)
+#elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
+#define	vdev_bdev_reread_part(bdev)	disk_check_media_change(bdev->bd_disk)
 #else
 /*
 * This is encountered if check_disk_change() and bdev_check_media_change()
--- a/include/os/linux/spl/sys/types.h
+++ b/include/os/linux/spl/sys/types.h
@ -38,7 +38,7 @@ typedef unsigned long		ulong_t;
 typedef unsigned long long	u_longlong_t;
 typedef long long		longlong_t;

-typedef unsigned long		intptr_t;
+typedef long			intptr_t;
 typedef unsigned long long	rlim64_t;

 typedef struct task_struct	kthread_t;
--- a/include/sys/bpobj.h
+++ b/include/sys/bpobj.h
@ -60,7 +60,7 @@ typedef struct bpobj {
 	kmutex_t	bpo_lock;
 	objset_t	*bpo_os;
 	uint64_t	bpo_object;
-	int		bpo_epb;
+	uint32_t	bpo_epb;
 	uint8_t		bpo_havecomp;
 	uint8_t		bpo_havesubobj;
 	uint8_t		bpo_havefreed;
--- a/include/sys/dmu_zfetch.h
+++ b/include/sys/dmu_zfetch.h
@ -36,8 +36,6 @@
 extern "C" {
 #endif

-extern uint64_t	zfetch_array_rd_sz;
-
 struct dnode;				/* so we can reference dnode */

 typedef struct zfetch {
--- a/include/sys/fm/fs/zfs.h
+++ b/include/sys/fm/fs/zfs.h
@ -102,8 +102,6 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP	"zio_timestamp"
 #define	FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA	"zio_delta"
 #define	FM_EREPORT_PAYLOAD_ZFS_PREV_STATE	"prev_state"
-#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED	"cksum_expected"
-#define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL	"cksum_actual"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO	"cksum_algorithm"
 #define	FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP	"cksum_byteswap"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
@ -112,8 +110,6 @@ extern "C" {
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS	"bad_range_clears"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS	"bad_set_bits"
 #define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS	"bad_cleared_bits"
-#define	FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
-#define	FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
 #define	FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME	"snapshot_name"
 #define	FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME	"device_name"
 #define	FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME	"raw_name"
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@ -723,16 +723,10 @@ typedef enum spa_mode {
 * Send TRIM commands in-line during normal pool operation while deleting.
 *	OFF: no
 *	ON: yes
- * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
 */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON,
-#ifdef IN_FREEBSD_BASE
-	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON,
-#else
-	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF,
-#endif
 } spa_autotrim_t;

 /*
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@ -250,6 +250,7 @@ struct spa {
 	uint64_t	spa_min_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_max_ashift;		/* of vdevs in normal class */
 	uint64_t	spa_min_alloc;		/* of vdevs in normal class */
+	uint64_t	spa_gcd_alloc;		/* of vdevs in normal class */
 	uint64_t	spa_config_guid;	/* config pool guid */
 	uint64_t	spa_load_guid;		/* spa_load initialized guid */
 	uint64_t	spa_last_synced_guid;	/* last synced guid */
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@ -420,6 +420,7 @@ struct vdev {
 	boolean_t	vdev_copy_uberblocks;  /* post expand copy uberblocks */
 	boolean_t	vdev_resilver_deferred;  /* resilver deferred */
 	boolean_t	vdev_kobj_flag; /* kobj event record */
+	boolean_t	vdev_attaching; /* vdev attach ashift handling */
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@ -94,8 +94,6 @@ typedef const struct zio_checksum_info {
 } zio_checksum_info_t;

 typedef struct zio_bad_cksum {
-	zio_cksum_t		zbc_expected;
-	zio_cksum_t		zbc_actual;
 	const char		*zbc_checksum_name;
 	uint8_t			zbc_byteswapped;
 	uint8_t			zbc_injected;
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@ -15,7 +15,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.Dd January 10, 2023
+.Dd July 21, 2023
 .Dt ZFS 4
 .Os
 .
@ -239,6 +239,11 @@ relative to the pool.
 Make some blocks above a certain size be gang blocks.
 This option is used by the test suite to facilitate testing.
 .
+.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint
+For blocks that could be forced to be a gang block (due to
+.Sy metaslab_force_ganging ) ,
+force this many of them to be gang blocks.
+.
 .It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP data block size as a power of 2. Note that changing this after
 creating a DDT on the pool will not affect existing DDTs, only newly created
@ -519,9 +524,6 @@ However, this is limited by
 Maximum micro ZAP size.
 A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
 .
-.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
-If prefetching is enabled, disable prefetching for reads larger than this size.
-.
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
 Prefetch distance starts from the demand access size and quickly grows to
--- a/man/man8/zpool-events.8
+++ b/man/man8/zpool-events.8
@ -26,7 +26,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd May 27, 2021
+.Dd July 11, 2023
 .Dt ZPOOL-EVENTS 8
 .Os
 .
@ -305,10 +305,6 @@ The time when a given I/O request was submitted.
 The time required to service a given I/O request.
 .It Sy prev_state
 The previous state of the vdev.
-.It Sy cksum_expected
-The expected checksum value for the block.
-.It Sy cksum_actual
-The actual checksum value for an errant block.
 .It Sy cksum_algorithm
 Checksum algorithm used.
 See
@ -362,23 +358,6 @@ Like
 but contains
 .Pq Ar good data No & ~( Ns Ar bad data ) ;
 that is, the bits set in the good data which are cleared in the bad data.
-.It Sy bad_set_histogram
-If this field exists, it is an array of counters.
-Each entry counts bits set in a particular bit of a big-endian uint64 type.
-The first entry counts bits
-set in the high-order bit of the first byte, the 9th byte, etc, and the last
-entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc.
-This information is useful for observing a stuck bit in a parallel data path,
-such as IDE or parallel SCSI.
-.It Sy bad_cleared_histogram
-If this field exists, it is an array of counters.
-Each entry counts bit clears in a particular bit of a big-endian uint64 type.
-The first entry counts bits
-clears of the high-order bit of the first byte, the 9th byte, etc, and the
-last entry counts clears of the low-order bit of the 8th byte, the 16th byte,
-etc.
-This information is useful for observing a stuck bit in a parallel data
-path, such as IDE or parallel SCSI.
 .El
 .
 .Sh I/O STAGES
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@ -6263,7 +6263,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 			goto bad_write_fallback;
 		}
 	} else {
-#if __FreeBSD_version >= 1400086
+#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
+	__FreeBSD_version >= 1400086
 		vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
 		    LK_EXCLUSIVE);
 #else
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@ -1662,6 +1662,7 @@ zfs_umount(struct super_block *sb)
 	}

 	zfsvfs_free(zfsvfs);
+	sb->s_fs_info = NULL;
 	return (0);
 }

--- a/module/os/linux/zfs/zpl_super.c
+++ b/module/os/linux/zfs/zpl_super.c
@ -277,8 +277,6 @@ zpl_test_super(struct super_block *s, void *data)
 {
 	zfsvfs_t *zfsvfs = s->s_fs_info;
 	objset_t *os = data;
-	int match;
-
 	/*
 	 * If the os doesn't match the z_os in the super_block, assume it is
 	 * not a match. Matching would imply a multimount of a dataset. It is
@ -286,19 +284,7 @@ zpl_test_super(struct super_block *s, void *data)
 	 * that changes the z_os, e.g., rollback, where the match will be
 	 * missed, but in that case the user will get an EBUSY.
 	 */
-	if (zfsvfs == NULL || os != zfsvfs->z_os)
-		return (0);
-
-	/*
-	 * If they do match, recheck with the lock held to prevent mounting the
-	 * wrong dataset since z_os can be stale when the teardown lock is held.
-	 */
-	if (zpl_enter(zfsvfs, FTAG) != 0)
-		return (0);
-	match = (os == zfsvfs->z_os);
-	zpl_exit(zfsvfs, FTAG);
-
-	return (match);
+	return (zfsvfs != NULL && os == zfsvfs->z_os);
 }

 static struct super_block *
@ -324,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)

 	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

+	/*
+	 * Recheck with the lock held to prevent mounting the wrong dataset
+	 * since z_os can be stale when the teardown lock is held.
+	 *
+	 * We can't do this in zpl_test_super in since it's under spinlock and
+	 * also s_umount lock is not held there so it would race with
+	 * zfs_umount and zfsvfs can be freed.
+	 */
+	if (!IS_ERR(s) && s->s_fs_info != NULL) {
+		zfsvfs_t *zfsvfs = s->s_fs_info;
+		if (zpl_enter(zfsvfs, FTAG) == 0) {
+			if (os != zfsvfs->z_os)
+				err = -SET_ERROR(EBUSY);
+			zpl_exit(zfsvfs, FTAG);
+		} else {
+			err = -SET_ERROR(EBUSY);
+		}
+	}
 	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
 	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

 	if (IS_ERR(s))
 		return (ERR_CAST(s));

+	if (err) {
+		deactivate_locked_super(s);
+		return (ERR_PTR(err));
+	}
+
 	if (s->s_root == NULL) {
 		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
 		if (err) {
--- a/module/zcommon/zpool_prop.c
+++ b/module/zcommon/zpool_prop.c
@ -160,7 +160,7 @@ zpool_prop_init(void)
 	    "wait | continue | panic", "FAILMODE", failuremode_table,
 	    sfeatures);
 	zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
-	    SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL,
+	    SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL,
 	    "on | off", "AUTOTRIM", boolean_table, sfeatures);

 	/* hidden properties */
--- a/module/zfs/bpobj.c
+++ b/module/zfs/bpobj.c
@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
 	dmu_buf_t *dbuf = NULL;
 	bpobj_t *bpo = bpi->bpi_bpo;

-	for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
+	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
+	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
+	    sizeof (blkptr_t);
+	uint64_t ps = start * sizeof (blkptr_t);
+	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
+	    ps);
+	if (pe > pb) {
+		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
+	for (; i >= start; i--) {
 		uint64_t offset = i * sizeof (blkptr_t);
 		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
 			if (dbuf)
 				dmu_buf_rele(dbuf, FTAG);
 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
-			    offset, FTAG, &dbuf, 0);
+			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
 			if (err)
 				break;
+			pe = pb;
+			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
+			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
+			if (pe > pb) {
+				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
+				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
+			}
 		}

 		ASSERT3U(offset, >=, dbuf->db_offset);
@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
 			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
 			uint64_t offset = i * sizeof (uint64_t);

-			uint64_t obj_from_sublist;
+			uint64_t subobj;
 			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
-			    offset, sizeof (uint64_t), &obj_from_sublist,
-			    DMU_READ_PREFETCH);
+			    offset, sizeof (uint64_t), &subobj,
+			    DMU_READ_NO_PREFETCH);
 			if (err)
 				break;
-			bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+
+			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
 			    KM_SLEEP);
-
-			err = bpobj_open(sublist, bpo->bpo_os,
-			    obj_from_sublist);
-			if (err)
+			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
+			if (err) {
+				kmem_free(subbpo, sizeof (bpobj_t));
 				break;
+			}

-			list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
-			mutex_enter(&sublist->bpo_lock);
+			if (subbpo->bpo_havesubobj &&
+			    subbpo->bpo_phys->bpo_subobjs != 0) {
+				dmu_prefetch(subbpo->bpo_os,
+				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
+				    ZIO_PRIORITY_ASYNC_READ);
+			}
+
+			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
+			mutex_enter(&subbpo->bpo_lock);
 			bpi->bpi_unprocessed_subobjs--;
 		}
 	}
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@ -89,7 +89,11 @@ static int zfs_dmu_offset_next_sync = 1;
 * helps to limit the amount of memory that can be used by prefetching.
 * Larger objects should be prefetched a bit at a time.
 */
+#ifdef _ILP32
+uint_t dmu_prefetch_max = 8 * 1024 * 1024;
+#else
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+#endif

 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
@ -552,8 +556,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
-	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
-	    length <= zfetch_array_rd_sz) {
+	if ((flags & DMU_READ_NO_PREFETCH) == 0) {
 		/*
 		 * Prepare the zfetch before initiating the demand reads, so
 		 * that if multiple threads block on same indirect block, we
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@ -1795,17 +1795,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,
 	}

 	/*
-	 * The dmu does not currently support decreasing nlevels
-	 * or changing the number of dnode slots on an object. For
-	 * non-raw sends, this does not matter and the new object
-	 * can just use the previous one's nlevels. For raw sends,
-	 * however, the structure of the received dnode (including
-	 * nlevels and dnode slots) must match that of the send
-	 * side. Therefore, instead of using dmu_object_reclaim(),
-	 * we must free the object completely and call
-	 * dmu_object_claim_dnsize() instead.
+	 * The dmu does not currently support decreasing nlevels or changing
+	 * indirect block size if there is already one, same as changing the
+	 * number of of dnode slots on an object.  For non-raw sends this
+	 * does not matter and the new object can just use the previous one's
+	 * parameters.  For raw sends, however, the structure of the received
+	 * dnode (including indirects and dnode slots) must match that of the
+	 * send side.  Therefore, instead of using dmu_object_reclaim(), we
+	 * must free the object completely and call dmu_object_claim_dnsize()
+	 * instead.
 	 */
-	if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+	if ((rwa->raw && ((doi->doi_indirection > 1 &&
+	    indblksz != doi->doi_metadata_block_size) ||
+	    drro->drr_nlevels < doi->doi_indirection)) ||
 	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
 		err = dmu_free_long_object(rwa->os, drro->drr_object);
 		if (err != 0)
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@ -52,14 +52,19 @@ static unsigned int	zfetch_max_streams = 8;
 static unsigned int	zfetch_min_sec_reap = 1;
 /* max time before stream delete */
 static unsigned int	zfetch_max_sec_reap = 2;
+#ifdef _ILP32
+/* min bytes to prefetch per stream (default 2MB) */
+static unsigned int	zfetch_min_distance = 2 * 1024 * 1024;
+/* max bytes to prefetch per stream (default 8MB) */
+unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
+#else
 /* min bytes to prefetch per stream (default 4MB) */
 static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
 /* max bytes to prefetch per stream (default 64MB) */
 unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
+#endif
 /* max bytes to prefetch indirects for per stream (default 64MB) */
 unsigned int	zfetch_max_idistance = 64 * 1024 * 1024;
-/* max number of bytes in an array_read in which we allow prefetching (1MB) */
-uint64_t	zfetch_array_rd_sz = 1024 * 1024;

 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
@ -580,6 +585,3 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,

 ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
 	"Max bytes to prefetch indirects for per stream");
-
-ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW,
-	"Number of bytes in a array_read");
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@ -1882,7 +1882,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	if (ibs == dn->dn_indblkshift)
 		ibs = 0;

-	if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+	if (size == dn->dn_datablksz && ibs == 0)
 		return (0);

 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
@ -1905,6 +1905,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 	if (ibs && dn->dn_nlevels != 1)
 		goto fail;

+	dnode_setdirty(dn, tx);
+	if (size != dn->dn_datablksz) {
 		/* resize the old block */
 		err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 		if (err == 0) {
@ -1914,15 +1916,14 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 		}

 		dnode_setdblksz(dn, size);
-	dnode_setdirty(dn, tx);
 		dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
+		if (db)
+			dbuf_rele(db, FTAG);
+	}
 	if (ibs) {
 		dn->dn_indblkshift = ibs;
 		dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	}
-	/* release after we have fixed the blocksize in the dnode */
-	if (db)
-		dbuf_rele(db, FTAG);

 	rw_exit(&dn->dn_struct_rwlock);
 	return (0);
--- a/module/zfs/dsl_deadlist.c
+++ b/module/zfs/dsl_deadlist.c
@ -892,9 +892,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 	for (zap_cursor_init(&zc, dl->dl_os, obj);
 	    (error = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
-		uint64_t mintxg = zfs_strtonum(za->za_name, NULL);
-		dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx);
-		VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
+		    zfs_strtonum(za->za_name, NULL), tx);
+		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
 		if (perror == 0) {
 			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
 			    zfs_strtonum(pza->za_name, NULL));
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@ -2015,6 +2015,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
 		    zb->zb_objset, DMU_META_DNODE_OBJECT);

 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
+			if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
+				dsl_scan_prefetch_dnode(scn,
+				    &osp->os_projectused_dnode, zb->zb_objset,
+				    DMU_PROJECTUSED_OBJECT);
+			}
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_groupused_dnode, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);
@ -2075,10 +2080,16 @@ dsl_scan_prefetch_thread(void *arg)
 			zio_flags |= ZIO_FLAG_RAW;
 		}

+		/* We don't need data L1 buffer since we do not prefetch L0. */
+		blkptr_t *bp = &spic->spic_bp;
+		if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+			flags |= ARC_FLAG_NO_BUF;
+
 		/* issue the prefetch asynchronously */
-		(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
-		    &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
-		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+		(void) arc_read(scn->scn_zio_root, spa, bp,
+		    dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
+		    zio_flags, &flags, &spic->spic_zb);

 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024;
 */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;

+/*
+ * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
+ */
+uint_t metaslab_force_ganging_pct = 3;
+
 /*
 * In pools where the log space map feature is not enabled we touch
 * multiple metaslabs (and their respective space maps) with each
@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
-	if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) {
+	if (psize >= metaslab_force_ganging &&
+	    metaslab_force_ganging_pct > 0 &&
+	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
@ -6266,7 +6273,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");

 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
-	"Blocks larger than this size are forced to be gang blocks");
+	"Blocks larger than this size are sometimes forced to be gang blocks");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
+	"Percentage of large blocks that will be forced to be gang blocks");

 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@ -772,6 +772,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
+	spa->spa_gcd_alloc = INT_MAX;

 	/* Reset cached value */
 	spa->spa_dedup_dspace = ~0ULL;
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@ -889,9 +889,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
 	    &vd->vdev_not_present);

 	/*
-	 * Get the alignment requirement.
+	 * Get the alignment requirement. Ignore pool ashift for vdev
+	 * attach case.
 	 */
-	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+	if (alloctype != VDEV_ALLOC_ATTACH) {
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
+		    &vd->vdev_ashift);
+	} else {
+		vd->vdev_attaching = B_TRUE;
+	}

 	/*
 	 * Retrieve the vdev creation time.
@ -1393,6 +1399,36 @@ vdev_remove_parent(vdev_t *cvd)
 	vdev_free(mvd);
 }

+/*
+ * Choose GCD for spa_gcd_alloc.
+ */
+static uint64_t
+vdev_gcd(uint64_t a, uint64_t b)
+{
+	while (b != 0) {
+		uint64_t t = b;
+		b = a % b;
+		a = t;
+	}
+	return (a);
+}
+
+/*
+ * Set spa_min_alloc and spa_gcd_alloc.
+ */
+static void
+vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
+{
+	if (min_alloc < spa->spa_min_alloc)
+		spa->spa_min_alloc = min_alloc;
+	if (spa->spa_gcd_alloc == INT_MAX) {
+		spa->spa_gcd_alloc = min_alloc;
+	} else {
+		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
+		    spa->spa_gcd_alloc);
+	}
+}
+
 void
 vdev_metaslab_group_create(vdev_t *vd)
 {
@ -1445,8 +1481,7 @@ vdev_metaslab_group_create(vdev_t *vd)
 				spa->spa_min_ashift = vd->vdev_ashift;

 			uint64_t min_alloc = vdev_get_min_alloc(vd);
-			if (min_alloc < spa->spa_min_alloc)
-				spa->spa_min_alloc = min_alloc;
+			vdev_spa_set_alloc(spa, min_alloc);
 		}
 	}
 }
@ -2144,9 +2179,9 @@ vdev_open(vdev_t *vd)
 				return (SET_ERROR(EDOM));
 			}

-			if (vd->vdev_top == vd) {
+			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
 				vdev_ashift_optimize(vd);
-			}
+			vd->vdev_attaching = B_FALSE;
 		}
 		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
 		    vd->vdev_ashift > ASHIFT_MAX)) {
@ -2207,8 +2242,7 @@ vdev_open(vdev_t *vd)
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
 		uint64_t min_alloc = vdev_get_min_alloc(vd);
-		if (min_alloc < spa->spa_min_alloc)
-			spa->spa_min_alloc = min_alloc;
+		vdev_spa_set_alloc(spa, min_alloc);
 	}

 	/*
@ -5688,6 +5722,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
+	uint64_t objid;
 	nvlist_t *nvprops;

 	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
@ -5698,15 +5733,6 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 	if (vd == NULL)
 		return;

-	mutex_enter(&spa->spa_props_lock);
-
-	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
-		uint64_t intval, objid = 0;
-		const char *strval;
-		vdev_prop_t prop;
-		const char *propname = nvpair_name(elem);
-		zprop_type_t proptype;
-
 	/*
 	 * Set vdev property values in the vdev props mos object.
 	 */
@ -5717,12 +5743,18 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
-			/*
-			 * XXX: implement vdev_props_set_check()
-			 */
-			panic("vdev not root/top/leaf");
+		panic("unexpected vdev type");
 	}

+	mutex_enter(&spa->spa_props_lock);
+
+	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
+		uint64_t intval;
+		const char *strval;
+		vdev_prop_t prop;
+		const char *propname = nvpair_name(elem);
+		zprop_type_t proptype;
+
 		switch (prop = vdev_name_to_prop(propname)) {
 		case VDEV_PROP_USERPROP:
 			if (vdev_prop_user(propname)) {
@ -5791,6 +5823,12 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)

 	ASSERT(vd != NULL);

+	/* Check that vdev has a zap we can use */
+	if (vd->vdev_root_zap == 0 &&
+	    vd->vdev_top_zap == 0 &&
+	    vd->vdev_leaf_zap == 0)
+		return (SET_ERROR(EINVAL));
+
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@ -1398,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio,
 	vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&vd->vdev_stat_lock);

-	zio_bad_cksum_t zbc = {{{ 0 }}};
+	zio_bad_cksum_t zbc = { 0 };
 	abd_t *bad_abd = ic->ic_data;
 	abd_t *good_abd = is->is_good_child->ic_data;
 	(void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@ -1785,7 +1785,7 @@ vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 static int
 raidz_checksum_verify(zio_t *zio)
 {
-	zio_bad_cksum_t zbc = {{{0}}};
+	zio_bad_cksum_t zbc = {0};
 	raidz_map_t *rm = zio->io_vsd;

 	int ret = zio_checksum_error(zio, &zbc);
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@ -754,10 +754,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
 #define	MAX_RANGES		16

 typedef struct zfs_ecksum_info {
-	/* histograms of set and cleared bits by bit number in a 64-bit word */
-	uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY];
-	uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
-
 	/* inline arrays of bits set and cleared. */
 	uint64_t zei_bits_set[ZFM_MAX_INLINE];
 	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
@ -781,7 +777,7 @@ typedef struct zfs_ecksum_info {
 } zfs_ecksum_info_t;

 static void
-update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
+update_bad_bits(uint64_t value_arg, uint32_t *count)
 {
 	size_t i;
 	size_t bits = 0;
@ -789,11 +785,9 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)

 	/* We store the bits in big-endian (largest-first) order */
 	for (i = 0; i < 64; i++) {
-		if (value & (1ull << i)) {
-			hist[63 - i]++;
+		if (value & (1ull << i))
 			++bits;
 	}
-	}
 	/* update the count of bits changed */
 	*count += bits;
 }
@ -920,14 +914,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,

 	if (info != NULL && info->zbc_has_cksum) {
 		fm_payload_set(ereport,
-		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
-		    DATA_TYPE_UINT64_ARRAY,
-		    sizeof (info->zbc_expected) / sizeof (uint64_t),
-		    (uint64_t *)&info->zbc_expected,
-		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
-		    DATA_TYPE_UINT64_ARRAY,
-		    sizeof (info->zbc_actual) / sizeof (uint64_t),
-		    (uint64_t *)&info->zbc_actual,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
 		    DATA_TYPE_STRING,
 		    info->zbc_checksum_name,
@ -1010,10 +996,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 				offset++;
 			}

-			update_histogram(set, eip->zei_histogram_set,
-			    &eip->zei_range_sets[range]);
-			update_histogram(cleared, eip->zei_histogram_cleared,
-			    &eip->zei_range_clears[range]);
+			update_bad_bits(set, &eip->zei_range_sets[range]);
+			update_bad_bits(cleared, &eip->zei_range_clears[range]);
 		}

 		/* convert to byte offsets */
@ -1049,15 +1033,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_cleared,
 		    NULL);
-	} else {
-		fm_payload_set(ereport,
-		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
-		    DATA_TYPE_UINT8_ARRAY,
-		    NBBY * sizeof (uint64_t), eip->zei_histogram_set,
-		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
-		    DATA_TYPE_UINT8_ARRAY,
-		    NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
-		    NULL);
 	}
 	return (eip);
 }
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@ -1596,6 +1596,19 @@ zio_shrink(zio_t *zio, uint64_t size)
 	}
 }

+/*
+ * Round provided allocation size up to a value that can be allocated
+ * by at least some vdev(s) in the pool with minimum or no additional
+ * padding and without extra space usage on others
+ */
+static uint64_t
+zio_roundup_alloc_size(spa_t *spa, uint64_t size)
+{
+	if (size > spa->spa_min_alloc)
+		return (roundup(size, spa->spa_gcd_alloc));
+	return (spa->spa_min_alloc);
+}
+
 /*
 * ==========================================================================
 * Prepare to read and write logical blocks
@ -1802,9 +1815,8 @@ zio_write_compress(zio_t *zio)
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
-			ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
-			size_t rounded = (size_t)roundup(psize,
-			    spa->spa_min_alloc);
+			size_t rounded = (size_t)zio_roundup_alloc_size(spa,
+			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
@ -1847,8 +1859,8 @@ zio_write_compress(zio_t *zio)
 		 * take this codepath because it will change the on-disk block
 		 * and decryption will fail.
 		 */
-		size_t rounded = MIN((size_t)roundup(psize,
-		    spa->spa_min_alloc), lsize);
+		size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
+		    lsize);

 		if (rounded != psize) {
 			abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@ -515,8 +515,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
 	}

 	if (info != NULL) {
-		info->zbc_expected = expected_cksum;
-		info->zbc_actual = actual_cksum;
 		info->zbc_checksum_name = ci->ci_name;
 		info->zbc_byteswapped = byteswap;
 		info->zbc_injected = 0;
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh
@ -35,7 +35,7 @@
 #
 # STRATEGY:
 #	1. Create various pools with different ashift values.
-#	2. Verify 'attach -o ashift=<n>' works only with allowed values.
+#	2. Verify 'attach' works.
 #

 verify_runnable "global"
@ -65,28 +65,16 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16

 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
-do
-	for cmdval in ${ashifts[@]}
 do
 	log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
 	log_must verify_ashift $disk1 $ashift
-
-		# ashift_of(attached_disk) <= ashift_of(existing_vdev)
-		if [[ $cmdval -le $ashift ]]
-		then
-			log_must zpool attach -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
+	log_must zpool attach $TESTPOOL1 $disk1 $disk2
 	log_must verify_ashift $disk2 $ashift
-		else
-			log_mustnot zpool attach -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
-		fi
 	# clean things for the next run
 	log_must zpool destroy $TESTPOOL1
 	log_must zpool labelclear $disk1
 	log_must zpool labelclear $disk2
 done
-done

 typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-")
 for badval in ${badvals[@]}
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh
@ -35,7 +35,7 @@
 #
 # STRATEGY:
 #	1. Create various pools with different ashift values.
-#	2. Verify 'replace -o ashift=<n>' works only with allowed values.
+#	2. Verify 'replace' works.
 #

 verify_runnable "global"
@ -65,28 +65,18 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16

 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
 for ashift in ${ashifts[@]}
-do
-	for cmdval in ${ashifts[@]}
 do
 	log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
 	log_must verify_ashift $disk1 $ashift
 	# ashift_of(replacing_disk) <= ashift_of(existing_vdev)
-		if [[ $cmdval -le $ashift ]]
-		then
-			log_must zpool replace -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
+	log_must zpool replace $TESTPOOL1 $disk1 $disk2
 	log_must verify_ashift $disk2 $ashift
 	wait_replacing $TESTPOOL1
-		else
-			log_mustnot zpool replace -o ashift=$cmdval $TESTPOOL1 \
-			    $disk1 $disk2
-		fi
 	# clean things for the next run
 	log_must zpool destroy $TESTPOOL1
 	log_must zpool labelclear $disk1
 	log_must zpool labelclear $disk2
 done
-done

 typeset badvals=("off" "on" "1" "8" "17" "1b" "ff" "-")
 for badval in ${badvals[@]}
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh
@ -34,10 +34,8 @@
 #
 # STRATEGY:
 #	1. Create a pool with default values.
-#	2. Verify 'zpool replace' uses the ashift pool property value when
-#	   replacing an existing device.
-#	3. Verify the default ashift value can still be overridden by manually
-#	   specifying '-o ashift=<n>' from the command line.
+#	2. Override the pool ashift property.
+#	3. Verify 'zpool replace' works.
 #

 verify_runnable "global"
@ -72,21 +70,9 @@ do
 	do
 		log_must zpool create -o ashift=$ashift $TESTPOOL1 $disk1
 		log_must zpool set ashift=$pprop $TESTPOOL1
-		# ashift_of(replacing_disk) <= ashift_of(existing_vdev)
-		if [[ $pprop -le $ashift ]]
-		then
 		log_must zpool replace $TESTPOOL1 $disk1 $disk2
 		wait_replacing $TESTPOOL1
 		log_must verify_ashift $disk2 $ashift
-		else
-			# cannot replace if pool prop ashift > vdev ashift
-			log_mustnot zpool replace $TESTPOOL1 $disk1 $disk2
-			# verify we can override the pool prop value manually
-			log_must zpool replace -o ashift=$ashift $TESTPOOL1 \
-			    $disk1 $disk2
-			wait_replacing $TESTPOOL1
-			log_must verify_ashift $disk2 $ashift
-		fi
 		# clean things for the next run
 		log_must zpool destroy $TESTPOOL1
 		log_must zpool labelclear $disk1
Author	SHA1	Message	Date
Brian Behlendorf	70232483b4	Tag 2.2.0-rc2 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>	2023-07-21 16:36:34 -07:00
Rob N	c5273e0c31	shellcheck: disable "unreachable command" check [SC2317] This new check in 0.9.0 appears to have some issues with various forms of "early return", like trap, exit and return. This is tripping up (at least): cmd/zed/zed.d/history_event-zfs-list-cacher.sh /etc/zfs/zfs-functions Its not obvious what its complaining about or what the remedy is, so it seems sensible to disable this check for now. See also: https://www.shellcheck.net/wiki/SC2317 https://github.com/koalaman/shellcheck/issues/2542 https://github.com/koalaman/shellcheck/issues/2613 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <robn@despairlabs.com> Closes #15089	2023-07-21 16:35:12 -07:00
Rob N	685ae4429f	metaslab: tuneable to better control force ganging metaslab_force_ganging isn't enough to actually force ganging, because it still only forces 3% of the time. This adds metaslab_force_ganging_pct so we can configure how often to force ganging. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Rob Norris <rob.norris@klarasystems.com> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15088	2023-07-21 16:35:12 -07:00
Alexander Motin	81be809a25	Adjust prefetch parameters. - Reduce maximum prefetch distance for 32bit platforms to 8MB as it was previously. Those systems didn't grow much probably, so better stay conservative there. - Retire array_rd_sz tunable, blocking prefetch for large requests. We should not penalize applications trying to be more efficient. The speculative prefetcher by itself has reasonable distance limits, and 1MB is not much at all these days. Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15072	2023-07-21 16:35:12 -07:00
Alexander Motin	8a6fde8213	Add explicit prefetches to bpobj_iterate(). To simplify error handling bpobj_iterate_blkptrs() iterates through the list of block pointers backwards. Unfortunately speculative prefetcher is currently unable to detect such patterns, that makes each block read there synchronous and very slow on HDD pools. According to my tests, added explicit prefetch reduces time needed to asynchronously delete 8 snapshots of 4 million blocks each from 20 seconds to less than one, that should free sync thread for other useful work, such as async writes, scrub, etc. While there, plug one memory leak in case of bpobj_open() error and harmonize some variable names. Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15071	2023-07-21 16:35:12 -07:00
Alan Somers	b6f618f8ff	Don't emit cksum_{actual_expected} in ereport.fs.zfs.checksum events With anything but fletcher-4, even a tiny change in the input will cause the checksum value to change completely. So knowing the actual and expected checksums doesn't provide much more information than "they don't match". The harm in sending them is simply that they bloat the event. In particular, on FreeBSD the event must fit into a 1016 byte buffer. Fixes #14717 for mirrored pools. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Rich Ercolani <rincebrain@gmail.com> Signed-off-by: Alan Somers <asomers@gmail.com> Sponsored-by: Axcient Closes #14717 Closes #15052	2023-07-21 16:35:12 -07:00
Alan Somers	51a2b59767	Don't emit checksum histograms in ereport.fs.zfs.checksum events The checksum histograms were intended to be used with ATA and parallel SCSI, which are obsolete. With modern storage hardware, they will almost always look like white noise; all bits will be wrong. They only serve to bloat the event. That's a particular problem on FreeBSD, where events must fit into a 1016 byte buffer. This fixes issue #14717 for RAIDZ pools, but not for mirror pools. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Rich Ercolani <rincebrain@gmail.com> Signed-off-by: Alan Somers <asomers@gmail.com> Sponsored-by: Axcient Closes #15052	2023-07-21 16:35:12 -07:00
Tony Hutter	8c81c0b05d	zed: Fix zed ASSERT on slot power cycle We would see zed assert on one of our systems if we powered off a slot. Further examination showed zfs_retire_recv() was reporting a GUID of 0, which in turn would return a NULL nvlist. Add in a check for a zero GUID. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Closes #15084	2023-07-21 16:35:12 -07:00
Chunwei Chen	b221f43943	Fix zpl_test_super race with zfs_umount We cannot call zpl_enter in zpl_test_super, because zpl_test_super is under spinlock so we can't sleep, and also because zpl_test_super is called without sb->s_umount taken, so it's possible we would race with zfs_umount and call zpl_enter on freed zfsvfs. Here's an stack trace when this happens: [ 2379.114837] VERIFY(cvp->cv_magic == CV_MAGIC) failed [ 2379.114845] PANIC at spl-condvar.c:497:__cv_broadcast() [ 2379.114854] Kernel panic - not syncing: VERIFY(cvp->cv_magic == CV_MAGIC) failed [ 2379.115012] Call Trace: [ 2379.115019] dump_stack+0x74/0x96 [ 2379.115024] panic+0x114/0x2f6 [ 2379.115035] spl_panic+0xcf/0xfc [spl] [ 2379.115477] __cv_broadcast+0x68/0xa0 [spl] [ 2379.115585] rrw_exit+0xb8/0x310 [zfs] [ 2379.115696] rrm_exit+0x4a/0x80 [zfs] [ 2379.115808] zpl_test_super+0xa9/0xd0 [zfs] [ 2379.115920] sget+0xd1/0x230 [ 2379.116033] zpl_mount+0xdc/0x230 [zfs] [ 2379.116037] legacy_get_tree+0x28/0x50 [ 2379.116039] vfs_get_tree+0x27/0xc0 [ 2379.116045] path_mount+0x2fe/0xa70 [ 2379.116048] do_mount+0x80/0xa0 [ 2379.116050] __x64_sys_mount+0x8b/0xe0 [ 2379.116052] do_syscall_64+0x35/0x50 [ 2379.116054] entry_SYSCALL_64_after_hwframe+0x61/0xc6 [ 2379.116057] RIP: 0033:0x7f9912e8b26a Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Chunwei Chen <david.chen@nutanix.com> Closes #15077	2023-07-21 16:35:12 -07:00
Ameer Hamza	e037327bfe	spa_min_alloc should be GCD, not min Since spa_min_alloc may not be a power of 2, unlike ashifts, in the case of DRAID, we should not select the minimal value among several vdevs. Rounding to a multiple of it is unlikely to work for other vdevs. Instead, using the greatest common divisor produces smaller yet more reasonable results. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Ameer Hamza <ahamza@ixsystems.com> Closes #15067	2023-07-21 16:35:12 -07:00
Yuri Pankov	1a2e486d25	Don't panic if setting vdev properties is unsupported for this vdev type Check that vdev has valid zap and bail out early. While here, move objid selection out of the loop, it's not going to change. Reviewed-by: Allan Jude <allan@klarasystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Yuri Pankov <yuripv@FreeBSD.org> Closes #15063	2023-07-21 16:35:12 -07:00
Ameer Hamza	d8011707cc	Ignore pool ashift property during vdev attachment Ashift can be set for a vdev only during its creation, and the top-level vdev does not change when a vdev is attached or replaced. The ashift property should not be used during attachment, as it does not allow attaching/replacing a vdev if the pool's ashift property is increased after the existing vdev was created. Instead, we should be able to attach the vdev if the attached vdev can satisfy the ashift requirement with its parent. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Ameer Hamza <ahamza@ixsystems.com> Closes #15061	2023-07-21 16:35:12 -07:00
Wojciech Małota-Wójcik	f5f5a2db95	Rollback before zfs root is mounted On my machines I observe random failures caused by rollback happening after zfs root is mounted. I've observed two types of failures: - zfs-rollback-bootfs.service fails saying that rollback must be done just before mounting the dataset - boot process fails and rescue console is entered. After making this modification and testing it for couple of days none of those problems have been observed anymore. I don't know if `dracut-mount.service` is still needed in the `After` directive. Maybe someone else is able to address this? Reviewed-by: Gregory Bartholomew <gregory.lee.bartholomew@gmail.com> Signed-off-by: Wojciech Małota-Wójcik <59281144+outofforest@users.noreply.github.com> Closes #15025	2023-07-21 16:35:12 -07:00
Alexander Motin	83b0967c1f	Do not request data L1 buffers on scan prefetch. Set ARC_FLAG_NO_BUF when prefetching data L1 buffers for scan. We do not prefetch data L0 buffers, so we do not need the L1 buffers, only want them to be ready in ARC. This saves some CPU time on the buffers decompression. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15029	2023-07-21 16:35:12 -07:00
Coleman Kane	73ba5df31a	Linux 6.5 compat: disk_check_media_change() was added The disk_check_media_change() function was added which replaces bdev_check_media_change. This change was introduced in 6.5rc1 444aa2c58cb3b6cfe3b7cc7db6c294d73393a894 and the new function takes a gendisk* as its argument, no longer a block_device*. Thus, bdev->bd_disk is now used to pass the expected data. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Coleman Kane <ckane@colemankane.org> Closes #15060	2023-07-21 16:35:12 -07:00
Coleman Kane	1bc244ae93	Linux 6.5 compat: BLK_STS_NEXUS renamed to BLK_STS_RESV_CONFLICT This change was introduced in Linux commit 7ba150834b840f6f5cdd07ca69a4ccf39df59a66 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Coleman Kane <ckane@colemankane.org> Closes #15059	2023-07-21 16:35:12 -07:00
Coleman Kane	931dc70550	Linux 6.5 compat: intptr_t definition is canonically signed Make the version here match that elsewhere in the kernel and system headers. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Coleman Kane <ckane@colemankane.org> Closes #15058 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>	2023-07-21 16:35:12 -07:00
Yuri Pankov	5299f4f289	set autotrim default to 'off' everywhere As it turns out having autotrim default to 'on' on FreeBSD never really worked due to mess with defines where userland and kernel module were getting different default values (userland was defaulting to 'off', module was thinking it's 'on'). Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Yuri Pankov <yuripv@FreeBSD.org> Closes #15079	2023-07-21 16:35:12 -07:00
Alan Somers	f917cf1c03	Fix the ZFS checksum error histograms with larger record sizes My analysis in PR #14716 was incorrect. Each histogram bucket contains the number of incorrect bits, by position in a 64-bit word, over the entire record. 8-bit buckets can overflow for record sizes above 2k. To forestall that, saturate each bucket at 255. That should still get the point across: either all bits are equally wrong, or just a couple are. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alan Somers <asomers@gmail.com> Sponsored-by: Axcient Closes #15049	2023-07-21 16:35:12 -07:00
Alexander Motin	56ed389a57	Fix raw receive with different indirect block size. Unlike regular receive, raw receive require destination to have the same block structure as the source. In case of dnode reclaim this triggers two special cases, requiring special handling: - If dn_nlevels == 1, we can change the ibs, but dnode_set_blksz() should not dirty the data buffer if block size does not change, or durign receive dbuf_dirty_lightweight() will trigger assertion. - If dn_nlevels > 1, we just can't change the ibs, dnode_set_blksz() would fail and receive_object would trigger assertion, so we should destroy and recreate the dnode from scratch. Reviewed-by: Paul Dagnelie <pcd@delphix.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15039 (cherry picked from commit `c4e8742149`)	2023-07-20 08:58:29 -07:00
Alexander Motin	e613e4bbe3	Avoid extra snprintf() in dsl_deadlist_merge(). Since we are already iterating the ZAP, we have exact string key to remove, we do not need to call zap_remove_int() with the int key we just converted, we can call zap_remove() for the original string. This should make no functional change, only a micro-optimization. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15056 (cherry picked from commit `fdba8cbb79`)	2023-07-20 08:58:29 -07:00
Alexander Motin	b4e630b00c	Add missed DMU_PROJECTUSED_OBJECT prefetch. It seems `9c5167d19f` "Project Quota on ZFS" missed to add prefetch for DMU_PROJECTUSED_OBJECT during scan (scrub/resilver). It should not cause visible problems, but may affect scub/resilver performance. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15024	2023-07-20 08:58:29 -07:00
Mateusz Guzik	bf6cd30796	FreeBSD: catch up to __FreeBSD_version 1400093 Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> Closes #15036	2023-07-20 08:58:29 -07:00
Alexander Motin	1266cebf87	FreeBSD: Fix build on stable/13 after 1302506. Starting approximately from version 1302506 vn_lock_pair() grown two additional arguments following head. There is a one week hole, but that is closet reference point we have. Reviewed-by: Mateusz Guzik <mjguzik@gmail.com> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15047	2023-07-20 08:58:29 -07:00