Hierarchical bandwidth and operations rate limits.

Introduce six new properties: limit_{bw,op}_{read,write,total}.

The limit_bw_* properties limit the read, write, or combined bandwidth,
respectively, that a dataset and its descendants can consume.
Limits are applied to both file systems and ZFS volumes.

The configured limits are hierarchical, just like quotas; i.e., even if
a higher limit is configured on the child dataset, the parent's lower
limit will be enforced.

The limits are applied at the VFS level, not at the disk level.
The dataset is charged for each operation even if no disk access is
required (e.g., due to caching, compression, deduplication,
or NOP writes) or if the operation will cause more traffic (due to
the copies property, mirroring, or RAIDZ).

Read bandwidth consumption is based on:

- read-like syscalls, e.g., aio_read(2), pread(2), preadv(2), read(2),
  readv(2), sendfile(2)

- syscalls like getdents(2) and getdirentries(2)

- reading via mmapped files

- zfs send

Write bandwidth consumption is based on:

- write-like syscalls, e.g., aio_write(2), pwrite(2), pwritev(2),
  write(2), writev(2)

- writing via mmapped files

- zfs receive

The limit_op_* properties limit the read, write, or combined metadata
operation rates, respectively, that a dataset and its descendants can
generate.

Read operations consumption is based on:

- read-like syscalls where the number of operations is equal to the
  number of blocks being read (never less than 1)

- reading via mmapped files, where the number of operations is equal
  to the number of pages being read (never less than 1)

- syscalls accessing metadata: readlink(2), stat(2)

Write operations consumption is based on:

- write-like syscalls where the number of operations is equal to the
  number of blocks being written (never less than 1)

- writing via mmapped files, where the number of operations is equal
  to the number of pages being written (never less than 1)

- syscalls modifying a directory's content: bind(2) (UNIX-domain
  sockets), link(2), mkdir(2), mkfifo(2), mknod(2), open(2) (file
  creation), rename(2), rmdir(2), symlink(2), unlink(2)

- syscalls modifying metadata: chflags(2), chmod(2), chown(2),
  utimes(2)

- updating the access time of a file when reading it

Just like limit_bw_* limits, the limit_op_* limits are also
hierarchical and applied at the VFS level.

Signed-off-by: Pawel Jakub Dawidek <pawel@dawidek.net>
This commit is contained in:
Pawel Jakub Dawidek 2023-01-08 11:31:22 -08:00
parent b4c7ee6271
commit 8c29642e14
15 changed files with 563 additions and 349 deletions

View File

@ -37,7 +37,7 @@ SDT_PROBE_DECLARE(sdt, , , set__error);
#define SET_ERROR(err) \
((sdt_sdt___set__error->id ? \
(*sdt_probe_func)(sdt_sdt___set__error->id, \
(uintptr_t)err, 0, 0, 0, 0) : 0), err)
(uintptr_t)err, 0, 0, 0, 0, 0) : 0), err)
#else
#define SET_ERROR(err) (err)
#endif

View File

@ -39,5 +39,6 @@
#define PAGEMASK (~PAGEOFFSET)
#define delay(x) pause("soldelay", (x))
#define delay_sig(x) (pause_sig("soldelay", (x)) != EAGAIN)
#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */

View File

@ -51,6 +51,7 @@
#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a)
#define delay(ticks) schedule_timeout_uninterruptible(ticks)
#define delay_sig(ticks) (schedule_timeout_interruptible(ticks) > 0)
#define SEC_TO_TICK(sec) ((sec) * HZ)
#define MSEC_TO_TICK(ms) msecs_to_jiffies(ms)

View File

@ -55,10 +55,15 @@ void vfs_ratelimit_free(struct vfs_ratelimit *rl);
struct vfs_ratelimit *vfs_ratelimit_set(struct vfs_ratelimit *rl,
zfs_prop_t prop, uint64_t limit);
void vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes);
void vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes);
void vfs_ratelimit_metadata_read(objset_t *os);
void vfs_ratelimit_metadata_write(objset_t *os);
int vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes);
int vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes);
int vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize,
size_t bytes);
int vfs_ratelimit_metadata_read(objset_t *os);
int vfs_ratelimit_metadata_write(objset_t *os);
void vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize, size_t bytes);
void vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize, size_t bytes);
#ifdef __cplusplus
}

View File

@ -1189,7 +1189,7 @@ This property may be changed with
.It Sy limit_bw_total Ns = Ns Ar size Ns | Ns Sy none
Limits the read, write, or combined bandwidth, respectively, that a dataset and
its descendants can consume.
Limits are applied to both file systems and ZFS volumes.
Limits are applied to file systems, volumes and their snapshots.
Bandwidth limits are in bytes per second.
.Pp
The configured limits are hierarchical, just like quotas; i.e., even if a

View File

@ -1156,7 +1156,11 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
goto out;
}
vfs_ratelimit_metadata_write(os);
error = vfs_ratelimit_metadata_write(os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
goto out;
}
getnewvnode_reserve_();
@ -1291,7 +1295,10 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
ASSERT0(error);
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
/*
* We may delete the znode now, or we may put it in the unlinked set;
@ -1321,8 +1328,7 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
zfs_exit(zfsvfs, FTAG);
return (error);
goto out;
}
/*
@ -1520,7 +1526,12 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
return (SET_ERROR(EDQUOT));
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
zfs_exit(zfsvfs, FTAG);
return (error);
}
/*
* Add a new entry to the directory.
@ -1643,6 +1654,11 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
goto out;
}
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
vnevent_rmdir(vp, dvp, name, ct);
vfs_ratelimit_metadata_write(zfsvfs->z_os);
@ -1657,8 +1673,7 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
zfs_exit(zfsvfs, FTAG);
return (error);
goto out;
}
error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
@ -1783,6 +1798,21 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
offset = zfs_uio_offset(uio);
prefetch = zp->z_zn_prefetch;
/*
* Calling vfs_ratelimit_data_read() for each directory entry would be
* way too expensive. We don't want to do that so we do the following
* instead:
* We charge here only for a single block. If there is a lot of traffic
* we are going to wait before any reading is issued. Once we read all
* directory entries we will charge the process for the rest, as this is
* when we will know how much data exactly was read.
*/
error = vfs_ratelimit_data_read(os, zp->z_blksz, zp->z_blksz);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
/*
* Initialize the iterator cursor.
*/
@ -1940,12 +1970,16 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
*ncookies -= ncooks;
/*
* This is post factum, but if we would do that inside the loop we
* wouldn't know the record length before reading it anyway plus we
* would be calling vfs_ratelimit_data_read() way too often and each
* call accounts for a single operation.
* Charge the process for the rest, if more than a single block was
* read.
*/
vfs_ratelimit_data_read(os, zp->z_blksz, outcount);
if (error == 0 && outcount > zp->z_blksz) {
error = vfs_ratelimit_data_read(os, zp->z_blksz,
outcount - zp->z_blksz);
if (error != 0) {
goto update;
}
}
if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
iovp->iov_base += outcount;
@ -2039,7 +2073,11 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
}
}
vfs_ratelimit_metadata_read(zfsvfs->z_os);
error = vfs_ratelimit_metadata_read(zfsvfs->z_os);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
/*
* Return all attributes. It's cheaper to provide the answer
@ -2637,7 +2675,10 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
}
}
vfs_ratelimit_metadata_write(os);
err = vfs_ratelimit_metadata_write(os);
if (err != 0) {
goto out2;
}
tx = dmu_tx_create(os);
@ -3375,6 +3416,11 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
}
}
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
vn_seqc_write_begin(*svpp);
vn_seqc_write_begin(sdvp);
if (*tvpp != NULL)
@ -3586,14 +3632,18 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
return (error);
}
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
0 /* projid */)) {
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) {
zfs_acl_ids_free(&acl_ids);
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EDQUOT));
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
zfs_exit(zfsvfs, FTAG);
return (error);
}
getnewvnode_reserve_();
tx = dmu_tx_create(zfsvfs->z_os);
@ -3692,7 +3742,11 @@ zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
vfs_ratelimit_metadata_read(zfsvfs->z_os);
error = vfs_ratelimit_metadata_read(zfsvfs->z_os);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
if (zp->z_is_sa)
error = sa_lookup_uio(zp->z_sa_hdl,
@ -3822,7 +3876,11 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
return (error);
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
@ -3839,8 +3897,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
error = zfs_link_create(tdzp, name, szp, tx, 0);
if (error == 0) {
uint64_t txtype = TX_LINK;
zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
zfs_log_link(zilog, tx, TX_LINK, tdzp, szp, name);
}
dmu_tx_commit(tx);
@ -4153,7 +4210,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
pgsin_a = MIN(*rahead, pgsin_a);
}
vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz,
error = vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz,
MIN(end, obj_size) - start);
/*
@ -4162,8 +4219,10 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
* ZFS will panic if we request DMU to read beyond the end of the last
* allocated block.
*/
error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b,
&pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
if (error == 0) {
error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count,
&pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
}
if (lr != NULL)
zfs_rangelock_exit(lr);
@ -4292,7 +4351,9 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
goto out;
}
vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, len);
if (vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, len) != 0) {
goto out;
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, zp->z_id, off, len);

View File

@ -730,7 +730,9 @@ zvol_geom_bio_strategy(struct bio *bp)
if (bp->bio_cmd == BIO_DELETE) {
/* Should we account only for a single metadata write? */
vfs_ratelimit_metadata_write(zv->zv_objset);
error = vfs_ratelimit_metadata_write(zv->zv_objset);
if (error != 0)
goto unlock;
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0) {
@ -747,30 +749,30 @@ zvol_geom_bio_strategy(struct bio *bp)
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
if (doread) {
vfs_ratelimit_data_read(zv->zv_objset,
error = vfs_ratelimit_data_read(zv->zv_objset,
zv->zv_volblocksize, size);
if (error != 0)
break;
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
DMU_READ_PREFETCH);
if (error != 0)
break;
} else {
vfs_ratelimit_data_write(zv->zv_objset,
error = vfs_ratelimit_data_write(zv->zv_objset,
zv->zv_volblocksize, size);
if (error != 0)
break;
dmu_tx_t *tx = dmu_tx_create(os);
dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
if (error != 0) {
dmu_tx_abort(tx);
} else {
break;
}
dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
zvol_log_write(zv, tx, off, size, commit);
dmu_tx_commit(tx);
}
}
if (error) {
/* Convert checksum errors into IO errors. */
if (error == ECKSUM)
error = SET_ERROR(EIO);
break;
}
off += size;
addr += size;
resid -= size;
@ -779,7 +781,12 @@ unlock:
zfs_rangelock_exit(lr);
bp->bio_completed = bp->bio_length - resid;
if (bp->bio_completed < bp->bio_length && off > volsize)
if (error == EINTR && bp->bio_completed > 0)
error = 0;
/* Convert checksum errors into IO errors. */
else if (error == ECKSUM)
error = SET_ERROR(EIO);
if (error == 0 && bp->bio_completed < bp->bio_length && off > volsize)
error = SET_ERROR(EINVAL);
switch (bp->bio_cmd) {

View File

@ -683,7 +683,11 @@ top:
goto out;
}
vfs_ratelimit_metadata_write(os);
error = vfs_ratelimit_metadata_write(os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
goto out;
}
tx = dmu_tx_create(os);
@ -879,7 +883,11 @@ top:
goto out;
}
vfs_ratelimit_metadata_write(os);
error = vfs_ratelimit_metadata_write(os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
goto out;
}
tx = dmu_tx_create(os);
@ -1012,6 +1020,11 @@ top:
goto out;
}
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
mutex_enter(&zp->z_lock);
may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
!zn_has_cached_data(zp, 0, LLONG_MAX);
@ -1290,7 +1303,13 @@ top:
return (SET_ERROR(EDQUOT));
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
zfs_exit(zfsvfs, FTAG);
return (error);
}
/*
* Add a new entry to the directory.
@ -1434,7 +1453,10 @@ top:
goto out;
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
/*
* Grab a lock on the directory to make sure that no one is
@ -1535,6 +1557,7 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
int done = 0;
uint64_t parent;
uint64_t offset; /* must be unsigned; checks for < 1 */
size_t nbytes;
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
@ -1553,6 +1576,21 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
os = zfsvfs->z_os;
offset = ctx->pos;
prefetch = zp->z_zn_prefetch;
nbytes = 0;
/*
* Calling vfs_ratelimit_data_read() for each directory entry would be
* way too expensive. We don't want to do that so we do the following
* instead:
* We charge here only for a single block. If there is a lot of traffic
* we are going to wait before any reading is issued. Once we read all
* directory entries we will charge the process for the rest, as this is
* when we will know how much data exactly was read.
*/
error = vfs_ratelimit_data_read(os, zp->z_blksz, zp->z_blksz);
if (error != 0) {
goto out;
}
/*
* Initialize the iterator cursor.
@ -1645,18 +1683,21 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
offset += 1;
}
ctx->pos = offset;
/*
* TODO: We should be adding size of dirent structure here too.
*/
nbytes += strlen(zap.za_name);
}
zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
#ifdef TODO
/*
* This is post factum, but if we would do that inside the loop we
* wouldn't know the record length before reading it anyway plus we
* would be calling vfs_ratelimit_data_read() way too often and each
* call accounts for a single operation.
* Charge the process for the rest, if more than a single block was
* read.
*/
vfs_ratelimit_data_read(os, zp->z_blksz, size /* ??? */);
#endif
if (error == 0 && nbytes > zp->z_blksz) {
error = vfs_ratelimit_data_read(os, zp->z_blksz,
nbytes - zp->z_blksz);
}
update:
zap_cursor_fini(&zc);
@ -1697,7 +1738,11 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
vfs_ratelimit_metadata_read(zfsvfs->z_os);
error = vfs_ratelimit_metadata_read(zfsvfs->z_os);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
mutex_enter(&zp->z_lock);
@ -2298,7 +2343,10 @@ top:
}
}
vfs_ratelimit_metadata_write(os);
err = vfs_ratelimit_metadata_write(os);
if (err != 0) {
goto out2;
}
tx = dmu_tx_create(os);
@ -3012,7 +3060,10 @@ top:
}
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
@ -3328,7 +3379,13 @@ top:
return (SET_ERROR(EDQUOT));
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
zfs_acl_ids_free(&acl_ids);
zfs_dirent_unlock(dl);
zfs_exit(zfsvfs, FTAG);
return (error);
}
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
@ -3438,7 +3495,11 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
vfs_ratelimit_metadata_read(zfsvfs->z_os);
error = vfs_ratelimit_metadata_read(zfsvfs->z_os);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
@ -3577,7 +3638,11 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
return (error);
}
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
zfs_exit(zfsvfs, FTAG);
return (error);
}
top:
/*
@ -3820,6 +3885,13 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
return (0);
}
if (vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, pglen) != 0) {
unlock_page(pp);
zfs_rangelock_exit(lr);
zfs_exit(zfsvfs, FTAG);
return (0);
}
/*
* Counterpart for redirty_page_for_writepage() above. This page
* was in fact not skipped and should not be counted as if it were.
@ -3947,7 +4019,10 @@ zfs_dirty_inode(struct inode *ip, int flags)
}
#endif
vfs_ratelimit_metadata_write(zfsvfs->z_os);
error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (error != 0) {
goto out;
}
tx = dmu_tx_create(zfsvfs->z_os);
@ -3994,7 +4069,6 @@ zfs_inactive(struct inode *ip)
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
uint64_t atime[2];
int error;
int need_unlock = 0;
/* Only read lock if we haven't already write locked, e.g. rollback */
@ -4009,16 +4083,19 @@ zfs_inactive(struct inode *ip)
}
if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
vfs_ratelimit_metadata_write(zfsvfs->z_os);
if (vfs_ratelimit_metadata_write(zfsvfs->z_os) != 0) {
goto out;
}
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
dmu_tx_abort(tx);
} else {
goto out;
}
inode_timespec_t tmp_atime;
tmp_atime = zpl_inode_get_atime(ip);
ZFS_TIME_ENCODE(&tmp_atime, atime);
@ -4029,8 +4106,7 @@ zfs_inactive(struct inode *ip)
mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
}
}
out:
zfs_zinactive(zp);
if (need_unlock)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
@ -4046,6 +4122,7 @@ zfs_fillpage(struct inode *ip, struct page *pp)
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
size_t io_len = PAGE_SIZE;
int error;
ASSERT3U(io_off, <, i_size);
@ -4055,12 +4132,12 @@ zfs_fillpage(struct inode *ip, struct page *pp)
vfs_ratelimit_data_read(zfsvfs->z_os, PAGESIZE, io_len);
void *va = kmap(pp);
int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
io_len, va, DMU_READ_PREFETCH);
if (io_len != PAGE_SIZE)
memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
kunmap(pp);
out:
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@ -4097,6 +4174,8 @@ zfs_getpage(struct inode *ip, struct page *pp)
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
error = vfs_ratelimit_data_read(zfsvfs->z_os, 0, PAGE_SIZE);
if (error == 0)
error = zfs_fillpage(ip, pp);
if (error == 0)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

View File

@ -297,8 +297,14 @@ zvol_write(zv_request_t *zvr)
if (bytes > volsize - off) /* don't write past the end */
bytes = volsize - off;
vfs_ratelimit_data_write(zv->zv_objset, zv->zv_volblocksize,
bytes);
error = vfs_ratelimit_data_write(zv->zv_objset,
zv->zv_volblocksize, bytes);
if (error != 0) {
/* XXX-PJD Is it safe to reset the error? */
if (error == EINTR && uio.uio_resid < start_resid)
error = 0;
break;
}
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
@ -400,7 +406,11 @@ zvol_discard(zv_request_t *zvr)
start, size, RL_WRITER);
/* Should we account only for a single metadata write? */
vfs_ratelimit_metadata_write(zv->zv_objset);
error = vfs_ratelimit_metadata_write(zv->zv_objset);
if (error != 0) {
zfs_rangelock_exit(lr);
goto unlock;
}
tx = dmu_tx_create(zv->zv_objset);
dmu_tx_mark_netfree(tx);
@ -483,8 +493,14 @@ zvol_read(zv_request_t *zvr)
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;
vfs_ratelimit_data_read(zv->zv_objset, zv->zv_volblocksize,
bytes);
error = vfs_ratelimit_data_read(zv->zv_objset,
zv->zv_volblocksize, bytes);
if (error != 0) {
/* XXX-PJD Is it safe to reset the error? */
if (error == EINTR && uio.uio_resid < start_resid)
error = 0;
break;
}
error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
if (error) {

View File

@ -2205,7 +2205,11 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
ASSERT3U(drrw->drr_object, ==, rwa->last_object);
vfs_ratelimit_data_write(rwa->os, drrw->drr_logical_size,
/*
* vfs_ratelimit_data_write_spin() will sleep in short periods
* and return immediately when a signal is pending.
*/
vfs_ratelimit_data_write_spin(rwa->os, 0,
drrw->drr_logical_size);
if (drrw->drr_logical_size != dn->dn_datablksz) {

View File

@ -1631,6 +1631,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
struct srd *srdp = &range->sru.data;
blkptr_t *bp = &srdp->bp;
objset_t *os = srta->smta->os;
int error;
ASSERT3U(range->type, ==, DATA);
ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
@ -1685,11 +1686,15 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
.zb_blkid = range->start_blkid,
};
vfs_ratelimit_data_read(os, BP_GET_LSIZE(bp), BP_GET_LSIZE(bp));
/*
* vfs_ratelimit_data_read_spin() will sleep in short periods and return
* immediately when a signal is pending.
*/
vfs_ratelimit_data_read_spin(os, 0, BP_GET_LSIZE(bp));
arc_flags_t aflags = ARC_FLAG_CACHED_ONLY;
int arc_err = arc_read(NULL, os->os_spa, bp,
error = arc_read(NULL, os->os_spa, bp,
arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ,
zioflags, &aflags, &zb);
/*
@ -1698,7 +1703,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
* entry to the ARC, and we also avoid polluting the ARC cache with
* data that is not likely to be used in the future.
*/
if (arc_err != 0) {
if (error != 0) {
srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE);
srdp->io_outstanding = B_TRUE;
zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd,
@ -2555,9 +2560,10 @@ dmu_send_impl(struct dmu_send_params *dspp)
while (err == 0 && !range->eos_marker) {
err = do_dump(&dsc, range);
range = get_next_range(&srt_arg->q, range);
if (issig())
if (issig()) {
err = SET_ERROR(EINTR);
}
}
/*
* If we hit an error or are interrupted, cancel our worker threads and

View File

@ -371,6 +371,10 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_snap_cmtime = t;
}
if (dd->dd_myname[0] != '$') {
dsl_dir_ratelimit_read(dd);
}
dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
&dd->dd_dbuf);
winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
@ -380,6 +384,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
vfs_ratelimit_free(dd->dd_ratelimit);
cv_destroy(&dd->dd_activity_cv);
mutex_destroy(&dd->dd_activity_lock);
mutex_destroy(&dd->dd_lock);
@ -2036,7 +2041,6 @@ dsl_dir_ratelimit_recurse(dsl_dir_t *dd)
ASSERT(child_dd->dd_ratelimit == NULL);
child_dd->dd_ratelimit_root = dd->dd_ratelimit_root;
dsl_dir_ratelimit_recurse(child_dd);
dsl_dir_rele(child_dd, FTAG);
@ -2320,7 +2324,7 @@ dsl_dir_ratelimit_rename(dsl_dir_t *dd, dsl_dir_t *newparent)
if (dd->dd_ratelimit_root != dd) {
ASSERT(dd->dd_ratelimit == NULL);
dd->dd_ratelimit_root = newparent;
dd->dd_ratelimit_root = newparent->dd_ratelimit_root;
dsl_dir_ratelimit_recurse(dd);
}

View File

@ -54,7 +54,7 @@
* - It would be hard to predict what limits should be configured as there are a
* lot of factors that dictate how much disk bandwidth is really required
* (due to RAIDZ inflation, compression, gang blocks, deduplication,
* NOP writes, I/O aggregation, metadata traffic, etc.).
* block cloning, NOP writes, I/O aggregation, metadata traffic, etc.).
* By enforcing the limits at the VFS level for file system operations it should
* be easy to find out what limits applications require and verify that the
* limits are correctly enforced by monitoring system calls issued by the
@ -76,20 +76,20 @@
* We walk down the dataset tree and set dd_ratelimit_root field to point to
* this dsl_dir until we find dsl_dir that also has the vfs_ratelimit structure
* already attached to it (which means it has its own limits configured).
* During the accounting it allows us for quick access to the ratelimit
* During the accounting it allows us to quickly access the ratelimit
* structure we need by just going to ds_dir->dd_ratelimit_root;
* If ratelimits are not configured on this dataset or any of its parents,
* If ratelimits are not configured on this dataset and all of its ancestors,
* the ds_dir->dd_ratelimit_root will be set to NULL, so we know we don't
* have to do any accounting.
*
* The limits are configured per second, but we divde the second and the limits
* into RATELIMIT_RESOLUTION slots (10 by default). This is to avoid a choking
* into RATELIMIT_RESOLUTION slots (16 by default). This is to avoid a choking
* effect, when process is doing progress in 1s steps. For example if we have
* read bandwidth limits configured to 100MB/s and the process is trying to
* read 130MB, it will take 1.3 seconds, not 2 seconds.
* Not that very low limits may be rounded up - 7 ops/s limit will be rounded
* up to 10 ops/s, so each slot is assigned 1 op/s limit. This rounding up
* is done in the kernel and isn't shown in the properties value.
* Note that very low limits may be rounded up - 7 ops/s limit will be rounded
* up to 16 ops/s, so each time slot is assigned 1 op/s limit. This rounding up
* is done in the kernel and isn't shown in the properties.
*
* How does the accounting work?
*
@ -99,34 +99,31 @@
* and two operations total. Not all of those limits have to be configured or
* some might be configured on a dataset and others on a parent dataset(s).
*
* We remember those values in the rtslot structures at every level we have
* limits configured on. The rtslot strucuture also remembers the time of
* the request. For each ratelimit type (read bandwidth, total, operation read,
* operation total) and for each dataset with the limits configured when we walk
* the dataset tree up we find the point in time until which we have to wait to
* satisfy configured limit. We select the furthest point in time and we do to
* sleep. If the request doesn't exceed any limits, we just do the accounting
* and allow for the request to be executed immediately.
* For each type we use two fields to track the wait times: rl_timeslot and
* rl_reminder. rl_timeslot holds the point in time up to which the last
* processes is waiting for. If the rl_timeslot is lower than the current time,
* it means that no processes are waiting. rl_reminder is the amount of data
* modulo the limit. For example if we have a read bandwidth limit of 64MB/s,
* so it is 4MB per 1/16s. The process is trying to read 11MB. This would
* give us rl_timeslot = now + 2 (we account for 2 full time slots of 1/16s)
* and rl_reminder = 3MB. This process has to sleep for 2/16s. When immediately
* another process is trying to read 1MB, this 1MB will be added to the current
* rl_reminder giving 4MB, so full limit unit for 1/16s. Now rl_timeslot will
* be set to now + 3 and rl_reminder to 0. The last process is going to sleep
* for 3/16s.
*/
/*
* Number of slots we divide one second into. More granularity is better for
* interactivity, but it takes more memory and more calculations.
* interactivity, but for small limits we may lose some precision.
*/
#define RATELIMIT_RESOLUTION 16
struct vfs_ratelimit {
kmutex_t rl_lock;
uint64_t rl_limits[ZFS_RATELIMIT_NTYPES];
/* List of current waiters and past activity. */
list_t rl_list;
};
struct rtslot {
list_node_t rts_node;
hrtime_t rts_timeslot;
int rts_types;
uint64_t rts_counts[ZFS_RATELIMIT_NTYPES];
uint64_t rl_timeslot[ZFS_RATELIMIT_NTYPES];
uint64_t rl_reminder[ZFS_RATELIMIT_NTYPES];
};
int
@ -197,13 +194,6 @@ vfs_ratelimit_alloc(const uint64_t *limits)
rl = kmem_zalloc(sizeof (*rl), KM_SLEEP);
mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&rl->rl_list, sizeof (struct rtslot),
offsetof(struct rtslot, rts_node));
/* Create two slots for a good start. */
for (i = 0; i < 2; i++) {
list_insert_tail(&rl->rl_list,
kmem_zalloc(sizeof (struct rtslot), KM_SLEEP));
}
if (limits != NULL) {
for (i = ZFS_RATELIMIT_FIRST; i < ZFS_RATELIMIT_NTYPES; i++) {
@ -227,17 +217,11 @@ vfs_ratelimit_alloc(const uint64_t *limits)
void
vfs_ratelimit_free(struct vfs_ratelimit *rl)
{
struct rtslot *slot;
if (rl == NULL) {
return;
}
while ((slot = list_remove_head(&rl->rl_list)) != NULL) {
kmem_free(slot, sizeof (*slot));
}
list_destroy(&rl->rl_list);
mutex_destroy(&rl->rl_lock);
kmem_free(rl, sizeof (*rl));
@ -278,28 +262,24 @@ static __inline hrtime_t
gettimeslot(void)
{
inode_timespec_t ts;
hrtime_t nsec;
gethrestime(&ts);
nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec;
return (nsec / (NANOSEC / RATELIMIT_RESOLUTION));
return (((hrtime_t)ts.tv_sec * RATELIMIT_RESOLUTION) +
ts.tv_nsec / (NANOSEC / RATELIMIT_RESOLUTION));
}
/*
* Returns bit mask of the types configured for the given ratelimit structure.
*/
static int
ratelimit_types(const struct vfs_ratelimit *rl)
ratelimit_types(const uint64_t *counts)
{
int types, type;
if (rl == NULL) {
return (0);
}
types = 0;
for (type = ZFS_RATELIMIT_FIRST; type <= ZFS_RATELIMIT_LAST; type++) {
if (rl->rl_limits[type] > 0) {
if (counts[type] > 0) {
types |= (1 << type);
}
}
@ -318,7 +298,6 @@ static dsl_dir_t *
ratelimit_first(objset_t *os, int types)
{
dsl_dir_t *dd;
int mytypes;
ASSERT(RRM_READ_HELD(&os->os_spa->spa_ratelimit_lock));
@ -327,7 +306,10 @@ ratelimit_first(objset_t *os, int types)
if (dd == NULL) {
return (NULL);
}
mytypes = ratelimit_types(dd->dd_ratelimit);
if (dd->dd_ratelimit != NULL) {
int mytypes;
mytypes = ratelimit_types(dd->dd_ratelimit->rl_limits);
if ((mytypes & types) != 0) {
/*
* This dataset has at last one limit we are
@ -335,6 +317,7 @@ ratelimit_first(objset_t *os, int types)
*/
return (dd);
}
}
if (dd->dd_parent == NULL) {
return (NULL);
}
@ -351,8 +334,6 @@ ratelimit_first(objset_t *os, int types)
static dsl_dir_t *
ratelimit_parent(dsl_dir_t *dd, int types)
{
int mytypes;
ASSERT(RRM_READ_HELD(&dd->dd_pool->dp_spa->spa_ratelimit_lock));
for (;;) {
@ -363,7 +344,10 @@ ratelimit_parent(dsl_dir_t *dd, int types)
if (dd == NULL) {
return (NULL);
}
mytypes = ratelimit_types(dd->dd_ratelimit);
if (dd->dd_ratelimit != NULL) {
int mytypes;
mytypes = ratelimit_types(dd->dd_ratelimit->rl_limits);
if ((mytypes & types) != 0) {
/*
* This dataset has at last one limit we are
@ -372,43 +356,7 @@ ratelimit_parent(dsl_dir_t *dd, int types)
return (dd);
}
}
}
/*
 * If we have any entries with 'timeslot > now' we also must have an entry
 * with 'timeslot == now'.  In other words, if there is no entry with
 * 'timeslot == now', it means that all the entries expired.
 *
 * We return either the most recent entry related to the given type or the
 * 'timeslot == now' entry not related to the given type, which the caller
 * will then use to store accounting information about this type as well.
 */
static struct rtslot *
ratelimit_find(struct vfs_ratelimit *rl, int typebit, hrtime_t now)
{
	struct rtslot *slot;

	ASSERT(MUTEX_HELD(&rl->rl_lock));

	for (slot = list_head(&rl->rl_list);
	    slot != NULL && slot->rts_timeslot >= now;
	    slot = list_next(&rl->rl_list, slot)) {
		if (slot->rts_timeslot == now ||
		    (slot->rts_types & typebit) != 0) {
			return (slot);
		}
	}

	/* All the entries expired. */
#ifndef NDEBUG
	for (slot = list_head(&rl->rl_list); slot != NULL;
	    slot = list_next(&rl->rl_list, slot)) {
		ASSERT(slot->rts_timeslot < now);
	}
#endif
	return (NULL);
}
/*
 * Account for the given counts against the limits configured on this
 * ratelimit structure.
 *
 * Return a timeslot we should wait for, or 'now' if we can execute the
 * request without waiting (we are within limits).
 *
 * Note: this is the reconstructed bucket-counter version; the interleaved
 * remains of the old per-timeslot slot-list implementation (which
 * referenced the removed 'slot'/'typebit' variables) have been dropped,
 * the stray double semicolon removed, and the missing mutex_exit()
 * pairing restored.
 */
static hrtime_t
ratelimit_account(struct vfs_ratelimit *rl, hrtime_t now,
    const uint64_t *counts)
{
	hrtime_t timeslot;
	int type;

	timeslot = now;

	mutex_enter(&rl->rl_lock);

	for (type = ZFS_RATELIMIT_FIRST; type <= ZFS_RATELIMIT_LAST; type++) {
		uint64_t count;

		if (rl->rl_limits[type] == 0) {
			/* This type has no limit configured on this dataset. */
			continue;
		}
		count = counts[type];
		if (count == 0) {
			/* Not interested in this type. */
			continue;
		}
		/*
		 * If the last charged timeslot is already in the past, start
		 * a fresh accounting period at 'now'; otherwise carry over
		 * the remainder left from the previous charge.
		 */
		if (rl->rl_timeslot[type] < now) {
			rl->rl_reminder[type] = 0;
			rl->rl_timeslot[type] = now;
		} else {
			count += rl->rl_reminder[type];
		}
		/*
		 * Advance the per-type timeslot by the number of full
		 * timeslots this request consumes and remember the leftover.
		 */
		rl->rl_timeslot[type] += count / rl->rl_limits[type];
		rl->rl_reminder[type] = count % rl->rl_limits[type];
		if (timeslot < rl->rl_timeslot[type]) {
			timeslot = rl->rl_timeslot[type];
		}
	}

	mutex_exit(&rl->rl_lock);

	return (timeslot);
}
static void
vfs_ratelimit(objset_t *os, int types, const uint64_t *counts)
static hrtime_t
ratelimit_account_all(objset_t *os, const uint64_t *counts)
{
dsl_dir_t *dd;
hrtime_t now, timeslot;
int types;
ASSERT(RRM_READ_HELD(&os->os_spa->spa_ratelimit_lock));
types = ratelimit_types(counts);
now = timeslot = gettimeslot();
for (dd = ratelimit_first(os, types); dd != NULL;
dd = ratelimit_parent(dd, types)) {
hrtime_t ts;
ts = ratelimit_account(dd->dd_ratelimit, now, counts);
if (ts > timeslot) {
timeslot = ts;
}
}
return (timeslot);
}
/*
 * Sleep until the given timeslot arrives.
 *
 * Returns 0 when no waiting was needed or the wait completed, or EINTR
 * when the sleep was interrupted by a signal (kernel only).
 *
 * Bug fix: the stray 'timeslot = 0;' assignment clobbered the argument,
 * so 'timeslot > now' could never be true and the function never slept,
 * silently disabling rate limiting.
 */
static int
ratelimit_sleep(hrtime_t timeslot)
{
	hrtime_t now;
	int error = 0;

	now = gettimeslot();
	if (timeslot > now) {
		/*
		 * Too much traffic, slow it down.
		 */
#ifdef _KERNEL
		if (delay_sig((hz / RATELIMIT_RESOLUTION) * (timeslot - now))) {
			error = EINTR;
		}
#else
		delay((hz / RATELIMIT_RESOLUTION) * (timeslot - now));
#endif
	}

	return (error);
}
/*
 * Charge the given counts against all the limits configured on this
 * dataset and its ancestors, then sleep if we exceeded any of them.
 *
 * Returns 0 or EINTR if the sleep was interrupted by a signal.
 *
 * Patch residue removed: the old accounting loop (referencing the
 * undeclared 'dd', 'types' and 'now') and the old inline delay block were
 * leftovers of the pre-refactor code; the accounting now lives in
 * ratelimit_account_all() and the sleeping in ratelimit_sleep().
 */
static int
vfs_ratelimit_sleep(objset_t *os, const uint64_t *counts)
{
	hrtime_t timeslot;

	/*
	 * Prevents configuration changes when we have requests in-flight.
	 */
	rrm_enter_read(&os->os_spa->spa_ratelimit_lock, FTAG);
	timeslot = ratelimit_account_all(os, counts);
	rrm_exit(&os->os_spa->spa_ratelimit_lock, FTAG);

	return (ratelimit_sleep(timeslot));
}
/*
 * For every data read we charge:
 * - bytes of read bandwidth
 * - bytes of total bandwidth
 * - (bytes + blocksize - 1) / blocksize of read operations
 * - (bytes + blocksize - 1) / blocksize of total operations
 *
 * Returns 0 or EINTR if the imposed delay was interrupted by a signal.
 */
int
vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];
	size_t operations;

	if (bytes == 0) {
		return (0);
	}
	if (blocksize == 0) {
		/* Unknown block size; treat the request as a single block. */
		blocksize = bytes;
	}

	/* Number of blocks touched, never less than 1. */
	operations = (bytes + blocksize - 1) / blocksize;

	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_BW_READ] = bytes;
	counts[ZFS_RATELIMIT_BW_TOTAL] = bytes;
	counts[ZFS_RATELIMIT_OP_READ] = operations;
	counts[ZFS_RATELIMIT_OP_TOTAL] = operations;

	return (vfs_ratelimit_sleep(os, counts));
}
/*
 * For every data write we charge:
 * - bytes of write bandwidth
 * - bytes of total bandwidth
 * - (bytes + blocksize - 1) / blocksize of write operations
 * - (bytes + blocksize - 1) / blocksize of total operations
 *
 * (The comment bullets previously said "read operations" — copy/paste
 * error; this is the write path and charges ZFS_RATELIMIT_OP_WRITE.)
 *
 * Returns 0 or EINTR if the imposed delay was interrupted by a signal.
 */
int
vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];
	size_t operations;

	if (bytes == 0) {
		return (0);
	}
	if (blocksize == 0) {
		/* Unknown block size; treat the request as a single block. */
		blocksize = bytes;
	}

	/* Number of blocks touched, never less than 1. */
	operations = (bytes + blocksize - 1) / blocksize;

	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_BW_WRITE] = bytes;
	counts[ZFS_RATELIMIT_BW_TOTAL] = bytes;
	counts[ZFS_RATELIMIT_OP_WRITE] = operations;
	counts[ZFS_RATELIMIT_OP_TOTAL] = operations;

	return (vfs_ratelimit_sleep(os, counts));
}
/*
 * Charge a copy/clone request: the source dataset is charged for a data
 * read and the destination dataset for a data write of the same size,
 * both accounted under a single hold of the pool's ratelimit lock.
 *
 * Returns 0 or EINTR if the imposed delay was interrupted by a signal.
 */
int
vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize,
    size_t bytes)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];
	size_t nops;
	hrtime_t ts_read, ts_write, timeslot;
	spa_t *spa = srcos->os_spa;

	if (bytes == 0) {
		return (0);
	}
	if (blocksize == 0) {
		blocksize = bytes;
	}

	/* Number of blocks touched, never less than 1. */
	nops = (bytes + blocksize - 1) / blocksize;

	/*
	 * Prevents configuration changes when we have requests in-flight.
	 */
	rrm_enter_read(&spa->spa_ratelimit_lock, FTAG);

	/* Charge the source side as a read. */
	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_BW_TOTAL] = bytes;
	counts[ZFS_RATELIMIT_BW_READ] = bytes;
	counts[ZFS_RATELIMIT_OP_TOTAL] = nops;
	counts[ZFS_RATELIMIT_OP_READ] = nops;
	ts_read = ratelimit_account_all(srcos, counts);

	/* Charge the destination side as a write. */
	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_BW_TOTAL] = bytes;
	counts[ZFS_RATELIMIT_BW_WRITE] = bytes;
	counts[ZFS_RATELIMIT_OP_TOTAL] = nops;
	counts[ZFS_RATELIMIT_OP_WRITE] = nops;
	ts_write = ratelimit_account_all(dstos, counts);

	rrm_exit(&spa->spa_ratelimit_lock, FTAG);

	/* Wait for whichever side imposes the later timeslot. */
	timeslot = (ts_write > ts_read) ? ts_write : ts_read;
	return (ratelimit_sleep(timeslot));
}
/*
 * For every metadata read we charge:
 * - one read operation
 * - one total operation
 *
 * Returns 0 or EINTR if the imposed delay was interrupted by a signal.
 */
int
vfs_ratelimit_metadata_read(objset_t *os)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];

	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_OP_READ] = 1;
	counts[ZFS_RATELIMIT_OP_TOTAL] = 1;

	return (vfs_ratelimit_sleep(os, counts));
}
/*
 * For every metadata write we charge:
 * - one write operation
 * - one total operation
 *
 * (The comment previously said "one read operation" — copy/paste error;
 * this path charges ZFS_RATELIMIT_OP_WRITE.)
 *
 * Returns 0 or EINTR if the imposed delay was interrupted by a signal.
 */
int
vfs_ratelimit_metadata_write(objset_t *os)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];

	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_OP_WRITE] = 1;
	counts[ZFS_RATELIMIT_OP_TOTAL] = 1;

	return (vfs_ratelimit_sleep(os, counts));
}
/*
 * Spin until the timeout is reached or the process receives a signal.
 * This differs from ratelimit_sleep(), because pause_sig() might not be
 * woken up by a signal if the process has multiple threads.  The *_spin()
 * variants are used for zfs send/recv, where the kernel starts additional
 * kernel threads and interrupting the userland process with CTRL+C
 * (SIGINT) doesn't interrupt pause_sig() waiting in another kernel thread.
 */
static void
ratelimit_spin(objset_t *os, const uint64_t *counts)
{
	hrtime_t deadline;

	/*
	 * Prevents configuration changes when we have requests in-flight.
	 */
	rrm_enter_read(&os->os_spa->spa_ratelimit_lock, FTAG);
	deadline = ratelimit_account_all(os, counts);
	rrm_exit(&os->os_spa->spa_ratelimit_lock, FTAG);

	for (;;) {
		if (gettimeslot() >= deadline || issig()) {
			break;
		}
		delay(hz / RATELIMIT_RESOLUTION);
	}
}
/*
 * Charge a data read and busy-wait (signal-aware) instead of sleeping.
 * Used by zfs send; see ratelimit_spin() for the rationale.
 */
void
vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize, size_t bytes)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];
	size_t nops;

	if (bytes == 0) {
		return;
	}
	if (blocksize == 0) {
		blocksize = bytes;
	}

	/* Number of blocks touched, never less than 1. */
	nops = (bytes + blocksize - 1) / blocksize;

	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_BW_TOTAL] = bytes;
	counts[ZFS_RATELIMIT_BW_READ] = bytes;
	counts[ZFS_RATELIMIT_OP_TOTAL] = nops;
	counts[ZFS_RATELIMIT_OP_READ] = nops;

	ratelimit_spin(os, counts);
}
/*
 * Charge a data write and busy-wait (signal-aware) instead of sleeping.
 * Used by zfs receive; see ratelimit_spin() for the rationale.
 */
void
vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize, size_t bytes)
{
	uint64_t counts[ZFS_RATELIMIT_NTYPES];
	size_t nops;

	if (bytes == 0) {
		return;
	}
	if (blocksize == 0) {
		blocksize = bytes;
	}

	/* Number of blocks touched, never less than 1. */
	nops = (bytes + blocksize - 1) / blocksize;

	memset(counts, 0, sizeof (counts));
	counts[ZFS_RATELIMIT_BW_TOTAL] = bytes;
	counts[ZFS_RATELIMIT_BW_WRITE] = bytes;
	counts[ZFS_RATELIMIT_OP_TOTAL] = nops;
	counts[ZFS_RATELIMIT_OP_WRITE] = nops;

	ratelimit_spin(os, counts);
}

View File

@ -78,7 +78,8 @@ static int zfs_bclone_wait_dirty = 0;
/*
 * Maximum bytes to read per chunk in zfs_read().
 *
 * Restored to 1 MiB: the commented-out original next to a halved value
 * was leftover local debugging unrelated to the rate-limit change.
 */
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
@ -299,7 +300,14 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, nbytes);
error = vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz,
nbytes);
if (error != 0) {
if (error == EINTR && n < start_resid) {
error = 0;
}
break;
}
#ifdef UIO_NOCOPY
if (zfs_uio_segflg(uio) == UIO_NOCOPY)
@ -614,7 +622,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}
}
vfs_ratelimit_data_write(zfsvfs->z_os, blksz, nbytes);
error = vfs_ratelimit_data_write(zfsvfs->z_os, blksz, nbytes);
if (error != 0) {
if (error == EINTR && n < start_resid) {
error = 0;
}
if (abuf != NULL)
dmu_return_arcbuf(abuf);
break;
}
/*
* Start a transaction.
@ -1315,8 +1331,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
break;
}
vfs_ratelimit_data_read(inos, inblksz, size);
vfs_ratelimit_data_write(outos, inblksz, size);
error = vfs_ratelimit_data_copy(inos, outos, inblksz, size);
if (error != 0) {
break;
}
nbps = maxblocks;
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));

View File

@ -90,15 +90,7 @@ log_must ratelimit_filesystem_op_single unlink limit_op_write=none 1024 1 "$TEST
# Operations total limits limit writing.
log_must ratelimit_filesystem_op_single chmod limit_op_total=128 512 4 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single chown limit_op_total=64 512 8 "$TESTDIR/file"
# Creating a file requires one metadata write and one metadata read operation.
# On successful open(2), zfs_freebsd_open() calls vnode_create_vobject()
# with size=0. If size=0, vnode_create_vobject() interprets this as not having
# the proper size and calls VOP_GETATTR().
if is_freebsd; then
log_must ratelimit_filesystem_op_single create limit_op_total=128 512 8 "$TESTDIR/file"
else
log_must ratelimit_filesystem_op_single create limit_op_total=128 512 4 "$TESTDIR/file"
fi
log_must ratelimit_filesystem_op_single create limit_op_total=128 512 4 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single unlink limit_op_total=64 512 8 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single mkdir limit_op_total=128 512 4 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single rmdir limit_op_total=64 512 8 "$TESTDIR/file"
@ -122,11 +114,7 @@ log_must ratelimit_filesystem_op_single unlink limit_op_total=none 1024 1 "$TEST
# Operations read limits don't affect writing.
log_must ratelimit_filesystem_op_single chmod limit_op_read=32 1024 1 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single chown limit_op_read=64 1024 1 "$TESTDIR/file"
if is_freebsd; then
log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 8 "$TESTDIR/file"
else
log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 1 "$TESTDIR/file"
fi
log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 1 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single unlink limit_op_read=256 1024 1 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single mkdir limit_op_read=32 1024 1 "$TESTDIR/file"
log_must ratelimit_filesystem_op_single rmdir limit_op_read=64 1024 1 "$TESTDIR/file"