From 8c29642e146f66ed121a963bbeca0baf2825d342 Mon Sep 17 00:00:00 2001
From: Pawel Jakub Dawidek
Date: Sun, 8 Jan 2023 11:31:22 -0800
Subject: [PATCH] Hierarchical bandwidth and operations rate limits.

Introduce six new properties: limit_{bw,op}_{read,write,total}.

The limit_bw_* properties limit the read, write, or combined bandwidth,
respectively, that a dataset and its descendants can consume.
Limits are applied to both file systems and ZFS volumes.

The configured limits are hierarchical, just like quotas; i.e., even if
a higher limit is configured on the child dataset, the parent's lower
limit will be enforced.

The limits are applied at the VFS level, not at the disk level.
The dataset is charged for each operation even if no disk access is
required (e.g., due to caching, compression, deduplication, or NOP
writes) or if the operation will cause more traffic (due to the copies
property, mirroring, or RAIDZ).

Read bandwidth consumption is based on:

- read-like syscalls, e.g., aio_read(2), pread(2), preadv(2), read(2),
  readv(2), sendfile(2)
- syscalls like getdents(2) and getdirentries(2)
- reading via mmapped files
- zfs send

Write bandwidth consumption is based on:

- write-like syscalls, e.g., aio_write(2), pwrite(2), pwritev(2),
  write(2), writev(2)
- writing via mmapped files
- zfs receive

The limit_op_* properties limit the read, write, or combined metadata
operations, respectively, that a dataset and its descendants can
generate.

Read operations consumption is based on:

- read-like syscalls where the number of operations is equal to the
  number of blocks being read (never less than 1)
- reading via mmapped files, where the number of operations is equal to
  the number of pages being read (never less than 1)
- syscalls accessing metadata: readlink(2), stat(2)

Write operations consumption is based on:

- write-like syscalls where the number of operations is equal to the
  number of blocks being written (never less than 1)
- writing via mmapped files, where the number of operations is equal to
  the number of pages being written (never less than 1)
- syscalls modifying a directory's content: bind(2) (UNIX-domain
  sockets), link(2), mkdir(2), mkfifo(2), mknod(2), open(2) (file
  creation), rename(2), rmdir(2), symlink(2), unlink(2)
- syscalls modifying metadata: chflags(2), chmod(2), chown(2),
  utimes(2)
- updating the access time of a file when reading it

Just like the limit_bw_* limits, the limit_op_* limits are also
hierarchical and applied at the VFS level.

Signed-off-by: Pawel Jakub Dawidek
---
 include/os/freebsd/spl/sys/sdt.h        |   2 +-
 include/os/freebsd/spl/sys/systm.h      |   1 +
 include/os/linux/spl/sys/timer.h        |   1 +
 include/sys/vfs_ratelimit.h             |  13 +-
 man/man7/zfsprops.7                     |   2 +-
 module/os/freebsd/zfs/zfs_vnops_os.c    | 111 +++-
 module/os/freebsd/zfs/zvol_os.c         |  37 +-
 module/os/linux/zfs/zfs_vnops_os.c      | 151 ++++-
 module/os/linux/zfs/zvol_os.c           |  26 +-
 module/zfs/dmu_recv.c                   |   6 +-
 module/zfs/dmu_send.c                   |  14 +-
 module/zfs/dsl_dir.c                    |   8 +-
 module/zfs/vfs_ratelimit.c              | 496 +++++++++---------
 module/zfs/zfs_vnops.c                  |  28 +-
 .../ratelimit/filesystem_op_single.ksh  |  16 +-
 15 files changed, 563 insertions(+), 349 deletions(-)

diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h
index 2daa6de1af..6f45e036bc 100644
--- a/include/os/freebsd/spl/sys/sdt.h
+++ b/include/os/freebsd/spl/sys/sdt.h
@@ -37,7 +37,7 @@ SDT_PROBE_DECLARE(sdt, , , set__error);
 #define	SET_ERROR(err) \
 	((sdt_sdt___set__error->id ?
\
	    (*sdt_probe_func)(sdt_sdt___set__error->id, \
-		(uintptr_t)err, 0, 0, 0, 0) : 0), err)
+		(uintptr_t)err, 0, 0, 0, 0, 0) : 0), err)
 #else
 #define	SET_ERROR(err) (err)
 #endif
diff --git a/include/os/freebsd/spl/sys/systm.h b/include/os/freebsd/spl/sys/systm.h
index 98ee955752..f17d820e7a 100644
--- a/include/os/freebsd/spl/sys/systm.h
+++ b/include/os/freebsd/spl/sys/systm.h
@@ -39,5 +39,6 @@
 #define	PAGEMASK	(~PAGEOFFSET)
 
 #define	delay(x)	pause("soldelay", (x))
+#define	delay_sig(x)	(pause_sig("soldelay", (x)) != EAGAIN)
 
 #endif	/* _OPENSOLARIS_SYS_SYSTM_H_ */
diff --git a/include/os/linux/spl/sys/timer.h b/include/os/linux/spl/sys/timer.h
index 02c3c78934..abb9ef04fe 100644
--- a/include/os/linux/spl/sys/timer.h
+++ b/include/os/linux/spl/sys/timer.h
@@ -51,6 +51,7 @@
 #define	ddi_time_after_eq64(a, b)	ddi_time_before_eq64(b, a)
 
 #define	delay(ticks)	schedule_timeout_uninterruptible(ticks)
+#define	delay_sig(ticks)	(schedule_timeout_interruptible(ticks) > 0)
 
 #define	SEC_TO_TICK(sec)	((sec) * HZ)
 #define	MSEC_TO_TICK(ms)	msecs_to_jiffies(ms)
diff --git a/include/sys/vfs_ratelimit.h b/include/sys/vfs_ratelimit.h
index c54821aa21..8b92476c83 100644
--- a/include/sys/vfs_ratelimit.h
+++ b/include/sys/vfs_ratelimit.h
@@ -55,10 +55,17 @@ void vfs_ratelimit_free(struct vfs_ratelimit *rl);
 struct vfs_ratelimit *vfs_ratelimit_set(struct vfs_ratelimit *rl,
     zfs_prop_t prop, uint64_t limit);
 
-void vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes);
-void vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes);
-void vfs_ratelimit_metadata_read(objset_t *os);
-void vfs_ratelimit_metadata_write(objset_t *os);
+int vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes);
+int vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes);
+int vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize,
+    size_t bytes);
+int vfs_ratelimit_metadata_read(objset_t *os);
+int vfs_ratelimit_metadata_write(objset_t *os);
+
+void vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize,
+    size_t bytes);
+void vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize,
+    size_t bytes);
 
 #ifdef __cplusplus
 }
diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7
index a97c69d121..185cbff3d6 100644
--- a/man/man7/zfsprops.7
+++ b/man/man7/zfsprops.7
@@ -1189,7 +1189,7 @@ This property may be changed with
 .It Sy limit_bw_total Ns = Ns Ar size Ns | Ns Sy none
 Limits the read, write, or combined bandwidth, respectively, that a dataset
 and its descendants can consume.
-Limits are applied to both file systems and ZFS volumes.
+Limits are applied to file systems, volumes, and their snapshots.
 Bandwidth limits are in bytes per second.
 .Pp
 The configured limits are hierarchical, just like quotas; i.e., even if a
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 62f6c87eca..d39ef04b48 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -1156,7 +1156,11 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
 		goto out;
 	}
 
-	vfs_ratelimit_metadata_write(os);
+	error = vfs_ratelimit_metadata_write(os);
+	if (error != 0) {
+		zfs_acl_ids_free(&acl_ids);
+		goto out;
+	}
 
 	getnewvnode_reserve_();
 
@@ -1291,7 +1295,10 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
 		ASSERT0(error);
 	}
 
-	vfs_ratelimit_metadata_write(zfsvfs->z_os);
+	error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
+	if (error != 0) {
+		goto out;
+	}
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
@@ -1321,8 +1328,7 @@
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		zfs_exit(zfsvfs, FTAG);
-		return (error);
+		goto out;
 	}
 
 	/*
@@ -1520,7 +1526,12 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
 		return (SET_ERROR(EDQUOT));
 	}
 
-	vfs_ratelimit_metadata_write(zfsvfs->z_os);
+	error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
+	if (error != 0) {
+		zfs_acl_ids_free(&acl_ids);
+		zfs_exit(zfsvfs, FTAG);
+		return (error);
+	}
 
 	/*
 	 * Add a new entry to the directory.
@@ -1643,6 +1654,10 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
 		goto out;
 	}
 
+	error = vfs_ratelimit_metadata_write(zfsvfs->z_os);
+	if (error != 0) {
+		goto out;
+	}
+
 	vnevent_rmdir(vp, dvp, name, ct);
 
-	vfs_ratelimit_metadata_write(zfsvfs->z_os);
@@ -1657,8 +1672,7 @@
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
-		zfs_exit(zfsvfs, FTAG);
-		return (error);
+		goto out;
 	}
 
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
@@ -1783,6 +1797,21 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
 	offset = zfs_uio_offset(uio);
 	prefetch = zp->z_zn_prefetch;
 
+	/*
+	 * Calling vfs_ratelimit_data_read() for each directory entry would be
+	 * way too expensive. We don't want to do that so we do the following
+	 * instead:
+	 * We charge here only for a single block. If there is a lot of traffic
+	 * we are going to wait before any reading is issued. Once we read all
+	 * directory entries we will charge the process for the rest, as this is
+	 * when we will know how much data exactly was read.
+	 */
+	error = vfs_ratelimit_data_read(os, zp->z_blksz, zp->z_blksz);
+	if (error != 0) {
+		zfs_exit(zfsvfs, FTAG);
+		return (error);
+	}
+
 	/*
 	 * Initialize the iterator cursor.
@@ -1940,12 +1969,16 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
 		*ncookies -= ncooks;
 
 	/*
-	 * This is post factum, but if we would do that inside the loop we
-	 * wouldn't know the record length before reading it anyway plus we
-	 * would be calling vfs_ratelimit_data_read() way too often and each
-	 * call accounts for a single operation.
+	 * Charge the process for the rest, if more than a single block was
+	 * read.
*/ - vfs_ratelimit_data_read(os, zp->z_blksz, outcount); + if (error == 0 && outcount > zp->z_blksz) { + error = vfs_ratelimit_data_read(os, zp->z_blksz, + outcount - zp->z_blksz); + if (error != 0) { + goto update; + } + } if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { iovp->iov_base += outcount; @@ -2039,7 +2073,11 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) } } - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * Return all attributes. It's cheaper to provide the answer @@ -2637,7 +2675,10 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) } } - vfs_ratelimit_metadata_write(os); + err = vfs_ratelimit_metadata_write(os); + if (err != 0) { + goto out2; + } tx = dmu_tx_create(os); @@ -3375,6 +3416,11 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, } } + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } + vn_seqc_write_begin(*svpp); vn_seqc_write_begin(sdvp); if (*tvpp != NULL) @@ -3586,14 +3632,18 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, return (error); } - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, - 0 /* projid */)) { + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_exit(zfsvfs, FTAG); + return (error); + } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); @@ -3692,7 +3742,11 @@ zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, @@ -3822,7 +3876,11 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, return (error); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); @@ -3839,8 +3897,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, error = zfs_link_create(tdzp, name, szp, tx, 0); if (error == 0) { - uint64_t txtype = TX_LINK; - zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + zfs_log_link(zilog, tx, TX_LINK, tdzp, szp, name); } dmu_tx_commit(tx); @@ -4153,7 +4210,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, pgsin_a = MIN(*rahead, pgsin_a); } - vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, + error = vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, MIN(end, obj_size) - start); /* @@ -4162,8 +4219,10 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. 
*/ - error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, - &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + if (error == 0) { + error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, + &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + } if (lr != NULL) zfs_rangelock_exit(lr); @@ -4292,7 +4351,9 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, goto out; } - vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, len); + if (vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, len) != 0) { + goto out; + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 4f8278feb1..3df8848172 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -730,7 +730,9 @@ zvol_geom_bio_strategy(struct bio *bp) if (bp->bio_cmd == BIO_DELETE) { /* Should we account only for a single metadata write? */ - vfs_ratelimit_metadata_write(zv->zv_objset); + error = vfs_ratelimit_metadata_write(zv->zv_objset); + if (error != 0) + goto unlock; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { @@ -747,29 +749,29 @@ zvol_geom_bio_strategy(struct bio *bp) while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { - vfs_ratelimit_data_read(zv->zv_objset, + error = vfs_ratelimit_data_read(zv->zv_objset, zv->zv_volblocksize, size); + if (error != 0) + break; error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); + if (error != 0) + break; } else { - vfs_ratelimit_data_write(zv->zv_objset, + error = vfs_ratelimit_data_write(zv->zv_objset, zv->zv_volblocksize, size); + if (error != 0) + break; dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (error != 0) { dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size, commit); - dmu_tx_commit(tx); + break; } - } - if (error) { - /* Convert checksum errors into IO errors. */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, commit); + dmu_tx_commit(tx); } off += size; addr += size; @@ -779,7 +781,12 @@ unlock: zfs_rangelock_exit(lr); bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length && off > volsize) + if (error == EINTR && bp->bio_completed > 0) + error = 0; + /* Convert checksum errors into IO errors. 
*/ + else if (error == ECKSUM) + error = SET_ERROR(EIO); + if (error == 0 && bp->bio_completed < bp->bio_length && off > volsize) error = SET_ERROR(EINVAL); switch (bp->bio_cmd) { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 4f3d3eea1b..31acb89bc5 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -683,7 +683,11 @@ top: goto out; } - vfs_ratelimit_metadata_write(os); + error = vfs_ratelimit_metadata_write(os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + goto out; + } tx = dmu_tx_create(os); @@ -879,7 +883,11 @@ top: goto out; } - vfs_ratelimit_metadata_write(os); + error = vfs_ratelimit_metadata_write(os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + goto out; + } tx = dmu_tx_create(os); @@ -1012,6 +1020,11 @@ top: goto out; } + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } + mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX); @@ -1290,7 +1303,13 @@ top: return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * Add a new entry to the directory. @@ -1434,7 +1453,10 @@ top: goto out; } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } /* * Grab a lock on the directory to make sure that no one is @@ -1535,6 +1557,7 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ + size_t nbytes; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); @@ -1553,6 +1576,21 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; + nbytes = 0; + + /* + * Calling vfs_ratelimit_data_read() for each directory entry would be + * way too expensive. We don't want to do that so we do the following + * instead: + * We charge here only for a single block. If there is a lot of traffic + * we are going to wait before any reading is issued. Once we read all + * directory entries we will charge the process for the rest, as this is + * when we will know how much data exactly was read. + */ + error = vfs_ratelimit_data_read(os, zp->z_blksz, zp->z_blksz); + if (error != 0) { + goto out; + } /* * Initialize the iterator cursor. @@ -1645,18 +1683,21 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) offset += 1; } ctx->pos = offset; + /* + * TODO: We should be adding size of dirent structure here too. + */ + nbytes += strlen(zap.za_name); } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ -#ifdef TODO /* - * This is post factum, but if we would do that inside the loop we - * wouldn't know the record length before reading it anyway plus we - * would be calling vfs_ratelimit_data_read() way too often and each - * call accounts for a single operation. + * Charge the process for the rest, if more than a single block was + * read. */ - vfs_ratelimit_data_read(os, zp->z_blksz, size /* ??? 
*/); -#endif + if (error == 0 && nbytes > zp->z_blksz) { + error = vfs_ratelimit_data_read(os, zp->z_blksz, + nbytes - zp->z_blksz); + } update: zap_cursor_fini(&zc); @@ -1697,7 +1738,11 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } mutex_enter(&zp->z_lock); @@ -2298,7 +2343,10 @@ top: } } - vfs_ratelimit_metadata_write(os); + err = vfs_ratelimit_metadata_write(os); + if (err != 0) { + goto out2; + } tx = dmu_tx_create(os); @@ -3012,7 +3060,10 @@ top: } } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); @@ -3328,7 +3379,13 @@ top: return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + zfs_exit(zfsvfs, FTAG); + return (error); + } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; @@ -3438,7 +3495,11 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } mutex_enter(&zp->z_lock); if (zp->z_is_sa) @@ -3577,7 +3638,11 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, return (error); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } top: /* @@ -3820,6 +3885,13 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, return (0); } + if (vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, pglen) != 0) { + unlock_page(pp); + zfs_rangelock_exit(lr); + zfs_exit(zfsvfs, FTAG); + return (0); + } + /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. @@ -3947,7 +4019,10 @@ zfs_dirty_inode(struct inode *ip, int flags) } #endif - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } tx = dmu_tx_create(zfsvfs->z_os); @@ -3994,7 +4069,6 @@ zfs_inactive(struct inode *ip) znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; - int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. 
rollback */ @@ -4009,28 +4083,30 @@ zfs_inactive(struct inode *ip) } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { - vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (vfs_ratelimit_metadata_write(zfsvfs->z_os) != 0) { + goto out; + } dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); - } else { - inode_timespec_t tmp_atime; - tmp_atime = zpl_inode_get_atime(ip); - ZFS_TIME_ENCODE(&tmp_atime, atime); - mutex_enter(&zp->z_lock); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&atime, sizeof (atime), tx); - zp->z_atime_dirty = B_FALSE; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); + goto out; } - } + inode_timespec_t tmp_atime; + tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&atime, sizeof (atime), tx); + zp->z_atime_dirty = B_FALSE; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } +out: zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); @@ -4046,6 +4122,7 @@ zfs_fillpage(struct inode *ip, struct page *pp) loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; + int error; ASSERT3U(io_off, <, i_size); @@ -4055,12 +4132,12 @@ zfs_fillpage(struct inode *ip, struct page *pp) vfs_ratelimit_data_read(zfsvfs->z_os, PAGESIZE, io_len); void *va = kmap(pp); - int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); kunmap(pp); - +out: if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -4097,7 +4174,9 @@ zfs_getpage(struct inode *ip, struct page *pp) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - error = zfs_fillpage(ip, pp); + error = vfs_ratelimit_data_read(zfsvfs->z_os, 0, PAGE_SIZE); + if (error == 0) + error = zfs_fillpage(ip, pp); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 5a19f3e579..b65064e3cc 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -297,8 +297,14 @@ zvol_write(zv_request_t *zvr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; - vfs_ratelimit_data_write(zv->zv_objset, zv->zv_volblocksize, - bytes); + error = vfs_ratelimit_data_write(zv->zv_objset, + zv->zv_volblocksize, bytes); + if (error != 0) { + /* XXX-PJD Is it safe to reset the error? */ + if (error == EINTR && uio.uio_resid < start_resid) + error = 0; + break; + } dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); @@ -400,7 +406,11 @@ zvol_discard(zv_request_t *zvr) start, size, RL_WRITER); /* Should we account only for a single metadata write? 
*/ - vfs_ratelimit_metadata_write(zv->zv_objset); + error = vfs_ratelimit_metadata_write(zv->zv_objset); + if (error != 0) { + zfs_rangelock_exit(lr); + goto unlock; + } tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); @@ -483,8 +493,14 @@ zvol_read(zv_request_t *zvr) if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; - vfs_ratelimit_data_read(zv->zv_objset, zv->zv_volblocksize, - bytes); + error = vfs_ratelimit_data_read(zv->zv_objset, + zv->zv_volblocksize, bytes); + if (error != 0) { + /* XXX-PJD Is it safe to reset the error? */ + if (error == EINTR && uio.uio_resid < start_resid) + error = 0; + break; + } error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 957d7c7c07..5a756b1049 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2205,7 +2205,11 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) ASSERT3U(drrw->drr_object, ==, rwa->last_object); - vfs_ratelimit_data_write(rwa->os, drrw->drr_logical_size, + /* + * vfs_ratelimit_data_write_spin() will sleep in short periods + * and return immediately when a signal is pending. + */ + vfs_ratelimit_data_write_spin(rwa->os, 0, drrw->drr_logical_size); if (drrw->drr_logical_size != dn->dn_datablksz) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 6a345b7dc6..92c40d25e7 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1631,6 +1631,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) struct srd *srdp = &range->sru.data; blkptr_t *bp = &srdp->bp; objset_t *os = srta->smta->os; + int error; ASSERT3U(range->type, ==, DATA); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); @@ -1685,11 +1686,15 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) .zb_blkid = range->start_blkid, }; - vfs_ratelimit_data_read(os, BP_GET_LSIZE(bp), BP_GET_LSIZE(bp)); + /* + * vfs_ratelimit_data_read_spin() will sleep in short periods and return + * immediately when a signal is pending. + */ + vfs_ratelimit_data_read_spin(os, 0, BP_GET_LSIZE(bp)); arc_flags_t aflags = ARC_FLAG_CACHED_ONLY; - int arc_err = arc_read(NULL, os->os_spa, bp, + error = arc_read(NULL, os->os_spa, bp, arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, &zb); /* @@ -1698,7 +1703,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) * entry to the ARC, and we also avoid polluting the ARC cache with * data that is not likely to be used in the future. 
*/ - if (arc_err != 0) { + if (error != 0) { srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE); srdp->io_outstanding = B_TRUE; zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd, @@ -2555,8 +2560,9 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig()) + if (issig()) { err = SET_ERROR(EINTR); + } } /* diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 4f7252e2c3..a5d66ac18c 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -371,6 +371,10 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_snap_cmtime = t; } + if (dd->dd_myname[0] != '$') { + dsl_dir_ratelimit_read(dd); + } + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); @@ -380,6 +384,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + vfs_ratelimit_free(dd->dd_ratelimit); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); @@ -2036,7 +2041,6 @@ dsl_dir_ratelimit_recurse(dsl_dir_t *dd) ASSERT(child_dd->dd_ratelimit == NULL); child_dd->dd_ratelimit_root = dd->dd_ratelimit_root; - dsl_dir_ratelimit_recurse(child_dd); dsl_dir_rele(child_dd, FTAG); @@ -2320,7 +2324,7 @@ dsl_dir_ratelimit_rename(dsl_dir_t *dd, dsl_dir_t *newparent) if (dd->dd_ratelimit_root != dd) { ASSERT(dd->dd_ratelimit == NULL); - dd->dd_ratelimit_root = newparent; + dd->dd_ratelimit_root = newparent->dd_ratelimit_root; dsl_dir_ratelimit_recurse(dd); } diff --git a/module/zfs/vfs_ratelimit.c b/module/zfs/vfs_ratelimit.c index 4e6493b136..18f3b09d07 100644 --- a/module/zfs/vfs_ratelimit.c +++ b/module/zfs/vfs_ratelimit.c @@ -54,7 +54,7 @@ * - It would be hard to predict what limits should be configured as there are a * lot of factors that dictate how much disk bandwidth is really required * (due to RAIDZ inflation, compression, gang blocks, deduplication, - * NOP writes, I/O aggregation, metadata traffic, etc.). + * block cloning, NOP writes, I/O aggregation, metadata traffic, etc.). * By enforcing the limits at the VFS level for file system operations it should * be easy to find out what limits applications require and verify that the * limits are correctly enforced by monitoring system calls issued by the @@ -76,20 +76,20 @@ * We walk down the dataset tree and set dd_ratelimit_root field to point to * this dsl_dir until we find dsl_dir that also has the vfs_ratelimit structure * already attached to it (which means it has its own limits configured). - * During the accounting it allows us for quick access to the ratelimit + * During the accounting it allows us to quickly access the ratelimit * structure we need by just going to ds_dir->dd_ratelimit_root; - * If ratelimits are not configured on this dataset or any of its parents, + * If ratelimits are not configured on this dataset and all of its ancestors, * the ds_dir->dd_ratelimit_root will be set to NULL, so we know we don't * have to do any accounting. * * The limits are configured per second, but we divde the second and the limits - * into RATELIMIT_RESOLUTION slots (10 by default). This is to avoid a choking + * into RATELIMIT_RESOLUTION slots (16 by default). This is to avoid a choking * effect, when process is doing progress in 1s steps. 
For example if we have
 * read bandwidth limits configured to 100MB/s and the process is trying to
 * read 130MB, it will take 1.3 seconds, not 2 seconds.
- * Not that very low limits may be rounded up - 7 ops/s limit will be rounded
- * up to 10 ops/s, so each slot is assigned 1 op/s limit. This rounding up
- * is done in the kernel and isn't shown in the properties value.
+ * Note that very low limits may be rounded up: a 7 ops/s limit will be
+ * rounded up to 16 ops/s, so each time slot is assigned a 1 op/s limit.
+ * This rounding up is done in the kernel and isn't shown in the properties.
 *
 * How does the accounting work?
 *
@@ -99,34 +99,31 @@
 * and two operations total. Not all of those limits have to be configured or
 * some might be configured on a dataset and others on a parent dataset(s).
 *
- * We remember those values in the rtslot structures at every level we have
- * limits configured on. The rtslot strucuture also remembers the time of
- * the request. For each ratelimit type (read bandwidth, total, operation read,
- * operation total) and for each dataset with the limits configured when we walk
- * the dataset tree up we find the point in time until which we have to wait to
- * satisfy configured limit. We select the furthest point in time and we do to
- * sleep. If the request doesn't exceed any limits, we just do the accounting
- * and allow for the request to be executed immediately.
+ * For each type we use two fields to track the wait times: rl_timeslot and
+ * rl_reminder. rl_timeslot holds the point in time up to which the last
+ * process is waiting. If rl_timeslot is lower than the current time, it
+ * means that no processes are waiting. rl_reminder is the amount of data
+ * modulo the limit. For example, assume a read bandwidth limit of 64MB/s,
+ * which is 4MB per 1/16s, and a process trying to read 11MB. This would
+ * give us rl_timeslot = now + 2 (we account for 2 full time slots of 1/16s)
+ * and rl_reminder = 3MB, so the process has to sleep for 2/16s. If another
+ * process immediately tries to read 1MB, that 1MB is added to the current
+ * rl_reminder, giving 4MB, a full limit unit for 1/16s. Now rl_timeslot
+ * will be set to now + 3 and rl_reminder to 0, and this second process
+ * will sleep for 3/16s.
 */

 /*
  * Number of slots we divide one second into. More granularity is better for
- * interactivity, but it takes more memory and more calculations.
+ * interactivity, but for small limits we may lose some precision.
 */
 #define	RATELIMIT_RESOLUTION	16

 struct vfs_ratelimit {
 	kmutex_t rl_lock;
 	uint64_t rl_limits[ZFS_RATELIMIT_NTYPES];
-	/* List of current waiters and past activity. */
-	list_t rl_list;
-};
-
-struct rtslot {
-	list_node_t rts_node;
-	hrtime_t rts_timeslot;
-	int rts_types;
-	uint64_t rts_counts[ZFS_RATELIMIT_NTYPES];
+	uint64_t rl_timeslot[ZFS_RATELIMIT_NTYPES];
+	uint64_t rl_reminder[ZFS_RATELIMIT_NTYPES];
 };

 int
@@ -197,13 +194,6 @@ vfs_ratelimit_alloc(const uint64_t *limits)
 	rl = kmem_zalloc(sizeof (*rl), KM_SLEEP);
 	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&rl->rl_list, sizeof (struct rtslot),
-	    offsetof(struct rtslot, rts_node));
-	/* Create two slots for a good start.
*/ - for (i = 0; i < 2; i++) { - list_insert_tail(&rl->rl_list, - kmem_zalloc(sizeof (struct rtslot), KM_SLEEP)); - } if (limits != NULL) { for (i = ZFS_RATELIMIT_FIRST; i < ZFS_RATELIMIT_NTYPES; i++) { @@ -227,17 +217,11 @@ vfs_ratelimit_alloc(const uint64_t *limits) void vfs_ratelimit_free(struct vfs_ratelimit *rl) { - struct rtslot *slot; if (rl == NULL) { return; } - while ((slot = list_remove_head(&rl->rl_list)) != NULL) { - kmem_free(slot, sizeof (*slot)); - } - list_destroy(&rl->rl_list); - mutex_destroy(&rl->rl_lock); kmem_free(rl, sizeof (*rl)); @@ -278,28 +262,24 @@ static __inline hrtime_t gettimeslot(void) { inode_timespec_t ts; - hrtime_t nsec; gethrestime(&ts); - nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; - return (nsec / (NANOSEC / RATELIMIT_RESOLUTION)); + + return (((hrtime_t)ts.tv_sec * RATELIMIT_RESOLUTION) + + ts.tv_nsec / (NANOSEC / RATELIMIT_RESOLUTION)); } /* * Returns bit mask of the types configured for the given ratelimit structure. */ static int -ratelimit_types(const struct vfs_ratelimit *rl) +ratelimit_types(const uint64_t *counts) { int types, type; - if (rl == NULL) { - return (0); - } - types = 0; for (type = ZFS_RATELIMIT_FIRST; type <= ZFS_RATELIMIT_LAST; type++) { - if (rl->rl_limits[type] > 0) { + if (counts[type] > 0) { types |= (1 << type); } } @@ -318,7 +298,6 @@ static dsl_dir_t * ratelimit_first(objset_t *os, int types) { dsl_dir_t *dd; - int mytypes; ASSERT(RRM_READ_HELD(&os->os_spa->spa_ratelimit_lock)); @@ -327,13 +306,17 @@ ratelimit_first(objset_t *os, int types) if (dd == NULL) { return (NULL); } - mytypes = ratelimit_types(dd->dd_ratelimit); - if ((mytypes & types) != 0) { - /* - * This dataset has at last one limit we are - * interested in. - */ - return (dd); + if (dd->dd_ratelimit != NULL) { + int mytypes; + + mytypes = ratelimit_types(dd->dd_ratelimit->rl_limits); + if ((mytypes & types) != 0) { + /* + * This dataset has at last one limit we are + * interested in. + */ + return (dd); + } } if (dd->dd_parent == NULL) { return (NULL); @@ -351,8 +334,6 @@ ratelimit_first(objset_t *os, int types) static dsl_dir_t * ratelimit_parent(dsl_dir_t *dd, int types) { - int mytypes; - ASSERT(RRM_READ_HELD(&dd->dd_pool->dp_spa->spa_ratelimit_lock)); for (;;) { @@ -363,154 +344,63 @@ ratelimit_parent(dsl_dir_t *dd, int types) if (dd == NULL) { return (NULL); } - mytypes = ratelimit_types(dd->dd_ratelimit); - if ((mytypes & types) != 0) { - /* - * This dataset has at last one limit we are - * interested in. - */ - return (dd); + if (dd->dd_ratelimit != NULL) { + int mytypes; + + mytypes = ratelimit_types(dd->dd_ratelimit->rl_limits); + if ((mytypes & types) != 0) { + /* + * This dataset has at last one limit we are + * interested in. + */ + return (dd); + } } } } -/* - * If we have any entries with 'timeslot > now' we also must have an entry with - * 'timeslot == now'. In other words if there is no entry with - * 'timeslot == now', it means that all the entires expired. - * - * We return either the most recent entry related to the given type or we return - * 'timeslot == now' entry not related to the given type and we will use it to - * store accouting information about this type as well. 
- */ -static struct rtslot * -ratelimit_find(struct vfs_ratelimit *rl, int typebit, hrtime_t now) -{ - struct rtslot *slot; - - ASSERT(MUTEX_HELD(&rl->rl_lock)); - - for (slot = list_head(&rl->rl_list); slot != NULL; - slot = list_next(&rl->rl_list, slot)) { - if (slot->rts_timeslot < now) { - break; - } - if ((slot->rts_types & typebit) != 0 || - slot->rts_timeslot == now) { - return (slot); - } - } - /* All the entries expired. */ -#ifndef NDEBUG - for (slot = list_head(&rl->rl_list); slot != NULL; - slot = list_next(&rl->rl_list, slot)) { - ASSERT(slot->rts_timeslot < now); - } -#endif - - return (NULL); -} - /* * Account for our request across all the types configured in this ratelimit * structure. * Return a timeslot we should wait for or now if we can execute the request * without waiting (we are within limits). */ -static uint64_t -ratelimit_account(struct vfs_ratelimit *rl, int types, hrtime_t now, +static hrtime_t +ratelimit_account(struct vfs_ratelimit *rl, hrtime_t now, const uint64_t *counts) { - uint64_t timeslot; - int type, typebit; + hrtime_t timeslot; + int type; - timeslot = 0; + timeslot = now; mutex_enter(&rl->rl_lock); for (type = ZFS_RATELIMIT_FIRST; type <= ZFS_RATELIMIT_LAST; type++) { - struct rtslot *slot; - uint64_t count, nexttimeslot; + uint64_t count; - typebit = (1 << type); - - if ((types & typebit) == 0) { - /* Not interested in this type. */ - continue; - } if (rl->rl_limits[type] == 0) { /* This type has no limit configured on this dataset. */ continue; } count = counts[type]; - ASSERT(count > 0); - - slot = ratelimit_find(rl, typebit, now); - if (slot == NULL) { - slot = list_remove_tail(&rl->rl_list); - ASSERT(slot->rts_timeslot < now); - slot->rts_types = typebit; - slot->rts_timeslot = now; - memset(slot->rts_counts, 0, sizeof (slot->rts_counts)); - list_insert_head(&rl->rl_list, slot); - } else if (slot->rts_timeslot == now) { - /* The 'now' slot may not have our type yet. */ - slot->rts_types |= typebit; - } - ASSERT((slot->rts_types & typebit) != 0); - nexttimeslot = slot->rts_timeslot + 1; - - for (;;) { - if (slot->rts_counts[type] + count <= - rl->rl_limits[type]) { - slot->rts_counts[type] += count; - break; - } - - /* - * This request is too big to fit into a single slot, - * ie. a single request exceeds the limit or this and - * the previous requests exceed the limit. - */ - - /* - * Fit as much as we can into the current slot. - */ - count -= rl->rl_limits[type] - slot->rts_counts[type]; - slot->rts_counts[type] = rl->rl_limits[type]; - - /* - * Take the next slot (if already exists isn't aware of - * our type yet), take an expired slot from the tail of - * the list or allocate a new slot. - */ - slot = list_prev(&rl->rl_list, slot); - if (slot != NULL) { - ASSERT((slot->rts_types & typebit) == 0); - ASSERT(slot->rts_timeslot == nexttimeslot); - ASSERT0(slot->rts_counts[type]); - - slot->rts_types |= typebit; - } else { - slot = list_tail(&rl->rl_list); - if (slot->rts_timeslot < now) { - list_remove(&rl->rl_list, slot); - } else { - slot = kmem_alloc(sizeof (*slot), - KM_SLEEP); - } - slot->rts_types = typebit; - slot->rts_timeslot = nexttimeslot; - memset(slot->rts_counts, 0, - sizeof (slot->rts_counts)); - list_insert_head(&rl->rl_list, slot); - } - - nexttimeslot++; + if (count == 0) { + /* Not interested in this type. 
+			 */
+			continue;
+		}
 
-		if (timeslot < slot->rts_timeslot) {
-			timeslot = slot->rts_timeslot;
+		if (rl->rl_timeslot[type] < now) {
+			rl->rl_reminder[type] = 0;
+			rl->rl_timeslot[type] = now;
+		} else {
+			count += rl->rl_reminder[type];
+		}
+
+		rl->rl_timeslot[type] += count / rl->rl_limits[type];
+		rl->rl_reminder[type] = count % rl->rl_limits[type];
+
+		if (timeslot < rl->rl_timeslot[type]) {
+			timeslot = rl->rl_timeslot[type];
 		}
 	}
@@ -519,106 +409,173 @@ ratelimit_account(struct vfs_ratelimit *rl, int types, hrtime_t now,
 	return (timeslot);
 }
 
-static void
-vfs_ratelimit(objset_t *os, int types, const uint64_t *counts)
+static hrtime_t
+ratelimit_account_all(objset_t *os, const uint64_t *counts)
 {
 	dsl_dir_t *dd;
 	hrtime_t now, timeslot;
+	int types;
+
+	ASSERT(RRM_READ_HELD(&os->os_spa->spa_ratelimit_lock));
+
+	types = ratelimit_types(counts);
+	now = timeslot = gettimeslot();
+
+	for (dd = ratelimit_first(os, types); dd != NULL;
+	    dd = ratelimit_parent(dd, types)) {
+		hrtime_t ts;
+
+		ts = ratelimit_account(dd->dd_ratelimit, now, counts);
+		if (ts > timeslot) {
+			timeslot = ts;
+		}
+	}
+
+	return (timeslot);
+}
+
+static int
+ratelimit_sleep(hrtime_t timeslot)
+{
+	hrtime_t now;
+	int error = 0;
 
 	now = gettimeslot();
-	timeslot = 0;
+
+	if (timeslot > now) {
+		/*
+		 * Too much traffic, slow it down.
+		 */
+#ifdef _KERNEL
+		if (delay_sig((hz / RATELIMIT_RESOLUTION) * (timeslot - now))) {
+			error = EINTR;
+		}
+#else
+		delay((hz / RATELIMIT_RESOLUTION) * (timeslot - now));
+#endif
+	}
+
+	return (error);
+}
+
+static int
+vfs_ratelimit_sleep(objset_t *os, const uint64_t *counts)
+{
+	hrtime_t timeslot;
 
 	/*
 	 * Prevents configuration changes when we have requests in-flight.
 	 */
 	rrm_enter_read(&os->os_spa->spa_ratelimit_lock, FTAG);
 
-	for (dd = ratelimit_first(os, types); dd != NULL;
-	    dd = ratelimit_parent(dd, types)) {
-		hrtime_t ts;
-
-		ts = ratelimit_account(dd->dd_ratelimit, types, now, counts);
-		if (ts > timeslot) {
-			timeslot = ts;
-		}
-	}
+	timeslot = ratelimit_account_all(os, counts);
 
 	rrm_exit(&os->os_spa->spa_ratelimit_lock, FTAG);
 
-	if (timeslot > now) {
-		/*
-		 * Too much traffic, slow it down.
- */ - delay((hz / RATELIMIT_RESOLUTION) * (timeslot - now)); - } + return (ratelimit_sleep(timeslot)); } /* * For every data read we charge: * - bytes of read bandwidth * - bytes of total bandwidth - * - (bytes - 1) / blocksize + 1 of read operations - * - (bytes - 1) / blocksize + 1 of total operations + * - (bytes + blocksize - 1) / blocksize of read operations + * - (bytes + blocksize - 1) / blocksize of total operations */ -void +int vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; + size_t operations; if (bytes == 0) { - return; + return (0); } if (blocksize == 0) { blocksize = bytes; } - - types = (1 << ZFS_RATELIMIT_BW_READ); - types |= (1 << ZFS_RATELIMIT_BW_TOTAL); - types |= (1 << ZFS_RATELIMIT_OP_READ); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); + operations = (bytes + blocksize - 1) / blocksize; memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_BW_READ] = bytes; counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; - counts[ZFS_RATELIMIT_OP_READ] = (bytes - 1) / blocksize + 1; - counts[ZFS_RATELIMIT_OP_TOTAL] = (bytes - 1) / blocksize + 1; + counts[ZFS_RATELIMIT_OP_READ] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); } /* * For every data write we charge: * - bytes of write bandwidth * - bytes of total bandwidth - * - (bytes - 1) / blocksize + 1 of write operations - * - (bytes - 1) / blocksize + 1 of total operations + * - (bytes + blocksize - 1) / blocksize of read operations + * - (bytes + blocksize - 1) / blocksize of total operations */ -void +int vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; + size_t operations; if (bytes == 0) { - return; + return (0); } if (blocksize == 0) { blocksize = bytes; } - - types = (1 << ZFS_RATELIMIT_BW_WRITE); - types |= (1 << ZFS_RATELIMIT_BW_TOTAL); - types |= (1 << ZFS_RATELIMIT_OP_WRITE); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); + operations = (bytes + blocksize - 1) / blocksize; memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_BW_WRITE] = bytes; counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; - counts[ZFS_RATELIMIT_OP_WRITE] = (bytes - 1) / blocksize + 1; - counts[ZFS_RATELIMIT_OP_TOTAL] = (bytes - 1) / blocksize + 1; + counts[ZFS_RATELIMIT_OP_WRITE] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); +} + +int +vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize, + size_t bytes) +{ + uint64_t counts[ZFS_RATELIMIT_NTYPES]; + size_t operations; + hrtime_t dstts, srcts; + spa_t *spa = srcos->os_spa; + + if (bytes == 0) { + return (0); + } + if (blocksize == 0) { + blocksize = bytes; + } + operations = (bytes + blocksize - 1) / blocksize; + + /* + * Prevents configuration changes when we have requests in-flight. 
+ */ + rrm_enter_read(&spa->spa_ratelimit_lock, FTAG); + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_READ] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_READ] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + srcts = ratelimit_account_all(srcos, counts); + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_WRITE] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_WRITE] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + dstts = ratelimit_account_all(dstos, counts); + + rrm_exit(&spa->spa_ratelimit_lock, FTAG); + + return (ratelimit_sleep(dstts > srcts ? dstts : srcts)); } /* @@ -626,20 +583,16 @@ vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes) * - one read operation * - one total operation */ -void +int vfs_ratelimit_metadata_read(objset_t *os) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; - - types = (1 << ZFS_RATELIMIT_OP_READ); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_OP_READ] = 1; counts[ZFS_RATELIMIT_OP_TOTAL] = 1; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); } /* @@ -647,18 +600,89 @@ vfs_ratelimit_metadata_read(objset_t *os) * - one read operation * - one total operation */ -void +int vfs_ratelimit_metadata_write(objset_t *os) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; - - types = (1 << ZFS_RATELIMIT_OP_WRITE); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_OP_WRITE] = 1; counts[ZFS_RATELIMIT_OP_TOTAL] = 1; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); +} + +/* + * Function spins until timeout is reached or the process received a signal. + * This function is different than ratelimit_sleep(), because pause_sig() + * might not be woken up by a signal if the process has multiple threads. + * We use *_spin() functions for zfs send/recv where kernel starts additional + * kernel threads and interrupting userland process with CTRL+C (SIGINT) + * doesn't interrupt pause_sig() waiting in another kernel thread. + */ +static void +ratelimit_spin(objset_t *os, const uint64_t *counts) +{ + hrtime_t timeslot; + + /* + * Prevents configuration changes when we have requests in-flight. 
+ */ + rrm_enter_read(&os->os_spa->spa_ratelimit_lock, FTAG); + + timeslot = ratelimit_account_all(os, counts); + + rrm_exit(&os->os_spa->spa_ratelimit_lock, FTAG); + + while (timeslot > gettimeslot() && !issig()) { + delay(hz / RATELIMIT_RESOLUTION); + } +} + +void +vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize, size_t bytes) +{ + uint64_t counts[ZFS_RATELIMIT_NTYPES]; + size_t operations; + + if (bytes == 0) { + return; + } + + if (blocksize == 0) { + blocksize = bytes; + } + operations = (bytes + blocksize - 1) / blocksize; + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_READ] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_READ] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + ratelimit_spin(os, counts); +} + +void +vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize, size_t bytes) +{ + uint64_t counts[ZFS_RATELIMIT_NTYPES]; + size_t operations; + + if (bytes == 0) { + return; + } + + if (blocksize == 0) { + blocksize = bytes; + } + operations = (bytes + blocksize - 1) / blocksize; + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_WRITE] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_WRITE] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + ratelimit_spin(os, counts); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8a50fe40e7..56c3a43169 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -78,7 +78,8 @@ static int zfs_bclone_wait_dirty = 0; /* * Maximum bytes to read per chunk in zfs_read(). */ -static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; +//static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; +static uint64_t zfs_vnops_read_chunk_size = 1024 * 512; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) @@ -299,7 +300,14 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); - vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, nbytes); + error = vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, + nbytes); + if (error != 0) { + if (error == EINTR && n < start_resid) { + error = 0; + } + break; + } #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) @@ -614,7 +622,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } } - vfs_ratelimit_data_write(zfsvfs->z_os, blksz, nbytes); + error = vfs_ratelimit_data_write(zfsvfs->z_os, blksz, nbytes); + if (error != 0) { + if (error == EINTR && n < start_resid) { + error = 0; + } + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } /* * Start a transaction. @@ -1315,8 +1331,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } - vfs_ratelimit_data_read(inos, inblksz, size); - vfs_ratelimit_data_write(outos, inblksz, size); + error = vfs_ratelimit_data_copy(inos, outos, inblksz, size); + if (error != 0) { + break; + } nbps = maxblocks; last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); diff --git a/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh b/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh index cdaa03efd4..957f4c0e22 100755 --- a/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh +++ b/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh @@ -90,15 +90,7 @@ log_must ratelimit_filesystem_op_single unlink limit_op_write=none 1024 1 "$TEST # Operations total limits limit writing. 
log_must ratelimit_filesystem_op_single chmod limit_op_total=128 512 4 "$TESTDIR/file" log_must ratelimit_filesystem_op_single chown limit_op_total=64 512 8 "$TESTDIR/file" -# Creating a file requires one metadata write and one metadata read operation. -# On successful open(2), zfs_freebsd_open() calls vnode_create_vobject() -# with size=0. If size=0, vnode_create_vobject() interprets this as not having -# the proper size and calls VOP_GETATTR(). -if is_freebsd; then - log_must ratelimit_filesystem_op_single create limit_op_total=128 512 8 "$TESTDIR/file" -else - log_must ratelimit_filesystem_op_single create limit_op_total=128 512 4 "$TESTDIR/file" -fi +log_must ratelimit_filesystem_op_single create limit_op_total=128 512 4 "$TESTDIR/file" log_must ratelimit_filesystem_op_single unlink limit_op_total=64 512 8 "$TESTDIR/file" log_must ratelimit_filesystem_op_single mkdir limit_op_total=128 512 4 "$TESTDIR/file" log_must ratelimit_filesystem_op_single rmdir limit_op_total=64 512 8 "$TESTDIR/file" @@ -122,11 +114,7 @@ log_must ratelimit_filesystem_op_single unlink limit_op_total=none 1024 1 "$TEST # Operations read limits don't affect writing. log_must ratelimit_filesystem_op_single chmod limit_op_read=32 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single chown limit_op_read=64 1024 1 "$TESTDIR/file" -if is_freebsd; then - log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 8 "$TESTDIR/file" -else - log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 1 "$TESTDIR/file" -fi +log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single unlink limit_op_read=256 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single mkdir limit_op_read=32 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single rmdir limit_op_read=64 1024 1 "$TESTDIR/file"
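
The rl_timeslot/rl_reminder arithmetic described in the vfs_ratelimit.c comment is easy to sanity-check in userland. The following standalone C sketch is not part of the patch: the names sketch_ratelimit and sketch_account are invented for illustration, and it mirrors only the core accounting from ratelimit_account() (no locking, no dataset tree walk, no clamping of limits). It reproduces the 64MB/s worked example from the comment, with the limit expressed in MB per 1/16s slot.

#include <stdio.h>
#include <stdint.h>

#define	RATELIMIT_RESOLUTION	16

struct sketch_ratelimit {
	uint64_t limit;		/* per-slot limit (per-second / RATELIMIT_RESOLUTION) */
	uint64_t timeslot;	/* slot up to which the last waiter sleeps */
	uint64_t reminder;	/* traffic carried over into the last slot */
};

/*
 * Charge `count` units against the limit and return the slot the caller
 * should sleep until; this mirrors the core of ratelimit_account().
 */
static uint64_t
sketch_account(struct sketch_ratelimit *rl, uint64_t now, uint64_t count)
{
	if (rl->timeslot < now) {
		/* Nobody is waiting; forget the old remainder. */
		rl->reminder = 0;
		rl->timeslot = now;
	} else {
		/* Queue behind the previous waiters' leftover. */
		count += rl->reminder;
	}
	rl->timeslot += count / rl->limit;
	rl->reminder = count % rl->limit;
	return (rl->timeslot);
}

int
main(void)
{
	/* 64MB/s limit, i.e. 4MB per 1/16s slot. */
	struct sketch_ratelimit rl = { .limit = 64 / RATELIMIT_RESOLUTION };
	uint64_t now = 100;

	/* 11MB: two full slots, 3MB remainder; sleep until now + 2. */
	printf("first:  slot %llu\n",
	    (unsigned long long)sketch_account(&rl, now, 11));
	/* 1MB plus 3MB carried over is one full slot; sleep until now + 3. */
	printf("second: slot %llu\n",
	    (unsigned long long)sketch_account(&rl, now, 1));
	return (0);
}

The limits themselves are driven by the new properties this patch introduces; illustrative (untested) commands, based on the property names and the size|none syntax in the zfsprops.7 change, would be "zfs set limit_bw_read=64M tank/home" to impose a limit and "zfs set limit_bw_read=none tank/home" to remove it.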