diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h index 2daa6de1af..6f45e036bc 100644 --- a/include/os/freebsd/spl/sys/sdt.h +++ b/include/os/freebsd/spl/sys/sdt.h @@ -37,7 +37,7 @@ SDT_PROBE_DECLARE(sdt, , , set__error); #define SET_ERROR(err) \ ((sdt_sdt___set__error->id ? \ (*sdt_probe_func)(sdt_sdt___set__error->id, \ - (uintptr_t)err, 0, 0, 0, 0) : 0), err) + (uintptr_t)err, 0, 0, 0, 0, 0) : 0), err) #else #define SET_ERROR(err) (err) #endif diff --git a/include/os/freebsd/spl/sys/systm.h b/include/os/freebsd/spl/sys/systm.h index 98ee955752..f17d820e7a 100644 --- a/include/os/freebsd/spl/sys/systm.h +++ b/include/os/freebsd/spl/sys/systm.h @@ -39,5 +39,6 @@ #define PAGEMASK (~PAGEOFFSET) #define delay(x) pause("soldelay", (x)) +#define delay_sig(x) (pause_sig("soldelay", (x)) != EAGAIN) #endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ diff --git a/include/os/linux/spl/sys/timer.h b/include/os/linux/spl/sys/timer.h index 02c3c78934..abb9ef04fe 100644 --- a/include/os/linux/spl/sys/timer.h +++ b/include/os/linux/spl/sys/timer.h @@ -51,6 +51,7 @@ #define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a) #define delay(ticks) schedule_timeout_uninterruptible(ticks) +#define delay_sig(ticks) (schedule_timeout_interruptible(ticks) > 0) #define SEC_TO_TICK(sec) ((sec) * HZ) #define MSEC_TO_TICK(ms) msecs_to_jiffies(ms) diff --git a/include/sys/vfs_ratelimit.h b/include/sys/vfs_ratelimit.h index c54821aa21..8b92476c83 100644 --- a/include/sys/vfs_ratelimit.h +++ b/include/sys/vfs_ratelimit.h @@ -55,10 +55,15 @@ void vfs_ratelimit_free(struct vfs_ratelimit *rl); struct vfs_ratelimit *vfs_ratelimit_set(struct vfs_ratelimit *rl, zfs_prop_t prop, uint64_t limit); -void vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes); -void vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes); -void vfs_ratelimit_metadata_read(objset_t *os); -void vfs_ratelimit_metadata_write(objset_t *os); +int vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes); +int vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes); +int vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize, + size_t bytes); +int vfs_ratelimit_metadata_read(objset_t *os); +int vfs_ratelimit_metadata_write(objset_t *os); + +void vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize, size_t bytes); +void vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize, size_t bytes); #ifdef __cplusplus } diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index a97c69d121..185cbff3d6 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1189,7 +1189,7 @@ This property may be changed with .It Sy limit_bw_total Ns = Ns Ar size Ns | Ns Sy none Limits the read, write, or combined bandwidth, respectively, that a dataset and its descendants can consume. -Limits are applied to both file systems and ZFS volumes. +Limits are applied to file systems, volumes and their snapshots. Bandwidth limits are in bytes per second. 
.Pp The configured limits are hierarchical, just like quotas; i.e., even if a diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 62f6c87eca..d39ef04b48 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1156,7 +1156,11 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, goto out; } - vfs_ratelimit_metadata_write(os); + error = vfs_ratelimit_metadata_write(os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + goto out; + } getnewvnode_reserve_(); @@ -1291,7 +1295,10 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) ASSERT0(error); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } /* * We may delete the znode now, or we may put it in the unlinked set; @@ -1321,8 +1328,7 @@ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_exit(zfsvfs, FTAG); - return (error); + goto out; } /* @@ -1520,7 +1526,12 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * Add a new entry to the directory. @@ -1643,6 +1654,10 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) goto out; } + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } + vnevent_rmdir(vp, dvp, name, ct); - vfs_ratelimit_metadata_write(zfsvfs->z_os); @@ -1657,8 +1673,7 @@ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_exit(zfsvfs, FTAG); - return (error); + goto out; } error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); @@ -1783,6 +1798,21 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, offset = zfs_uio_offset(uio); prefetch = zp->z_zn_prefetch; + /* + * Calling vfs_ratelimit_data_read() for each directory entry would be + * way too expensive, so we do the following instead: + * we charge here up front for a single block only. If there is a lot + * of traffic, we wait before any reading is issued. Once all the + * directory entries have been read, we charge the process for the + * rest, since only then do we know exactly how much data was + * read. + */ + error = vfs_ratelimit_data_read(os, zp->z_blksz, zp->z_blksz); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } + /* * Initialize the iterator cursor. */ @@ -1940,12 +1970,16 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, *ncookies -= ncooks; /* - * This is post factum, but if we would do that inside the loop we - * wouldn't know the record length before reading it anyway plus we - * would be calling vfs_ratelimit_data_read() way too often and each - * call accounts for a single operation. + * Charge the process for the rest if more than a single block was + * read.
*/ - vfs_ratelimit_data_read(os, zp->z_blksz, outcount); + if (error == 0 && outcount > zp->z_blksz) { + error = vfs_ratelimit_data_read(os, zp->z_blksz, + outcount - zp->z_blksz); + if (error != 0) { + goto update; + } + } if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { iovp->iov_base += outcount; @@ -2039,7 +2073,11 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) } } - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * Return all attributes. It's cheaper to provide the answer @@ -2637,7 +2675,10 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) } } - vfs_ratelimit_metadata_write(os); + err = vfs_ratelimit_metadata_write(os); + if (err != 0) { + goto out2; + } tx = dmu_tx_create(os); @@ -3375,6 +3416,11 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, } } + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } + vn_seqc_write_begin(*svpp); vn_seqc_write_begin(sdvp); if (*tvpp != NULL) @@ -3586,14 +3632,18 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, return (error); } - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, - 0 /* projid */)) { + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_exit(zfsvfs, FTAG); + return (error); + } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); @@ -3692,7 +3742,11 @@ zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, @@ -3822,7 +3876,11 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, return (error); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); @@ -3839,8 +3897,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, error = zfs_link_create(tdzp, name, szp, tx, 0); if (error == 0) { - uint64_t txtype = TX_LINK; - zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + zfs_log_link(zilog, tx, TX_LINK, tdzp, szp, name); } dmu_tx_commit(tx); @@ -4153,7 +4210,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, pgsin_a = MIN(*rahead, pgsin_a); } - vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, + error = vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, MIN(end, obj_size) - start); /* @@ -4162,8 +4219,10 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. 
*/ - error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, - &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + if (error == 0) { + error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, + &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + } if (lr != NULL) zfs_rangelock_exit(lr); @@ -4292,7 +4351,9 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, goto out; } - vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, len); + if (vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, len) != 0) { + goto out; + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 4f8278feb1..3df8848172 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -730,7 +730,9 @@ zvol_geom_bio_strategy(struct bio *bp) if (bp->bio_cmd == BIO_DELETE) { /* Should we account only for a single metadata write? */ - vfs_ratelimit_metadata_write(zv->zv_objset); + error = vfs_ratelimit_metadata_write(zv->zv_objset); + if (error != 0) + goto unlock; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { @@ -747,29 +749,29 @@ zvol_geom_bio_strategy(struct bio *bp) while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { - vfs_ratelimit_data_read(zv->zv_objset, + error = vfs_ratelimit_data_read(zv->zv_objset, zv->zv_volblocksize, size); + if (error != 0) + break; error = dmu_read(os, ZVOL_OBJ, off, size, addr, DMU_READ_PREFETCH); + if (error != 0) + break; } else { - vfs_ratelimit_data_write(zv->zv_objset, + error = vfs_ratelimit_data_write(zv->zv_objset, zv->zv_volblocksize, size); + if (error != 0) + break; dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (error != 0) { dmu_tx_abort(tx); - } else { - dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size, commit); - dmu_tx_commit(tx); + break; } - } - if (error) { - /* Convert checksum errors into IO errors. */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, commit); + dmu_tx_commit(tx); } off += size; addr += size; @@ -779,7 +781,12 @@ unlock: zfs_rangelock_exit(lr); bp->bio_completed = bp->bio_length - resid; - if (bp->bio_completed < bp->bio_length && off > volsize) + if (error == EINTR && bp->bio_completed > 0) + error = 0; + /* Convert checksum errors into IO errors. 
*/ + else if (error == ECKSUM) + error = SET_ERROR(EIO); + if (error == 0 && bp->bio_completed < bp->bio_length && off > volsize) error = SET_ERROR(EINVAL); switch (bp->bio_cmd) { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 4f3d3eea1b..31acb89bc5 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -683,7 +683,11 @@ top: goto out; } - vfs_ratelimit_metadata_write(os); + error = vfs_ratelimit_metadata_write(os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + goto out; + } tx = dmu_tx_create(os); @@ -879,7 +883,11 @@ top: goto out; } - vfs_ratelimit_metadata_write(os); + error = vfs_ratelimit_metadata_write(os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + goto out; + } tx = dmu_tx_create(os); @@ -1012,6 +1020,11 @@ top: goto out; } + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } + mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX); @@ -1290,7 +1303,13 @@ top: return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + zfs_exit(zfsvfs, FTAG); + return (error); + } /* * Add a new entry to the directory. @@ -1434,7 +1453,10 @@ top: goto out; } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } /* * Grab a lock on the directory to make sure that no one is @@ -1535,6 +1557,7 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ + size_t nbytes; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); @@ -1553,6 +1576,21 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; + nbytes = 0; + + /* + * Calling vfs_ratelimit_data_read() for each directory entry would be + * way too expensive, so we do the following instead: + * we charge here up front for a single block only. If there is a lot + * of traffic, we wait before any reading is issued. Once all the + * directory entries have been read, we charge the process for the + * rest, since only then do we know exactly how much data was + * read. + */ + error = vfs_ratelimit_data_read(os, zp->z_blksz, zp->z_blksz); + if (error != 0) { + goto out; + } /* * Initialize the iterator cursor. @@ -1645,18 +1683,21 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) offset += 1; } ctx->pos = offset; + /* + * TODO: We should also add the size of the dirent structure here. + */ + nbytes += strlen(zap.za_name); } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ -#ifdef TODO /* - * This is post factum, but if we would do that inside the loop we - * wouldn't know the record length before reading it anyway plus we - * would be calling vfs_ratelimit_data_read() way too often and each - * call accounts for a single operation. + * Charge the process for the rest if more than a single block was + * read. */ - vfs_ratelimit_data_read(os, zp->z_blksz, size /* ???
*/); -#endif + if (error == 0 && nbytes > zp->z_blksz) { + error = vfs_ratelimit_data_read(os, zp->z_blksz, + nbytes - zp->z_blksz); + } update: zap_cursor_fini(&zc); @@ -1697,7 +1738,11 @@ zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } mutex_enter(&zp->z_lock); @@ -2298,7 +2343,10 @@ top: } } - vfs_ratelimit_metadata_write(os); + err = vfs_ratelimit_metadata_write(os); + if (err != 0) { + goto out2; + } tx = dmu_tx_create(os); @@ -3012,7 +3060,10 @@ top: } } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); @@ -3328,7 +3379,13 @@ top: return (SET_ERROR(EDQUOT)); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + zfs_exit(zfsvfs, FTAG); + return (error); + } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; @@ -3438,7 +3495,11 @@ zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - vfs_ratelimit_metadata_read(zfsvfs->z_os); + error = vfs_ratelimit_metadata_read(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } mutex_enter(&zp->z_lock); if (zp->z_is_sa) @@ -3577,7 +3638,11 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, return (error); } - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } top: /* @@ -3820,6 +3885,13 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, return (0); } + if (vfs_ratelimit_data_write(zfsvfs->z_os, zp->z_blksz, pglen) != 0) { + unlock_page(pp); + zfs_rangelock_exit(lr); + zfs_exit(zfsvfs, FTAG); + return (0); + } + /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. @@ -3947,7 +4019,10 @@ zfs_dirty_inode(struct inode *ip, int flags) } #endif - vfs_ratelimit_metadata_write(zfsvfs->z_os); + error = vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (error != 0) { + goto out; + } tx = dmu_tx_create(zfsvfs->z_os); @@ -3994,7 +4069,6 @@ zfs_inactive(struct inode *ip) znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; - int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. 
rollback */ @@ -4009,28 +4083,30 @@ zfs_inactive(struct inode *ip) } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { - vfs_ratelimit_metadata_write(zfsvfs->z_os); + if (vfs_ratelimit_metadata_write(zfsvfs->z_os) != 0) { + goto out; + } dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); - } else { - inode_timespec_t tmp_atime; - tmp_atime = zpl_inode_get_atime(ip); - ZFS_TIME_ENCODE(&tmp_atime, atime); - mutex_enter(&zp->z_lock); - (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), - (void *)&atime, sizeof (atime), tx); - zp->z_atime_dirty = B_FALSE; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); + goto out; } - } + inode_timespec_t tmp_atime; + tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&atime, sizeof (atime), tx); + zp->z_atime_dirty = B_FALSE; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } +out: zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); @@ -4046,6 +4122,7 @@ zfs_fillpage(struct inode *ip, struct page *pp) loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; + int error; ASSERT3U(io_off, <, i_size); @@ -4055,12 +4132,10 @@ zfs_fillpage(struct inode *ip, struct page *pp) - vfs_ratelimit_data_read(zfsvfs->z_os, PAGESIZE, io_len); - void *va = kmap(pp); - int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); kunmap(pp); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); @@ -4097,7 +4174,9 @@ zfs_getpage(struct inode *ip, struct page *pp) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - error = zfs_fillpage(ip, pp); + error = vfs_ratelimit_data_read(zfsvfs->z_os, 0, PAGE_SIZE); + if (error == 0) + error = zfs_fillpage(ip, pp); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 5a19f3e579..b65064e3cc 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -297,8 +297,14 @@ zvol_write(zv_request_t *zvr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; - vfs_ratelimit_data_write(zv->zv_objset, zv->zv_volblocksize, - bytes); + error = vfs_ratelimit_data_write(zv->zv_objset, + zv->zv_volblocksize, bytes); + if (error != 0) { + /* XXX-PJD Is it safe to reset the error? */ + if (error == EINTR && uio.uio_resid < start_resid) + error = 0; + break; + } dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); @@ -400,7 +406,11 @@ zvol_discard(zv_request_t *zvr) start, size, RL_WRITER); /* Should we account only for a single metadata write?
*/ - vfs_ratelimit_metadata_write(zv->zv_objset); + error = vfs_ratelimit_metadata_write(zv->zv_objset); + if (error != 0) { + zfs_rangelock_exit(lr); + goto unlock; + } tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); @@ -483,8 +493,14 @@ zvol_read(zv_request_t *zvr) if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; - vfs_ratelimit_data_read(zv->zv_objset, zv->zv_volblocksize, - bytes); + error = vfs_ratelimit_data_read(zv->zv_objset, + zv->zv_volblocksize, bytes); + if (error != 0) { + /* XXX-PJD Is it safe to reset the error? */ + if (error == EINTR && uio.uio_resid < start_resid) + error = 0; + break; + } error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 957d7c7c07..5a756b1049 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2205,7 +2205,11 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) ASSERT3U(drrw->drr_object, ==, rwa->last_object); - vfs_ratelimit_data_write(rwa->os, drrw->drr_logical_size, + /* + * vfs_ratelimit_data_write_spin() will sleep in short periods + * and return immediately when a signal is pending. + */ + vfs_ratelimit_data_write_spin(rwa->os, 0, drrw->drr_logical_size); if (drrw->drr_logical_size != dn->dn_datablksz) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 6a345b7dc6..92c40d25e7 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1631,6 +1631,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) struct srd *srdp = &range->sru.data; blkptr_t *bp = &srdp->bp; objset_t *os = srta->smta->os; + int error; ASSERT3U(range->type, ==, DATA); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); @@ -1685,11 +1686,15 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) .zb_blkid = range->start_blkid, }; - vfs_ratelimit_data_read(os, BP_GET_LSIZE(bp), BP_GET_LSIZE(bp)); + /* + * vfs_ratelimit_data_read_spin() will sleep in short periods and return + * immediately when a signal is pending. + */ + vfs_ratelimit_data_read_spin(os, 0, BP_GET_LSIZE(bp)); arc_flags_t aflags = ARC_FLAG_CACHED_ONLY; - int arc_err = arc_read(NULL, os->os_spa, bp, + error = arc_read(NULL, os->os_spa, bp, arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, &zb); /* @@ -1698,7 +1703,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) * entry to the ARC, and we also avoid polluting the ARC cache with * data that is not likely to be used in the future. 
*/ - if (arc_err != 0) { + if (error != 0) { srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE); srdp->io_outstanding = B_TRUE; zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd, @@ -2555,8 +2560,9 @@ dmu_send_impl(struct dmu_send_params *dspp) while (err == 0 && !range->eos_marker) { err = do_dump(&dsc, range); range = get_next_range(&srt_arg->q, range); - if (issig()) + if (issig()) { err = SET_ERROR(EINTR); + } } /* diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 4f7252e2c3..a5d66ac18c 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -371,6 +371,10 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_snap_cmtime = t; } + if (dd->dd_myname[0] != '$') { + dsl_dir_ratelimit_read(dd); + } + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, &dd->dd_dbuf); winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); @@ -380,6 +384,7 @@ if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + vfs_ratelimit_free(dd->dd_ratelimit); cv_destroy(&dd->dd_activity_cv); mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); @@ -2036,7 +2041,6 @@ dsl_dir_ratelimit_recurse(dsl_dir_t *dd) ASSERT(child_dd->dd_ratelimit == NULL); child_dd->dd_ratelimit_root = dd->dd_ratelimit_root; - dsl_dir_ratelimit_recurse(child_dd); dsl_dir_rele(child_dd, FTAG); @@ -2320,7 +2324,7 @@ dsl_dir_ratelimit_rename(dsl_dir_t *dd, dsl_dir_t *newparent) if (dd->dd_ratelimit_root != dd) { ASSERT(dd->dd_ratelimit == NULL); - dd->dd_ratelimit_root = newparent; + dd->dd_ratelimit_root = newparent->dd_ratelimit_root; dsl_dir_ratelimit_recurse(dd); } diff --git a/module/zfs/vfs_ratelimit.c b/module/zfs/vfs_ratelimit.c index 4e6493b136..18f3b09d07 100644 --- a/module/zfs/vfs_ratelimit.c +++ b/module/zfs/vfs_ratelimit.c @@ -54,7 +54,7 @@ * - It would be hard to predict what limits should be configured as there are a * lot of factors that dictate how much disk bandwidth is really required * (due to RAIDZ inflation, compression, gang blocks, deduplication, - * NOP writes, I/O aggregation, metadata traffic, etc.). + * block cloning, NOP writes, I/O aggregation, metadata traffic, etc.). * By enforcing the limits at the VFS level for file system operations it should * be easy to find out what limits applications require and verify that the * limits are correctly enforced by monitoring system calls issued by the @@ -76,20 +76,20 @@ * We walk down the dataset tree and set dd_ratelimit_root field to point to * this dsl_dir until we find dsl_dir that also has the vfs_ratelimit structure * already attached to it (which means it has its own limits configured). - * During the accounting it allows us for quick access to the ratelimit + * During the accounting it allows us to quickly access the ratelimit * structure we need by just going to ds_dir->dd_ratelimit_root; - * If ratelimits are not configured on this dataset or any of its parents, + * If ratelimits are configured neither on this dataset nor on its ancestors, * the ds_dir->dd_ratelimit_root will be set to NULL, so we know we don't * have to do any accounting. * - * The limits are configured per second, but we divde the second and the limits - * into RATELIMIT_RESOLUTION slots (10 by default). This is to avoid a choking - * effect, when process is doing progress in 1s steps. For example if we have + * The limits are configured per second, but we divide the second and the limits + * into RATELIMIT_RESOLUTION slots (16 by default). This is to avoid a choking + * effect, when a process makes progress in 1s steps. For example if we have * read bandwidth limits configured to 100MB/s and the process is trying to * read 130MB, it will take 1.3 seconds, not 2 seconds. - * Not that very low limits may be rounded up - 7 ops/s limit will be rounded - * up to 10 ops/s, so each slot is assigned 1 op/s limit. This rounding up - * is done in the kernel and isn't shown in the properties value. + * Note that very low limits may be rounded up - a 7 ops/s limit will be rounded + * up to 16 ops/s, so each time slot is assigned a 1 op/s limit. This rounding up + * is done in the kernel and isn't shown in the properties. * * How does the accounting work? * @@ -99,34 +99,31 @@ * and two operations total. Not all of those limits have to be configured or * some might be configured on a dataset and others on a parent dataset(s). * - * We remember those values in the rtslot structures at every level we have - * limits configured on. The rtslot strucuture also remembers the time of - * the request. For each ratelimit type (read bandwidth, total, operation read, - * operation total) and for each dataset with the limits configured when we walk - * the dataset tree up we find the point in time until which we have to wait to - * satisfy configured limit. We select the furthest point in time and we do to - * sleep. If the request doesn't exceed any limits, we just do the accounting - * and allow for the request to be executed immediately. + * For each type we use two fields to track the wait times: rl_timeslot and + * rl_remainder. rl_timeslot holds the point in time up to which the last + * process is waiting. If rl_timeslot is lower than the current time, it + * means that no processes are waiting. rl_remainder is the amount of data + * modulo the limit. For example, if we have a read bandwidth limit of 64MB/s, + * that is 4MB per 1/16s slot. A process tries to read 11MB. This would + * give us rl_timeslot = now + 2 (we account for 2 full time slots of 1/16s) + * and rl_remainder = 3MB. This process has to sleep for 2/16s. If another + * process immediately tries to read 1MB, the 1MB is added to the current + * rl_remainder, giving 4MB - a full slot's worth. Now rl_timeslot will be + * set to now + 3 and rl_remainder to 0. This second process is going to + * sleep for 3/16s. */
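To make the slot arithmetic described above concrete, here is a minimal userland sketch of a single accounting step. RATELIMIT_RESOLUTION and the timeslot/remainder roles mirror the patch; the struct and function names are illustrative only, not part of the change:

#include <stdint.h>
#include <stdio.h>

/* Per-type accounting state, mirroring rl_timeslot/rl_remainder. */
struct slot_state {
	uint64_t timeslot;	/* slot the last waiter sleeps until */
	uint64_t remainder;	/* data already charged to that slot */
};

/* Charge 'count' against a per-slot limit; return the slot to wait for. */
static uint64_t
charge(struct slot_state *ss, uint64_t now, uint64_t limit_per_slot,
    uint64_t count)
{
	if (ss->timeslot < now) {
		ss->remainder = 0;
		ss->timeslot = now;
	} else {
		count += ss->remainder;
	}
	ss->timeslot += count / limit_per_slot;
	ss->remainder = count % limit_per_slot;
	return (ss->timeslot);
}

int
main(void)
{
	struct slot_state ss = { 0, 0 };
	uint64_t mb = 1024 * 1024, now = 100;

	/* 64MB/s limit is 4MB per 1/16s slot; 11MB waits until now + 2. */
	printf("%ju\n", (uintmax_t)charge(&ss, now, 4 * mb, 11 * mb));
	/* 1MB right after fills the 3MB remainder; waits until now + 3. */
	printf("%ju\n", (uintmax_t)charge(&ss, now, 4 * mb, 1 * mb));
	return (0);
}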
/* * Number of slots we divide one second into. More granularity is better for - * interactivity, but it takes more memory and more calculations. + * interactivity, but for small limits we may lose some precision. */ #define RATELIMIT_RESOLUTION 16 struct vfs_ratelimit { kmutex_t rl_lock; uint64_t rl_limits[ZFS_RATELIMIT_NTYPES]; - /* List of current waiters and past activity. */ - list_t rl_list; -}; - -struct rtslot { - list_node_t rts_node; - hrtime_t rts_timeslot; - int rts_types; - uint64_t rts_counts[ZFS_RATELIMIT_NTYPES]; + uint64_t rl_timeslot[ZFS_RATELIMIT_NTYPES]; + uint64_t rl_remainder[ZFS_RATELIMIT_NTYPES]; }; int
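The slot clock that drives this accounting, gettimeslot() below, expresses wall time in 1/RATELIMIT_RESOLUTION second units. A hypothetical userland equivalent of the same conversion, for reference only:

#include <stdint.h>
#include <time.h>

#define RATELIMIT_RESOLUTION	16
#define NANOSEC			1000000000LL

/* Wall-clock time in 1/16s slots, same arithmetic as gettimeslot(). */
static int64_t
slot_now(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_REALTIME, &ts);
	return ((int64_t)ts.tv_sec * RATELIMIT_RESOLUTION +
	    ts.tv_nsec / (NANOSEC / RATELIMIT_RESOLUTION));
}

int
main(void)
{
	return (slot_now() > 0 ? 0 : 1);
}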
@@ -197,13 +194,6 @@ vfs_ratelimit_alloc(const uint64_t *limits) rl = kmem_zalloc(sizeof (*rl), KM_SLEEP); mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&rl->rl_list, sizeof (struct rtslot), - offsetof(struct rtslot, rts_node)); - /* Create two slots for a good start. */ - for (i = 0; i < 2; i++) { - list_insert_tail(&rl->rl_list, - kmem_zalloc(sizeof (struct rtslot), KM_SLEEP)); - } if (limits != NULL) { for (i = ZFS_RATELIMIT_FIRST; i < ZFS_RATELIMIT_NTYPES; i++) { @@ -227,17 +217,11 @@ void vfs_ratelimit_free(struct vfs_ratelimit *rl) { - struct rtslot *slot; if (rl == NULL) { return; } - while ((slot = list_remove_head(&rl->rl_list)) != NULL) { - kmem_free(slot, sizeof (*slot)); - } - list_destroy(&rl->rl_list); - mutex_destroy(&rl->rl_lock); kmem_free(rl, sizeof (*rl)); @@ -278,28 +262,24 @@ static __inline hrtime_t gettimeslot(void) { inode_timespec_t ts; - hrtime_t nsec; gethrestime(&ts); - nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; - return (nsec / (NANOSEC / RATELIMIT_RESOLUTION)); + + return (((hrtime_t)ts.tv_sec * RATELIMIT_RESOLUTION) + + ts.tv_nsec / (NANOSEC / RATELIMIT_RESOLUTION)); } /* - * Returns bit mask of the types configured for the given ratelimit structure. + * Returns a bit mask of the types that have a non-zero count in the given array. */ static int -ratelimit_types(const struct vfs_ratelimit *rl) +ratelimit_types(const uint64_t *counts) { int types, type; - if (rl == NULL) { - return (0); - } - types = 0; for (type = ZFS_RATELIMIT_FIRST; type <= ZFS_RATELIMIT_LAST; type++) { - if (rl->rl_limits[type] > 0) { + if (counts[type] > 0) { types |= (1 << type); } } @@ -318,7 +298,6 @@ static dsl_dir_t * ratelimit_first(objset_t *os, int types) { dsl_dir_t *dd; - int mytypes; ASSERT(RRM_READ_HELD(&os->os_spa->spa_ratelimit_lock)); @@ -327,13 +306,17 @@ ratelimit_first(objset_t *os, int types) if (dd == NULL) { return (NULL); } - mytypes = ratelimit_types(dd->dd_ratelimit); - if ((mytypes & types) != 0) { - /* - * This dataset has at last one limit we are - * interested in. - */ - return (dd); + if (dd->dd_ratelimit != NULL) { + int mytypes; + + mytypes = ratelimit_types(dd->dd_ratelimit->rl_limits); + if ((mytypes & types) != 0) { + /* + * This dataset has at least one limit we are + * interested in. + */ + return (dd); + } } if (dd->dd_parent == NULL) { return (NULL); } @@ -351,8 +334,6 @@ static dsl_dir_t * ratelimit_parent(dsl_dir_t *dd, int types) { - int mytypes; - ASSERT(RRM_READ_HELD(&dd->dd_pool->dp_spa->spa_ratelimit_lock)); for (;;) { @@ -363,154 +344,63 @@ ratelimit_parent(dsl_dir_t *dd, int types) if (dd == NULL) { return (NULL); } - mytypes = ratelimit_types(dd->dd_ratelimit); - if ((mytypes & types) != 0) { - /* - * This dataset has at last one limit we are - * interested in. - */ - return (dd); + if (dd->dd_ratelimit != NULL) { + int mytypes; + + mytypes = ratelimit_types(dd->dd_ratelimit->rl_limits); + if ((mytypes & types) != 0) { + /* + * This dataset has at least one limit we are + * interested in. + */ + return (dd); + } } } } -/* - * If we have any entries with 'timeslot > now' we also must have an entry with - * 'timeslot == now'. In other words if there is no entry with - * 'timeslot == now', it means that all the entires expired. - * - * We return either the most recent entry related to the given type or we return - * 'timeslot == now' entry not related to the given type and we will use it to - * store accouting information about this type as well.
- */ -static struct rtslot * -ratelimit_find(struct vfs_ratelimit *rl, int typebit, hrtime_t now) -{ - struct rtslot *slot; - - ASSERT(MUTEX_HELD(&rl->rl_lock)); - - for (slot = list_head(&rl->rl_list); slot != NULL; - slot = list_next(&rl->rl_list, slot)) { - if (slot->rts_timeslot < now) { - break; - } - if ((slot->rts_types & typebit) != 0 || - slot->rts_timeslot == now) { - return (slot); - } - } - /* All the entries expired. */ -#ifndef NDEBUG - for (slot = list_head(&rl->rl_list); slot != NULL; - slot = list_next(&rl->rl_list, slot)) { - ASSERT(slot->rts_timeslot < now); - } -#endif - - return (NULL); -} - /* * Account for our request across all the types configured in this ratelimit * structure. * Return a timeslot we should wait for or now if we can execute the request * without waiting (we are within limits). */ -static uint64_t -ratelimit_account(struct vfs_ratelimit *rl, int types, hrtime_t now, +static hrtime_t +ratelimit_account(struct vfs_ratelimit *rl, hrtime_t now, const uint64_t *counts) { - uint64_t timeslot; - int type, typebit; + hrtime_t timeslot; + int type; - timeslot = 0; + timeslot = now; mutex_enter(&rl->rl_lock); for (type = ZFS_RATELIMIT_FIRST; type <= ZFS_RATELIMIT_LAST; type++) { - struct rtslot *slot; - uint64_t count, nexttimeslot; + uint64_t count; - typebit = (1 << type); - - if ((types & typebit) == 0) { - /* Not interested in this type. */ - continue; - } if (rl->rl_limits[type] == 0) { /* This type has no limit configured on this dataset. */ continue; } count = counts[type]; - ASSERT(count > 0); - - slot = ratelimit_find(rl, typebit, now); - if (slot == NULL) { - slot = list_remove_tail(&rl->rl_list); - ASSERT(slot->rts_timeslot < now); - slot->rts_types = typebit; - slot->rts_timeslot = now; - memset(slot->rts_counts, 0, sizeof (slot->rts_counts)); - list_insert_head(&rl->rl_list, slot); - } else if (slot->rts_timeslot == now) { - /* The 'now' slot may not have our type yet. */ - slot->rts_types |= typebit; - } - ASSERT((slot->rts_types & typebit) != 0); - nexttimeslot = slot->rts_timeslot + 1; - - for (;;) { - if (slot->rts_counts[type] + count <= - rl->rl_limits[type]) { - slot->rts_counts[type] += count; - break; - } - - /* - * This request is too big to fit into a single slot, - * ie. a single request exceeds the limit or this and - * the previous requests exceed the limit. - */ - - /* - * Fit as much as we can into the current slot. - */ - count -= rl->rl_limits[type] - slot->rts_counts[type]; - slot->rts_counts[type] = rl->rl_limits[type]; - - /* - * Take the next slot (if already exists isn't aware of - * our type yet), take an expired slot from the tail of - * the list or allocate a new slot. - */ - slot = list_prev(&rl->rl_list, slot); - if (slot != NULL) { - ASSERT((slot->rts_types & typebit) == 0); - ASSERT(slot->rts_timeslot == nexttimeslot); - ASSERT0(slot->rts_counts[type]); - - slot->rts_types |= typebit; - } else { - slot = list_tail(&rl->rl_list); - if (slot->rts_timeslot < now) { - list_remove(&rl->rl_list, slot); - } else { - slot = kmem_alloc(sizeof (*slot), - KM_SLEEP); - } - slot->rts_types = typebit; - slot->rts_timeslot = nexttimeslot; - memset(slot->rts_counts, 0, - sizeof (slot->rts_counts)); - list_insert_head(&rl->rl_list, slot); - } - - nexttimeslot++; + if (count == 0) { + /* Not interested in this type. 
*/ + continue; + } + if (rl->rl_timeslot[type] < now) { + rl->rl_remainder[type] = 0; + rl->rl_timeslot[type] = now; + } else { + count += rl->rl_remainder[type]; + } + + rl->rl_timeslot[type] += count / rl->rl_limits[type]; + rl->rl_remainder[type] = count % rl->rl_limits[type]; + + if (timeslot < rl->rl_timeslot[type]) { + timeslot = rl->rl_timeslot[type]; } } @@ -519,106 +409,173 @@ ratelimit_account(struct vfs_ratelimit *rl, int types, hrtime_t now, return (timeslot); } -static void -vfs_ratelimit(objset_t *os, int types, const uint64_t *counts) +static hrtime_t +ratelimit_account_all(objset_t *os, const uint64_t *counts) { dsl_dir_t *dd; hrtime_t now, timeslot; + int types; + + ASSERT(RRM_READ_HELD(&os->os_spa->spa_ratelimit_lock)); + + types = ratelimit_types(counts); + now = timeslot = gettimeslot(); + + for (dd = ratelimit_first(os, types); dd != NULL; + dd = ratelimit_parent(dd, types)) { + hrtime_t ts; + + ts = ratelimit_account(dd->dd_ratelimit, now, counts); + if (ts > timeslot) { + timeslot = ts; + } + } + + return (timeslot); +} + +static int +ratelimit_sleep(hrtime_t timeslot) +{ + hrtime_t now; + int error = 0; now = gettimeslot(); - timeslot = 0; + + if (timeslot > now) { + /* + * Too much traffic, slow it down. + */ +#ifdef _KERNEL + if (delay_sig((hz / RATELIMIT_RESOLUTION) * (timeslot - now))) { + error = EINTR; + } +#else + delay((hz / RATELIMIT_RESOLUTION) * (timeslot - now)); +#endif + } + + return (error); +} + +static int +vfs_ratelimit_sleep(objset_t *os, const uint64_t *counts) +{ + hrtime_t timeslot; /* * Prevents configuration changes when we have requests in-flight. */ rrm_enter_read(&os->os_spa->spa_ratelimit_lock, FTAG); - for (dd = ratelimit_first(os, types); dd != NULL; - dd = ratelimit_parent(dd, types)) { - hrtime_t ts; - - ts = ratelimit_account(dd->dd_ratelimit, types, now, counts); - if (ts > timeslot) { - timeslot = ts; - } - } + timeslot = ratelimit_account_all(os, counts); rrm_exit(&os->os_spa->spa_ratelimit_lock, FTAG); - if (timeslot > now) { - /* - * Too much traffic, slow it down. - */ - delay((hz / RATELIMIT_RESOLUTION) * (timeslot - now)); - } + return (ratelimit_sleep(timeslot)); }
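ratelimit_sleep() above leans on the delay_sig() additions from the platform headers (pause_sig() on FreeBSD, schedule_timeout_interruptible() on Linux); in both implementations it returns nonzero when the sleep was cut short by a signal. A hypothetical userland stand-in with the same contract:

#include <errno.h>
#include <time.h>

/*
 * Sleep for the given interval and report whether a signal interrupted
 * the sleep, as the kernel delay_sig() macros do.
 */
static int
delay_sig_sketch(const struct timespec *ts)
{
	struct timespec rem;

	if (nanosleep(ts, &rem) == -1 && errno == EINTR)
		return (1);	/* interrupted; callers map this to EINTR */
	return (0);		/* the full interval elapsed */
}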
/* * For every data read we charge: * - bytes of read bandwidth * - bytes of total bandwidth - * - (bytes - 1) / blocksize + 1 of read operations - * - (bytes - 1) / blocksize + 1 of total operations + * - (bytes + blocksize - 1) / blocksize of read operations + * - (bytes + blocksize - 1) / blocksize of total operations */ -void +int vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; + size_t operations; if (bytes == 0) { - return; + return (0); } if (blocksize == 0) { blocksize = bytes; } - - types = (1 << ZFS_RATELIMIT_BW_READ); - types |= (1 << ZFS_RATELIMIT_BW_TOTAL); - types |= (1 << ZFS_RATELIMIT_OP_READ); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); + operations = (bytes + blocksize - 1) / blocksize; memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_BW_READ] = bytes; counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; - counts[ZFS_RATELIMIT_OP_READ] = (bytes - 1) / blocksize + 1; - counts[ZFS_RATELIMIT_OP_TOTAL] = (bytes - 1) / blocksize + 1; + counts[ZFS_RATELIMIT_OP_READ] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); } /* * For every data write we charge: * - bytes of write bandwidth * - bytes of total bandwidth - * - (bytes - 1) / blocksize + 1 of write operations - * - (bytes - 1) / blocksize + 1 of total operations + * - (bytes + blocksize - 1) / blocksize of write operations + * - (bytes + blocksize - 1) / blocksize of total operations */ -void +int vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; + size_t operations; if (bytes == 0) { - return; + return (0); } if (blocksize == 0) { blocksize = bytes; } - - types = (1 << ZFS_RATELIMIT_BW_WRITE); - types |= (1 << ZFS_RATELIMIT_BW_TOTAL); - types |= (1 << ZFS_RATELIMIT_OP_WRITE); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); + operations = (bytes + blocksize - 1) / blocksize; memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_BW_WRITE] = bytes; counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; - counts[ZFS_RATELIMIT_OP_WRITE] = (bytes - 1) / blocksize + 1; - counts[ZFS_RATELIMIT_OP_TOTAL] = (bytes - 1) / blocksize + 1; + counts[ZFS_RATELIMIT_OP_WRITE] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); +}
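With the helpers now returning an error, callers that have already made partial progress are expected to swallow EINTR and return a short read or write, as zfs_read(), zfs_write() and the zvol paths do later in this patch. A condensed, hypothetical sketch of that pattern (the function and its parameters are illustrative):

/*
 * Illustrative only: how a chunked I/O loop consumes the int return
 * value of the ratelimit functions (modeled on zfs_read() below).
 */
static int
chunked_read_sketch(objset_t *os, size_t blksz, uint64_t start_resid)
{
	uint64_t n = start_resid;
	int error = 0;

	while (n > 0) {
		size_t nbytes = MIN(n, blksz);

		error = vfs_ratelimit_data_read(os, blksz, nbytes);
		if (error != 0) {
			/* Partial progress turns EINTR into a short read. */
			if (error == EINTR && n < start_resid)
				error = 0;
			break;
		}
		/* ... issue the actual read of 'nbytes' here ... */
		n -= nbytes;
	}
	return (error);
}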
+ +int +vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize, + size_t bytes) +{ + uint64_t counts[ZFS_RATELIMIT_NTYPES]; + size_t operations; + hrtime_t dstts, srcts; + spa_t *spa = srcos->os_spa; + + if (bytes == 0) { + return (0); + } + if (blocksize == 0) { + blocksize = bytes; + } + operations = (bytes + blocksize - 1) / blocksize; + + /* + * Prevents configuration changes when we have requests in-flight. + */ + rrm_enter_read(&spa->spa_ratelimit_lock, FTAG); + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_READ] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_READ] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + srcts = ratelimit_account_all(srcos, counts); + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_WRITE] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_WRITE] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + dstts = ratelimit_account_all(dstos, counts); + + rrm_exit(&spa->spa_ratelimit_lock, FTAG); + + return (ratelimit_sleep(dstts > srcts ? dstts : srcts)); } /* @@ -626,20 +583,16 @@ vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes) * - one read operation * - one total operation */ -void +int vfs_ratelimit_metadata_read(objset_t *os) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; - - types = (1 << ZFS_RATELIMIT_OP_READ); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_OP_READ] = 1; counts[ZFS_RATELIMIT_OP_TOTAL] = 1; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); } /* @@ -647,18 +600,89 @@ vfs_ratelimit_metadata_read(objset_t *os) - * - one read operation + * - one write operation * - one total operation */ -void +int vfs_ratelimit_metadata_write(objset_t *os) { uint64_t counts[ZFS_RATELIMIT_NTYPES]; - unsigned int types; - - types = (1 << ZFS_RATELIMIT_OP_WRITE); - types |= (1 << ZFS_RATELIMIT_OP_TOTAL); memset(counts, 0, sizeof (counts)); counts[ZFS_RATELIMIT_OP_WRITE] = 1; counts[ZFS_RATELIMIT_OP_TOTAL] = 1; - vfs_ratelimit(os, types, counts); + return (vfs_ratelimit_sleep(os, counts)); +}
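For the send/recv paths, a pending signal must break the wait even when it is delivered to a different thread, hence the polling in ratelimit_spin() below. A rough userland analogue of that strategy, short sleeps with an explicit signal check instead of one long interruptible sleep (all names here are illustrative):

#include <signal.h>
#include <time.h>

static volatile sig_atomic_t got_signal;

static void
on_sigint(int sig)
{
	(void) sig;
	got_signal = 1;
}

/* Poll in 1/16s steps instead of one long interruptible sleep. */
static void
spin_until(time_t deadline)
{
	struct timespec tick = { 0, 1000000000L / 16 };

	while (time(NULL) < deadline && !got_signal)
		(void) nanosleep(&tick, NULL);
}

int
main(void)
{
	(void) signal(SIGINT, on_sigint);
	spin_until(time(NULL) + 5);
	return (got_signal ? 1 : 0);
}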
+ +/* + * Spins until the timeout is reached or the process receives a signal. + * This function is different from ratelimit_sleep(), because pause_sig() + * might not be woken up by a signal if the process has multiple threads. + * We use the *_spin() functions for zfs send/recv, where the kernel starts + * additional kernel threads; interrupting the userland process with CTRL+C + * (SIGINT) doesn't interrupt a pause_sig() waiting in another kernel thread. + */ +static void +ratelimit_spin(objset_t *os, const uint64_t *counts) +{ + hrtime_t timeslot; + + /* + * Prevents configuration changes when we have requests in-flight. + */ + rrm_enter_read(&os->os_spa->spa_ratelimit_lock, FTAG); + + timeslot = ratelimit_account_all(os, counts); + + rrm_exit(&os->os_spa->spa_ratelimit_lock, FTAG); + + while (timeslot > gettimeslot() && !issig()) { + delay(hz / RATELIMIT_RESOLUTION); + } +} + +void +vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize, size_t bytes) +{ + uint64_t counts[ZFS_RATELIMIT_NTYPES]; + size_t operations; + + if (bytes == 0) { + return; + } + + if (blocksize == 0) { + blocksize = bytes; + } + operations = (bytes + blocksize - 1) / blocksize; + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_READ] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_READ] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + ratelimit_spin(os, counts); +} + +void +vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize, size_t bytes) +{ + uint64_t counts[ZFS_RATELIMIT_NTYPES]; + size_t operations; + + if (bytes == 0) { + return; + } + + if (blocksize == 0) { + blocksize = bytes; + } + operations = (bytes + blocksize - 1) / blocksize; + + memset(counts, 0, sizeof (counts)); + counts[ZFS_RATELIMIT_BW_WRITE] = bytes; + counts[ZFS_RATELIMIT_BW_TOTAL] = bytes; + counts[ZFS_RATELIMIT_OP_WRITE] = operations; + counts[ZFS_RATELIMIT_OP_TOTAL] = operations; + + ratelimit_spin(os, counts); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8a50fe40e7..56c3a43169 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -299,7 +299,14 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); - vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, nbytes); + error = vfs_ratelimit_data_read(zfsvfs->z_os, zp->z_blksz, + nbytes); + if (error != 0) { + if (error == EINTR && n < start_resid) { + error = 0; + } + break; + } #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) @@ -614,7 +621,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } } - vfs_ratelimit_data_write(zfsvfs->z_os, blksz, nbytes); + error = vfs_ratelimit_data_write(zfsvfs->z_os, blksz, nbytes); + if (error != 0) { + if (error == EINTR && n < start_resid) { + error = 0; + } + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } /* * Start a transaction. @@ -1315,8 +1330,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } - vfs_ratelimit_data_read(inos, inblksz, size); - vfs_ratelimit_data_write(outos, inblksz, size); + error = vfs_ratelimit_data_copy(inos, outos, inblksz, size); + if (error != 0) { + break; + } nbps = maxblocks; last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); diff --git a/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh b/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh index cdaa03efd4..957f4c0e22 100755 --- a/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh +++ b/tests/zfs-tests/tests/functional/ratelimit/filesystem_op_single.ksh @@ -90,15 +90,7 @@ log_must ratelimit_filesystem_op_single unlink limit_op_write=none 1024 1 "$TESTDIR/file" # Operations total limits limit writing.
log_must ratelimit_filesystem_op_single chmod limit_op_total=128 512 4 "$TESTDIR/file" log_must ratelimit_filesystem_op_single chown limit_op_total=64 512 8 "$TESTDIR/file" -# Creating a file requires one metadata write and one metadata read operation. -# On successful open(2), zfs_freebsd_open() calls vnode_create_vobject() -# with size=0. If size=0, vnode_create_vobject() interprets this as not having -# the proper size and calls VOP_GETATTR(). -if is_freebsd; then - log_must ratelimit_filesystem_op_single create limit_op_total=128 512 8 "$TESTDIR/file" -else - log_must ratelimit_filesystem_op_single create limit_op_total=128 512 4 "$TESTDIR/file" -fi +log_must ratelimit_filesystem_op_single create limit_op_total=128 512 4 "$TESTDIR/file" log_must ratelimit_filesystem_op_single unlink limit_op_total=64 512 8 "$TESTDIR/file" log_must ratelimit_filesystem_op_single mkdir limit_op_total=128 512 4 "$TESTDIR/file" log_must ratelimit_filesystem_op_single rmdir limit_op_total=64 512 8 "$TESTDIR/file" @@ -122,11 +114,7 @@ log_must ratelimit_filesystem_op_single unlink limit_op_total=none 1024 1 "$TEST # Operations read limits don't affect writing. log_must ratelimit_filesystem_op_single chmod limit_op_read=32 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single chown limit_op_read=64 1024 1 "$TESTDIR/file" -if is_freebsd; then - log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 8 "$TESTDIR/file" -else - log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 1 "$TESTDIR/file" -fi +log_must ratelimit_filesystem_op_single create limit_op_read=128 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single unlink limit_op_read=256 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single mkdir limit_op_read=32 1024 1 "$TESTDIR/file" log_must ratelimit_filesystem_op_single rmdir limit_op_read=64 1024 1 "$TESTDIR/file"