From 8bd3dca9bf3e9a4315d58be316bcfaf8e76c6a6a Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 11 Nov 2021 21:52:16 +0100 Subject: [PATCH] Introduce a tunable to exclude special class buffers from L2ARC Special allocation class or dedup vdevs may have roughly the same performance as L2ARC vdevs. Introduce a new tunable to exclude those buffers from being cacheable on L2ARC. Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #11761 Closes #12285 --- include/sys/arc.h | 1 + include/sys/dbuf.h | 11 +------ include/sys/dmu_objset.h | 4 --- man/man4/zfs.4 | 5 +++ module/zfs/arc.c | 12 +++++++ module/zfs/dbuf.c | 71 +++++++++++++++++++++++++++++++++++++--- module/zfs/dmu.c | 2 +- module/zfs/dmu_objset.c | 34 +++++++++++++++++-- 8 files changed, 119 insertions(+), 21 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index a3241f3685..5d8176894e 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -85,6 +85,7 @@ typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern int zfs_arc_average_blocksize; +extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 93d80066be..2e7385113e 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -441,16 +441,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(_db) \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (((_level) > 0 || \ - DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) 
+boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index e89ee64ea6..7ade2dc912 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -200,10 +200,6 @@ struct objset { #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_PROJECTUSED_DNODE(os) ((os)->os_projectused_dnode.dnh_dnode) -#define DMU_OS_IS_L2CACHEABLE(os) \ - ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ - (os)->os_secondary_cache == ZFS_CACHE_METADATA) - /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6495e9b4cd..c32dd4b1b2 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -109,6 +109,11 @@ A value of .Sy 100 disables this feature. . +.It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int +Controls whether buffers present on special vdevs are eligible for caching +into L2ARC. +If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. +. .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 215250ea6f..0ba366f185 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -877,6 +877,14 @@ static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) +/* + * l2arc_exclude_special : A zfs module parameter that controls whether buffers + * present on special vdevs are eligible for caching in L2ARC. If + * set to 1, exclude dbufs on special vdevs from being cached to + * L2ARC. 
+ */ +int l2arc_exclude_special = 0; + /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. @@ -11136,6 +11144,10 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, + "If set to 1 exclude dbufs on special vdevs from being cached to " + "L2ARC."); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e687d96501..1a022c8b8a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -53,6 +53,7 @@ #include #include #include +#include kstat_t *dbuf_ksp; @@ -594,6 +595,68 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } +/* + * We want to exclude buffers that are on a special allocation class from + * L2ARC. 
+ */ +boolean_t +dbuf_is_l2cacheable(dmu_buf_impl_t *db) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = db->db_objset->os_secondary_cache; + blkptr_t *bp = db->db_blkptr; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || + (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static inline boolean_t +dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || ((level > 0 || + DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && + cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * This function *must* return indices evenly distributed between all @@ -1523,7 +1586,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); - if (DBUF_IS_L2CACHEABLE(db)) + if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -3372,7 +3435,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if 
(dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* @@ -3390,7 +3453,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, @@ -4989,7 +5052,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4e7127bd1b..e38c9b452a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1846,7 +1846,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index b938089023..a8975797e8 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -63,6 +63,8 @@ #include #include #include "zfs_namecheck.h" +#include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -411,6 +413,34 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +static inline boolean_t +dmu_os_is_l2cacheable(objset_t *os) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = os->os_secondary_cache; + blkptr_t *bp = os->os_rootbp; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev 
= DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = os->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. @@ -453,7 +483,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - if (DMU_OS_IS_L2CACHEABLE(os)) + if (dmu_os_is_l2cacheable(os)) aflags |= ARC_FLAG_L2CACHE; if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { @@ -1661,7 +1691,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);