From 9d2e5c14b2f94c91aa389799bd9e80e1098263e7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 1 Mar 2023 18:27:40 -0500 Subject: [PATCH] System-wide speculative prefetch limit. With some pathological access patterns it is possible to make ZFS accumulate almost unlimited amount of speculative prefetch ZIOs. Combined with linear ABD allocations in RAIDZ code, it appears to be possible to exhaust system KVA, triggering kernel panic. Address this by introducing a system-wide counter of active prefetch requests and blocking prefetch distance doubling per stream hits if the number of active requests is higher that ~6% of ARC size. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14516 --- include/sys/arc_impl.h | 1 + module/zfs/dmu_zfetch.c | 29 ++++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 43818d4104..db6238fda6 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -30,6 +30,7 @@ #define _SYS_ARC_IMPL_H #include +#include #include #include #include diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index bca881d82f..d2985d5729 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -28,6 +28,7 @@ */ #include +#include #include #include #include @@ -65,13 +66,15 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; + kstat_named_t zfetchstat_io_active; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, - { "io_issued", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, + { "io_active", KSTAT_DATA_UINT64 }, }; struct { @@ -79,6 +82,7 @@ struct { wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; + aggsum_t zfetchstat_io_active; } zfetch_sums; #define ZFETCHSTAT_BUMP(stat) \ @@ -104,6 +108,8 @@ zfetch_kstats_update(kstat_t *ksp, int rw) wmsum_value(&zfetch_sums.zfetchstat_max_streams); zs->zfetchstat_io_issued.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_io_issued); + zs->zfetchstat_io_active.value.ui64 = + aggsum_value(&zfetch_sums.zfetchstat_io_active); return (0); } @@ -114,6 +120,7 @@ zfetch_init(void) wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); + aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), @@ -138,6 +145,8 @@ zfetch_fini(void) wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); + ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); + aggsum_fini(&zfetch_sums.zfetchstat_io_active); } /* @@ -294,6 +303,7 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) zs->zs_more = B_TRUE; if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); + aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); } /* @@ -407,20 +417,28 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, * Start prefetch from the demand access size (nblks). Double the * distance every access up to zfetch_min_distance. After that only * if needed increase the distance by 1/8 up to zfetch_max_distance. + * + * Don't double the distance beyond single block if we have more + * than ~6% of ARC held by active prefetches. It should help with + * getting out of RAM on some badly mispredicted read patterns. */ - unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift; + unsigned int dbs = zf->zf_dnode->dn_datablkshift; + unsigned int nbytes = nblks << dbs; unsigned int pf_nblks; if (fetch_data) { if (unlikely(zs->zs_pf_dist < nbytes)) zs->zs_pf_dist = nbytes; - else if (zs->zs_pf_dist < zfetch_min_distance) + else if (zs->zs_pf_dist < zfetch_min_distance && + (zs->zs_pf_dist < (1 << dbs) || + aggsum_compare(&zfetch_sums.zfetchstat_io_active, + arc_c_max >> (4 + dbs)) < 0)) zs->zs_pf_dist *= 2; else if (zs->zs_more) zs->zs_pf_dist += zs->zs_pf_dist / 8; zs->zs_more = B_FALSE; if (zs->zs_pf_dist > zfetch_max_distance) zs->zs_pf_dist = zfetch_max_distance; - pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift; + pf_nblks = zs->zs_pf_dist >> dbs; } else { pf_nblks = 0; } @@ -439,7 +457,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, zs->zs_ipf_dist *= 2; if (zs->zs_ipf_dist > zfetch_max_idistance) zs->zs_ipf_dist = zfetch_max_idistance; - pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift; + pf_nblks = zs->zs_ipf_dist >> dbs; if (zs->zs_ipf_start < zs->zs_pf_end) zs->zs_ipf_start = zs->zs_pf_end; if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) @@ -509,6 +527,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) dmu_zfetch_stream_fini(zs); return; } + aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);