diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
index defebe3b2f..01693d72dd 100644
--- a/include/sys/arc_impl.h
+++ b/include/sys/arc_impl.h
@@ -1058,10 +1058,10 @@ extern uint_t arc_lotsfree_percent;
 extern uint64_t zfs_arc_min;
 extern uint64_t zfs_arc_max;
 
-extern void arc_reduce_target_size(int64_t to_free);
+extern uint64_t arc_reduce_target_size(uint64_t to_free);
 extern boolean_t arc_reclaim_needed(void);
 extern void arc_kmem_reap_soon(void);
-extern void arc_wait_for_eviction(uint64_t, boolean_t);
+extern void arc_wait_for_eviction(uint64_t, boolean_t, boolean_t);
 
 extern void arc_lowmem_init(void);
 extern void arc_lowmem_fini(void);
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 3f7485fa78..45b6c338aa 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -831,6 +831,13 @@
 even with a small average compressed block size of ~8 KiB.
 The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux.
 .
+.It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int
+Relative cost of ARC eviction on Linux, i.e. the number of seeks needed to
+restore an evicted page.
+Bigger values make ARC more precious and evictions smaller, compared to
+other kernel subsystems.
+A value of 4 means parity with the page cache.
+.
 .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64
 The target number of bytes the ARC should leave as free memory on the system.
 If zero, equivalent to the bigger of
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index e271d3bf98..f52c7bb5af 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -149,18 +149,17 @@ static eventhandler_tag arc_event_lowmem = NULL;
 static void
 arc_lowmem(void *arg __unused, int howto __unused)
 {
-	int64_t free_memory, to_free;
+	int64_t can_free, free_memory, to_free;
 
 	arc_no_grow = B_TRUE;
 	arc_warm = B_TRUE;
 	arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+
 	free_memory = arc_available_memory();
-	int64_t can_free = arc_c - arc_c_min;
-	if (can_free <= 0)
-		return;
-	to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0);
+	can_free = arc_c - arc_c_min;
+	to_free = (MAX(can_free, 0) >> arc_shrink_shift) - MIN(free_memory, 0);
 	DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
-	arc_reduce_target_size(to_free);
+	to_free = arc_reduce_target_size(to_free);
 
 	/*
 	 * It is unsafe to block here in arbitrary threads, because we can come
@@ -168,7 +167,7 @@ arc_lowmem(void *arg __unused, int howto __unused)
 	 * with ARC reclaim thread.
 	 */
 	if (curproc == pageproc)
-		arc_wait_for_eviction(to_free, B_FALSE);
+		arc_wait_for_eviction(to_free, B_FALSE, B_FALSE);
 }
 
 void
diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index 02dd80c060..75a9ea5322 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -49,6 +49,7 @@
 #include
 #include
 #include
+#include
 #endif
 #include
 #include
@@ -58,6 +59,7 @@
 #include
 #include
 
+#ifdef _KERNEL
 /*
  * This is a limit on how many pages the ARC shrinker makes available for
  * eviction in response to one page allocation attempt.  Note that in
@@ -72,11 +74,20 @@
  * See also the comment in arc_shrinker_count().
  * Set to 0 to disable limit.
  */
-int zfs_arc_shrinker_limit = 10000;
+static int zfs_arc_shrinker_limit = 10000;
+
+/*
+ * Relative cost of ARC eviction, i.e. the number of seeks needed to restore
+ * an evicted page.  Bigger values make ARC more precious and evictions
+ * smaller compared to other kernel subsystems.  A value of 4 means parity
+ * with the page cache, based on the kernel's do_shrink_slab() and other code.
+ */
+static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 static struct notifier_block arc_hotplug_callback_mem_nb;
 #endif
+#endif
 /*
  * Return a default max arc size based on the amount of physical memory.
  */
@@ -170,22 +181,7 @@ static unsigned long
 arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	/*
-	 * __GFP_FS won't be set if we are called from ZFS code (see
-	 * kmem_flags_convert(), which removes it).  To avoid a deadlock, we
-	 * don't allow evicting in this case.  We return 0 rather than
-	 * SHRINK_STOP so that the shrinker logic doesn't accumulate a
-	 * deficit against us.
-	 */
-	if (!(sc->gfp_mask & __GFP_FS)) {
-		return (0);
-	}
-
-	/*
-	 * This code is reached in the "direct reclaim" case, where the
-	 * kernel (outside ZFS) is trying to allocate a page, and the system
-	 * is low on memory.
-	 *
-	 * The kernel's shrinker code doesn't understand how many pages the
+	 * The kernel's shrinker code may not understand how many pages the
 	 * ARC's callback actually frees, so it may ask the ARC to shrink a
 	 * lot for one page allocation.  This is problematic because it may
 	 * take a long time, thus delaying the page allocation, and because
@@ -204,40 +200,44 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
 	 *
 	 * See also the comment above zfs_arc_shrinker_limit.
 	 */
+	int64_t can_free = btop(arc_evictable_memory());
 	int64_t limit = zfs_arc_shrinker_limit != 0 ?
 	    zfs_arc_shrinker_limit : INT64_MAX;
-	return (MIN(limit, btop((int64_t)arc_evictable_memory())));
+	return (MIN(can_free, limit));
 }
 
 static unsigned long
 arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	ASSERT((sc->gfp_mask & __GFP_FS) != 0);
-
 	/* The arc is considered warm once reclaim has occurred */
 	if (unlikely(arc_warm == B_FALSE))
 		arc_warm = B_TRUE;
 
-	/*
-	 * Evict the requested number of pages by reducing arc_c and waiting
-	 * for the requested amount of data to be evicted.
-	 */
-	arc_reduce_target_size(ptob(sc->nr_to_scan));
-	arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
-	if (current->reclaim_state != NULL)
-#ifdef HAVE_RECLAIM_STATE_RECLAIMED
-		current->reclaim_state->reclaimed += sc->nr_to_scan;
-#else
-		current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
-#endif
-
 	/*
 	 * We are experiencing memory pressure which the arc_evict_zthr was
-	 * unable to keep up with.  Set arc_no_grow to briefly pause arc
+	 * unable to keep up with.  Set arc_no_grow to briefly pause ARC
 	 * growth to avoid compounding the memory pressure.
 	 */
 	arc_no_grow = B_TRUE;
 
+	/*
+	 * Evict the requested number of pages by reducing arc_c and waiting
+	 * for the requested amount of data to be evicted.  To avoid deadlock,
+	 * do not wait for eviction if we may have been called from ZFS itself
+	 * (see kmem_flags_convert() removing __GFP_FS).  This may cause
+	 * excessive eviction later if many requests accumulate, but skipping
+	 * the eviction is not good either if most of the memory is used by the ARC.
+	 */
+	uint64_t to_free = arc_reduce_target_size(ptob(sc->nr_to_scan));
+	if (sc->gfp_mask & __GFP_FS)
+		arc_wait_for_eviction(to_free, B_FALSE, B_FALSE);
+	if (current->reclaim_state != NULL)
+#ifdef HAVE_RECLAIM_STATE_RECLAIMED
+		current->reclaim_state->reclaimed += btop(to_free);
+#else
+		current->reclaim_state->reclaimed_slab += btop(to_free);
+#endif
+
 	/*
 	 * When direct reclaim is observed it usually indicates a rapid
 	 * increase in memory pressure.  This occurs because the kswapd
@@ -250,7 +250,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
 		ARCSTAT_BUMP(arcstat_memory_direct_count);
 	}
 
-	return (sc->nr_to_scan);
+	return (btop(to_free));
 }
 
 static struct shrinker *arc_shrinker = NULL;
@@ -304,9 +304,7 @@ arc_set_sys_free(uint64_t allmem)
 	 * arc_wait_for_eviction() will wait until at least the
 	 * high_wmark_pages() are free (see arc_evict_state_impl()).
 	 *
-	 * Note: Even when the system is very low on memory, the kernel's
-	 * shrinker code may only ask for one "batch" of pages (512KB) to be
-	 * evicted.  If concurrent allocations consume these pages, there may
+	 * Note: If concurrent allocations consume these pages, there may
 	 * still be insufficient free pages, and the OOM killer takes action.
 	 *
 	 * By setting arc_sys_free large enough, and having
@@ -318,20 +316,26 @@ arc_set_sys_free(uint64_t allmem)
 	 * It's hard to iterate the zones from a linux kernel module, which
 	 * makes it difficult to determine the watermark dynamically. Instead
 	 * we compute the maximum high watermark for this system, based
-	 * on the amount of memory, assuming default parameters on Linux kernel
-	 * 5.3.
+	 * on the amount of memory, using the same method the kernel uses to
+	 * calculate its internal `min_free_kbytes` variable.  See
+	 * torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value
+	 * from 64M to 256M.
 	 */
 
 	/*
 	 * Base wmark_low is 4 * the square root of Kbytes of RAM.
 	 */
-	long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+	long wmark = int_sqrt(allmem / 1024 * 16) * 1024;
 
 	/*
-	 * Clamp to between 128K and 64MB.
+	 * Clamp to between 128K and 256MB (64MB before Linux 5.7).
 	 */
 	wmark = MAX(wmark, 128 * 1024);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)
+	wmark = MIN(wmark, 256 * 1024 * 1024);
+#else
 	wmark = MIN(wmark, 64 * 1024 * 1024);
+#endif
 
 	/*
 	 * watermark_boost can increase the wmark by up to 150%.
@@ -357,7 +361,7 @@ arc_lowmem_init(void)
 	 * swapping out pages when it is preferable to shrink the arc.
 	 */
 	arc_shrinker = spl_register_shrinker("zfs-arc-shrinker",
-	    arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+	    arc_shrinker_count, arc_shrinker_scan, zfs_arc_shrinker_seeks);
 	VERIFY(arc_shrinker);
 
 	arc_set_sys_free(allmem);
@@ -500,3 +504,5 @@ arc_unregister_hotplug(void)
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
 	"Limit on number of pages that ARC shrinker can reclaim at once");
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_seeks, INT, ZMOD_RD,
+	"Relative cost of ARC eviction vs other kernel subsystems");
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 30d30b98a6..2711178132 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -4398,13 +4398,14 @@ arc_flush(spa_t *spa, boolean_t retry)
 	(void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
 }
 
-void
-arc_reduce_target_size(int64_t to_free)
+uint64_t
+arc_reduce_target_size(uint64_t to_free)
 {
-	uint64_t c = arc_c;
-
-	if (c <= arc_c_min)
-		return;
+	/*
+	 * Get the actual arc size.  Even if we don't need it, this updates
+	 * the aggsum lower bound estimate for arc_is_overflowing().
+	 */
+	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
 
 	/*
 	 * All callers want the ARC to actually evict (at least) this much
@@ -4414,16 +4415,28 @@
 	 * immediately have arc_c < arc_size and therefore the arc_evict_zthr
 	 * will evict.
 	 */
-	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
-	if (asize < c)
-		to_free += c - asize;
-	arc_c = MAX((int64_t)c - to_free, (int64_t)arc_c_min);
+	uint64_t c = arc_c;
+	if (c > arc_c_min) {
+		c = MIN(c, MAX(asize, arc_c_min));
+		to_free = MIN(to_free, c - arc_c_min);
+		arc_c = c - to_free;
+	} else {
+		to_free = 0;
+	}
 
-	/* See comment in arc_evict_cb_check() on why lock+flag */
-	mutex_enter(&arc_evict_lock);
-	arc_evict_needed = B_TRUE;
-	mutex_exit(&arc_evict_lock);
-	zthr_wakeup(arc_evict_zthr);
+	/*
+	 * Whether or not we reduced the target size, request eviction if the
+	 * current size is over it now, since caller obviously wants some RAM.
+	 */
+	if (asize > arc_c) {
+		/* See comment in arc_evict_cb_check() on why lock+flag */
+		mutex_enter(&arc_evict_lock);
+		arc_evict_needed = B_TRUE;
+		mutex_exit(&arc_evict_lock);
+		zthr_wakeup(arc_evict_zthr);
+	}
+
+	return (to_free);
 }
 
 /*
@@ -4630,9 +4643,9 @@
 static void
 arc_reap_cb(void *arg, zthr_t *zthr)
 {
-	(void) arg, (void) zthr;
+	int64_t can_free, free_memory, to_free;
 
-	int64_t free_memory;
+	(void) arg, (void) zthr;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/*
@@ -4660,13 +4673,10 @@ arc_reap_cb(void *arg, zthr_t *zthr)
 	 * amount, reduce by what is needed to hit the fractional amount.
 	 */
 	free_memory = arc_available_memory();
-
-	int64_t can_free = arc_c - arc_c_min;
-	if (can_free > 0) {
-		int64_t to_free = (can_free >> arc_shrink_shift) - free_memory;
-		if (to_free > 0)
-			arc_reduce_target_size(to_free);
-	}
+	can_free = arc_c - arc_c_min;
+	to_free = (MAX(can_free, 0) >> arc_shrink_shift) - free_memory;
+	if (to_free > 0)
+		arc_reduce_target_size(to_free);
 
 	spl_fstrans_unmark(cookie);
 }
@@ -4754,16 +4764,11 @@ arc_adapt(uint64_t bytes)
 }
 
 /*
- * Check if arc_size has grown past our upper threshold, determined by
- * zfs_arc_overflow_shift.
+ * Check if the current ARC size has grown past our upper thresholds.
  */
 static arc_ovf_level_t
-arc_is_overflowing(boolean_t use_reserve)
+arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 {
-	/* Always allow at least one block of overflow */
-	int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
-	    arc_c >> zfs_arc_overflow_shift);
-
 	/*
 	 * We just compare the lower bound here for performance reasons.  Our
 	 * primary goals are to make sure that the arc never grows without
@@ -4773,12 +4778,22 @@ arc_is_overflowing(boolean_t use_reserve)
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
-	    arc_c - overflow / 2;
-	if (!use_reserve)
-		overflow /= 2;
-	return (over < 0 ? ARC_OVF_NONE :
-	    over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
+	    zfs_max_recordsize;
+
+	/* Always allow at least one block of overflow. */
+	if (over < 0)
+		return (ARC_OVF_NONE);
+
+	/* If we are under memory pressure, report severe overflow. */
+	if (!lax)
+		return (ARC_OVF_SEVERE);
+
+	/* We are not under pressure, so be more or less relaxed. */
+	int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
+	if (use_reserve)
+		overflow *= 3;
+	return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }
 
 static abd_t *
@@ -4810,15 +4825,17 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 
 /*
  * Wait for the specified amount of data (in bytes) to be evicted from the
- * ARC, and for there to be sufficient free memory in the system.  Waiting for
- * eviction ensures that the memory used by the ARC decreases.  Waiting for
- * free memory ensures that the system won't run out of free pages, regardless
- * of ARC behavior and settings.  See arc_lowmem_init().
+ * ARC, and for there to be sufficient free memory in the system.
+ * The lax argument specifies that the caller has no specific reason to wait
+ * and is not aware of any memory pressure.  Low-memory handlers, though,
+ * should set it to B_FALSE to wait for all required evictions to complete.
+ * The use_reserve argument allows some callers to wait less than others in
+ * order not to block critical code paths, possibly blocking other resources.
  */
 void
-arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve)
+arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve)
 {
-	switch (arc_is_overflowing(use_reserve)) {
+	switch (arc_is_overflowing(lax, use_reserve)) {
 	case ARC_OVF_NONE:
 		return;
 	case ARC_OVF_SOME:
@@ -4913,7 +4930,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
 	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
 	 */
 	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
-	    alloc_flags & ARC_HDR_USE_RESERVE);
+	    B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE);
 
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	if (type == ARC_BUFC_METADATA) {
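
For review only, the sketch below restates the new arc_is_overflowing() classification as a standalone userspace program; it is not part of the patch. The globals (arc_c, zfs_max_recordsize, zfs_arc_overflow_shift) and their values are illustrative assumptions, and arc_is_overflowing_sketch() is a hypothetical helper; the in-tree function instead reads the aggsum lower bound and the real module tunables. It only shows how the new lax argument changes the result: with 20 MiB over the target, a relaxed caller gets ARC_OVF_SOME, while a low-memory handler (lax = B_FALSE) gets ARC_OVF_SEVERE and therefore ends up waiting in arc_wait_for_eviction().

/*
 * Review aid, not part of the patch: a minimal sketch of the overflow
 * classification added above.  All values below are assumed defaults.
 */
#include <stdint.h>
#include <stdio.h>

typedef enum { ARC_OVF_NONE, ARC_OVF_SOME, ARC_OVF_SEVERE } arc_ovf_level_t;

static uint64_t arc_c = 4ULL << 30;			/* assumed 4 GiB target */
static uint64_t zfs_max_recordsize = 16ULL << 20;	/* assumed 16 MiB */
static int zfs_arc_overflow_shift = 8;			/* assumed default */

static arc_ovf_level_t
arc_is_overflowing_sketch(uint64_t size_lower_bound, int lax, int use_reserve)
{
	int64_t over = (int64_t)size_lower_bound - (int64_t)arc_c -
	    (int64_t)zfs_max_recordsize;

	/* Always allow at least one maximum-sized block of overflow. */
	if (over < 0)
		return (ARC_OVF_NONE);

	/* A caller aware of memory pressure treats any overflow as severe. */
	if (!lax)
		return (ARC_OVF_SEVERE);

	/* Otherwise tolerate a fraction of arc_c, three times more for reserve users. */
	int64_t overflow = (int64_t)(arc_c >> zfs_arc_overflow_shift) / 2;
	if (use_reserve)
		overflow *= 3;
	return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
}

int
main(void)
{
	uint64_t size = arc_c + (20ULL << 20);	/* 20 MiB over the target */

	/* Prints 1 (ARC_OVF_SOME): relaxed caller, small overflow tolerated. */
	printf("lax: %d\n", arc_is_overflowing_sketch(size, 1, 0));
	/* Prints 2 (ARC_OVF_SEVERE): low-memory handler, any overflow is severe. */
	printf("low memory: %d\n", arc_is_overflowing_sketch(size, 0, 0));
	return (0);
}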