Fix ARC ghost states eviction accounting

arc_evict_hdr() returns the number of bytes evicted within the scope of
a specific state.  For ghost states this is not the amount of memory
actually freed, but the logical buffer size.  That is correct for the
eviction process, but not for waking up threads waiting for ARC size
reduction, as added in the "Revise ARC shrinker algorithm" commit.  It
causes premature wakeups while the ARC is still overflowed, allowing
even bigger overflow, plus processing overhead when the next allocation
also gets blocked, probably again for too short a time.

To fix that, make arc_evict_hdr() also return the amount of memory
actually freed, which for the ghost states is only the header, and use
it to update arc_evict_count instead.  Originally I considered not
returning it at all, since arc_get_data_impl() does not account for
the headers, but decided that some slow allocation progress is better
than long waits, which reached up to 100ms in my tests.

To reduce the negative latency effects of long periods when the reclaim
thread can free little real memory, start the reclamation process
earlier, before we actually reach the overflow threshold at which we
have to throttle new allocations.  We can also do it without taking the
global arc_evict_lock, reducing the contention.

Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Closes #12279
This commit is contained in:
Alexander Motin 2021-07-13 11:41:59 -04:00 committed by Tony Hutter
parent a5e68f0478
commit 45305a067f
4 changed files with 113 additions and 81 deletions

View File

@ -984,7 +984,6 @@ extern unsigned long zfs_arc_max;
extern void arc_reduce_target_size(int64_t to_free); extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void); extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void); extern void arc_kmem_reap_soon(void);
extern boolean_t arc_is_overflowing(void);
extern void arc_wait_for_eviction(uint64_t); extern void arc_wait_for_eviction(uint64_t);
extern void arc_lowmem_init(void); extern void arc_lowmem_init(void);

View File

@ -712,20 +712,22 @@ equivalent to the greater of the number of online CPUs and
The ARC size is considered to be overflowing if it exceeds the current The ARC size is considered to be overflowing if it exceeds the current
ARC target size ARC target size
.Pq Sy arc_c .Pq Sy arc_c
by a threshold determined by this parameter. by thresholds determined by this parameter.
The threshold is calculated as a fraction of Exceeding by
.Sy arc_c .Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5
using the formula starts ARC reclamation process.
.Sy arc_c >> zfs_arc_overflow_shift . If that appears insufficient, exceeding by
.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5
blocks new buffer allocation until the reclaim thread catches up.
Started reclamation process continues till ARC size returns below the
target size.
.Pp .Pp
The default value of The default value of
.Sy 8 .Sy 8
causes the ARC to be considered overflowing if it exceeds the target size by causes the ARC to start reclamation if it exceeds the target size by
.Em 1/256th Pq Em 0.3% .Em 0.2%
of the target size. of the target size, and block allocations by
.Pp .Em 0.6% .
When the ARC is overflowing, new buffer allocations are stalled until
the reclaim thread catches up and the overflow condition no longer exists.
. .
.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int .It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int
If nonzero, this will update If nonzero, this will update

View File

@ -234,8 +234,6 @@ arc_lowmem(void *arg __unused, int howto __unused)
*/ */
if (curproc == pageproc) if (curproc == pageproc)
arc_wait_for_eviction(to_free); arc_wait_for_eviction(to_free);
else
arc_wait_for_eviction(0);
} }
void void

View File

@ -826,6 +826,12 @@ typedef enum arc_fill_flags {
ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
} arc_fill_flags_t; } arc_fill_flags_t;
typedef enum arc_ovf_level {
ARC_OVF_NONE, /* ARC within target size. */
ARC_OVF_SOME, /* ARC is slightly overflowed. */
ARC_OVF_SEVERE /* ARC is severely overflowed. */
} arc_ovf_level_t;
static kmutex_t l2arc_feed_thr_lock; static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv; static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit; static uint8_t l2arc_thread_exit;
@ -3866,9 +3872,18 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
* - arc_mru_ghost -> deleted * - arc_mru_ghost -> deleted
* - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> arc_l2c_only
* - arc_mfu_ghost -> deleted * - arc_mfu_ghost -> deleted
*
* Return total size of evicted data buffers for eviction progress tracking.
* When evicting from ghost states return logical buffer size to make eviction
* progress at the same (or at least comparable) rate as from non-ghost states.
*
* Return *real_evicted for actual ARC size reduction to wake up threads
* waiting for it. For non-ghost states it includes size of evicted data
* buffers (the headers are not freed there). For ghost states it includes
* only the evicted headers size.
*/ */
static int64_t static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
{ {
arc_state_t *evicted_state, *state; arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0; int64_t bytes_evicted = 0;
@ -3878,6 +3893,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT(MUTEX_HELD(hash_lock)); ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_HAS_L1HDR(hdr));
*real_evicted = 0;
state = hdr->b_l1hdr.b_state; state = hdr->b_l1hdr.b_state;
if (GHOST_STATE(state)) { if (GHOST_STATE(state)) {
ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr));
@ -3914,9 +3930,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/ */
hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr = arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache); hdr_l2only_cache);
*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
} else { } else {
arc_change_state(arc_anon, hdr, hash_lock); arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr); arc_hdr_destroy(hdr);
*real_evicted += HDR_FULL_SIZE;
} }
return (bytes_evicted); return (bytes_evicted);
} }
@ -3940,8 +3958,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ARCSTAT_BUMP(arcstat_mutex_miss); ARCSTAT_BUMP(arcstat_mutex_miss);
break; break;
} }
if (buf->b_data != NULL) if (buf->b_data != NULL) {
bytes_evicted += HDR_GET_LSIZE(hdr); bytes_evicted += HDR_GET_LSIZE(hdr);
*real_evicted += HDR_GET_LSIZE(hdr);
}
mutex_exit(&buf->b_evict_lock); mutex_exit(&buf->b_evict_lock);
arc_buf_destroy_impl(buf); arc_buf_destroy_impl(buf);
} }
@ -3977,6 +3997,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
arc_cksum_free(hdr); arc_cksum_free(hdr);
bytes_evicted += arc_hdr_size(hdr); bytes_evicted += arc_hdr_size(hdr);
*real_evicted += arc_hdr_size(hdr);
/* /*
* If this hdr is being evicted and has a compressed * If this hdr is being evicted and has a compressed
@ -4018,7 +4039,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, int64_t bytes) uint64_t spa, int64_t bytes)
{ {
multilist_sublist_t *mls; multilist_sublist_t *mls;
uint64_t bytes_evicted = 0; uint64_t bytes_evicted = 0, real_evicted = 0;
arc_buf_hdr_t *hdr; arc_buf_hdr_t *hdr;
kmutex_t *hash_lock; kmutex_t *hash_lock;
int evict_count = 0; int evict_count = 0;
@ -4079,10 +4100,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
ASSERT(!MUTEX_HELD(hash_lock)); ASSERT(!MUTEX_HELD(hash_lock));
if (mutex_tryenter(hash_lock)) { if (mutex_tryenter(hash_lock)) {
uint64_t evicted = arc_evict_hdr(hdr, hash_lock); uint64_t revicted;
uint64_t evicted = arc_evict_hdr(hdr, hash_lock,
&revicted);
mutex_exit(hash_lock); mutex_exit(hash_lock);
bytes_evicted += evicted; bytes_evicted += evicted;
real_evicted += revicted;
/* /*
* If evicted is zero, arc_evict_hdr() must have * If evicted is zero, arc_evict_hdr() must have
@ -4112,7 +4136,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* 1/64th of RAM). See the comments in arc_wait_for_eviction(). * 1/64th of RAM). See the comments in arc_wait_for_eviction().
*/ */
mutex_enter(&arc_evict_lock); mutex_enter(&arc_evict_lock);
arc_evict_count += bytes_evicted; arc_evict_count += real_evicted;
if (arc_free_memory() > arc_sys_free / 2) { if (arc_free_memory() > arc_sys_free / 2) {
arc_evict_waiter_t *aw; arc_evict_waiter_t *aw;
@ -5126,7 +5150,7 @@ arc_adapt(int bytes, arc_state_t *state)
* Check if arc_size has grown past our upper threshold, determined by * Check if arc_size has grown past our upper threshold, determined by
* zfs_arc_overflow_shift. * zfs_arc_overflow_shift.
*/ */
boolean_t static arc_ovf_level_t
arc_is_overflowing(void) arc_is_overflowing(void)
{ {
/* Always allow at least one block of overflow */ /* Always allow at least one block of overflow */
@ -5142,8 +5166,10 @@ arc_is_overflowing(void)
* in the ARC. In practice, that's in the tens of MB, which is low * in the ARC. In practice, that's in the tens of MB, which is low
* enough to be safe. * enough to be safe.
*/ */
return (aggsum_lower_bound(&arc_sums.arcstat_size) >= int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
(int64_t)arc_c + overflow); arc_c - overflow / 2;
return (over < 0 ? ARC_OVF_NONE :
over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
} }
static abd_t * static abd_t *
@ -5185,21 +5211,41 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
void void
arc_wait_for_eviction(uint64_t amount) arc_wait_for_eviction(uint64_t amount)
{ {
mutex_enter(&arc_evict_lock); switch (arc_is_overflowing()) {
if (arc_is_overflowing()) { case ARC_OVF_NONE:
return;
case ARC_OVF_SOME:
/*
* This is a bit racy without taking arc_evict_lock, but the
* worst that can happen is we either call zthr_wakeup() extra
* time due to race with other thread here, or the set flag
* get cleared by arc_evict_cb(), which is unlikely due to
* big hysteresis, but also not important since at this level
* of overflow the eviction is purely advisory. Same time
* taking the global lock here every time without waiting for
* the actual eviction creates a significant lock contention.
*/
if (!arc_evict_needed) {
arc_evict_needed = B_TRUE; arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr); zthr_wakeup(arc_evict_zthr);
}
if (amount != 0) { return;
case ARC_OVF_SEVERE:
default:
{
arc_evict_waiter_t aw; arc_evict_waiter_t aw;
list_link_init(&aw.aew_node); list_link_init(&aw.aew_node);
cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
uint64_t last_count = 0; uint64_t last_count = 0;
mutex_enter(&arc_evict_lock);
if (!list_is_empty(&arc_evict_waiters)) { if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last = arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters); list_tail(&arc_evict_waiters);
last_count = last->aew_count; last_count = last->aew_count;
} else if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);
} }
/* /*
* Note, the last waiter's count may be less than * Note, the last waiter's count may be less than
@ -5207,8 +5253,7 @@ arc_wait_for_eviction(uint64_t amount)
* case arc_evict_state_impl() may have deferred * case arc_evict_state_impl() may have deferred
* wakeups (but still incremented arc_evict_count). * wakeups (but still incremented arc_evict_count).
*/ */
aw.aew_count = aw.aew_count = MAX(last_count, arc_evict_count) + amount;
MAX(last_count, arc_evict_count) + amount;
list_insert_tail(&arc_evict_waiters, &aw); list_insert_tail(&arc_evict_waiters, &aw);
@ -5220,23 +5265,19 @@ arc_wait_for_eviction(uint64_t amount)
uint64_t, aw.aew_count); uint64_t, aw.aew_count);
/* /*
* We will be woken up either when arc_evict_count * We will be woken up either when arc_evict_count reaches
* reaches aew_count, or when the ARC is no longer * aew_count, or when the ARC is no longer overflowing and
* overflowing and eviction completes. * eviction completes.
* In case of "false" wakeup, we will still be on the list.
*/ */
do {
cv_wait(&aw.aew_cv, &arc_evict_lock); cv_wait(&aw.aew_cv, &arc_evict_lock);
} while (list_link_active(&aw.aew_node));
/* mutex_exit(&arc_evict_lock);
* In case of "false" wakeup, we will still be on the
* list.
*/
if (list_link_active(&aw.aew_node))
list_remove(&arc_evict_waiters, &aw);
cv_destroy(&aw.aew_cv); cv_destroy(&aw.aew_cv);
} }
} }
mutex_exit(&arc_evict_lock);
} }
/* /*
@ -5267,16 +5308,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
* requested size to be evicted. This should be more than 100%, to * requested size to be evicted. This should be more than 100%, to
* ensure that that progress is also made towards getting arc_size * ensure that that progress is also made towards getting arc_size
* under arc_c. See the comment above zfs_arc_eviction_pct. * under arc_c. See the comment above zfs_arc_eviction_pct.
*
* We do the overflowing check without holding the arc_evict_lock to
* reduce lock contention in this hot path. Note that
* arc_wait_for_eviction() will acquire the lock and check again to
* ensure we are truly overflowing before blocking.
*/ */
if (arc_is_overflowing()) { arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);
arc_wait_for_eviction(size *
zfs_arc_eviction_pct / 100);
}
VERIFY3U(hdr->b_type, ==, type); VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {