Remove skc_reclaim, hdr_recl, kmem_cache shrinker

The SPL kmem_cache implementation provides a mechanism, `skc_reclaim`,
whereby individual caches can register a callback to be invoked when
there is memory pressure.  This mechanism is used in only one place: the
ARC registers the `hdr_recl()` reclaim function.  This function wakes up
the `arc_reap_zthr`, whose job is to call `kmem_cache_reap()` and
`arc_reduce_target_size()`.
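
For reference, the entire callback being removed is just a wakeup; this is
condensed from the ARC changes in the diff below (the `dprintf()` is omitted):

```c
/* Reclaim callback -- invoked when memory is low. */
static void
hdr_recl(void *unused)
{
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini(); hence the guard flag.
	 */
	if (arc_initialized)
		zthr_wakeup(arc_reap_zthr);
}
```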

The `skc_reclaim` callbacks are invoked only by shrinker callbacks and
`arc_reap_zthr`, and the only registered callback, `hdr_recl()`, does
nothing but wake up `arc_reap_zthr`.  When called from `arc_reap_zthr`,
waking `arc_reap_zthr` is a no-op.  When called from shrinker callbacks,
we are already aware of memory pressure and responding to it.  Therefore
there is little benefit to ever calling the `hdr_recl()` `skc_reclaim`
callback.

The `arc_reap_zthr` also wakes up once a second, and whenever memory is
low when allocating an ARC buffer.  Therefore, additionally waking it
from the shrinker callbacks has little benefit.
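
That once-per-second wakeup is the zthr timer created in `arc_init()`,
quoted here from the (unchanged) context lines in the diff below:

```c
/* The reap thread already fires on a one-second timer, independent of
 * any shrinker or skc_reclaim activity. */
arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
    arc_reap_cb, NULL, SEC2NSEC(1));
```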

The shrinker callbacks can be invoked very frequently, e.g. 10,000 times
per second.  Additionally, for each invocation of the shrinker callback,
`skc_reclaim` is invoked many times.  Therefore, this mechanism consumes
significant amounts of CPU time.

The kmem_cache shrinker calls `spl_kmem_cache_reap_now()`, which,
in addition to invoking `skc_reclaim()`, does two things to attempt to
free pages for use by the system:
 1. Return free objects from the magazine layer to the slab layer
 2. Return entirely-free slabs to the page layer (i.e. free pages)
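
As a hedged sketch of those two steps (the helper names `spl_cache_flush()`
and `spl_slab_reclaim()` are illustrative stand-ins, not verified
signatures; the real function also handles locking, reference counting,
and the `KMC_EXPIRE_MEM` aging policy visible in the diff below):

```c
static void
reap_sketch(spl_kmem_cache_t *skc)
{
	int cpu;

	/* 1. Return free objects from each per-CPU magazine to the slabs. */
	for_each_online_cpu(cpu) {
		spl_kmem_magazine_t *skm = skc->skc_mag[cpu];
		spl_cache_flush(skc, skm, skm->skm_avail);	/* illustrative */
	}

	/* 2. Return entirely-free slabs to the page layer (frees pages). */
	spl_slab_reclaim(skc);	/* illustrative */
}
```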

These actions apply only to caches implemented by the SPL, not those
that use the underlying kernel SLAB/SLUB caches.  The SPL caches are
used for objects >=32KB, which are primarily linear ABDs cached in the
DBUF cache.
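
A hedged sketch of how a cache ends up in one implementation or the other;
the `spl_kmem_cache_slab_limit` cutoff is an assumption here, not part of
this diff:

```c
/* In spl_kmem_cache_create(): small objects are backed by the kernel's
 * SLAB/SLUB allocator (KMC_SLAB); larger ones use the SPL's own
 * slab/magazine implementation, which is what reaping acts on. */
if (size <= spl_kmem_cache_slab_limit)
	skc->skc_flags |= KMC_SLAB;	/* kernel-backed; reap does not apply */
```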

These actions (freeing objects from the magazine layer and returning
entirely-free slabs) are also taken whenever a `kmem_cache_free()` call
finds a full magazine.  So there would typically be zero entirely-free
slabs, and the number of objects in magazines is limited (typically no
more than 64 objects per magazine, and there's one magazine per CPU).
Therefore the benefit of `spl_kmem_cache_reap_now()`, while nonzero, is
modest.
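
For scale (illustrative numbers only): a cache of 32KB objects at the
64-object magazine cap on a 16-CPU machine holds at most
16 x 64 x 32KB = 32MB in its magazine layer, which bounds what
`spl_kmem_cache_reap_now()` could ever recover from that cache.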

We also call `spl_kmem_cache_reap_now()` from the `arc_reap_zthr`, when
memory pressure is detected.  Therefore, calling
`spl_kmem_cache_reap_now()` from the kmem_cache shrinker is not needed.

This commit removes the `skc_reclaim` mechanism, its only callback
`hdr_recl()`, and the kmem_cache shrinker callback.

Reviewed-By: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10576

@@ -124,7 +124,6 @@ extern struct rw_semaphore spl_kmem_cache_sem;
 
 typedef int (*spl_kmem_ctor_t)(void *, void *, int);
 typedef void (*spl_kmem_dtor_t)(void *, void *);
-typedef void (*spl_kmem_reclaim_t)(void *);
 
 typedef struct spl_kmem_magazine {
         uint32_t                skm_magic;      /* Sanity magic */
@@ -174,7 +173,6 @@ typedef struct spl_kmem_cache {
         uint32_t                skc_mag_refill; /* Magazine refill count */
         spl_kmem_ctor_t         skc_ctor;       /* Constructor */
         spl_kmem_dtor_t         skc_dtor;       /* Destructor */
-        spl_kmem_reclaim_t      skc_reclaim;    /* Reclaimator */
         void                    *skc_private;   /* Private data */
         void                    *skc_vmp;       /* Unused */
         struct kmem_cache       *skc_linux_cache;       /* Linux slab cache if used */
@@ -210,7 +208,7 @@ typedef struct spl_kmem_cache {
 
 extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size,
     size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor,
-    spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags);
+    void *reclaim, void *priv, void *vmp, int flags);
 extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
     kmem_cbrc_t (*)(void *, void *, size_t, void *));
 extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);

@@ -25,7 +25,6 @@
 #include <linux/percpu_compat.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
-#include <sys/shrinker.h>
 #include <sys/taskq.h>
 #include <sys/timer.h>
 #include <sys/vmem.h>
@@ -883,7 +882,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
  */
 spl_kmem_cache_t *
 spl_kmem_cache_create(char *name, size_t size, size_t align,
-    spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
+    spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
     void *priv, void *vmp, int flags)
 {
         gfp_t lflags = kmem_flags_convert(KM_SLEEP);
@@ -897,6 +896,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         ASSERT0(flags & KMC_NOHASH);
         ASSERT0(flags & KMC_QCACHE);
         ASSERT(vmp == NULL);
+        ASSERT(reclaim == NULL);
 
         might_sleep();
@@ -915,7 +915,6 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
         skc->skc_ctor = ctor;
         skc->skc_dtor = dtor;
-        skc->skc_reclaim = reclaim;
         skc->skc_private = priv;
         skc->skc_vmp = vmp;
         skc->skc_linux_cache = NULL;
@@ -1606,78 +1605,11 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 EXPORT_SYMBOL(spl_kmem_cache_free);
 
-/*
- * The generic shrinker function for all caches.  Under Linux a shrinker
- * may not be tightly coupled with a slab cache.  In fact Linux always
- * systematically tries calling all registered shrinker callbacks which
- * report that they contain unused objects.  Because of this we only
- * register one shrinker function in the shim layer for all slab caches.
- * We always attempt to shrink all caches when this generic shrinker
- * is called.
- *
- * The _count() function returns the number of free-able objects.
- * The _scan() function returns the number of objects that were freed.
- */
-static unsigned long
-spl_kmem_cache_shrinker_count(struct shrinker *shrink,
-    struct shrink_control *sc)
-{
-        spl_kmem_cache_t *skc = NULL;
-        int alloc = 0;
-
-        down_read(&spl_kmem_cache_sem);
-        list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
-                alloc += skc->skc_obj_alloc;
-        }
-        up_read(&spl_kmem_cache_sem);
-
-        return (MAX(alloc, 0));
-}
-
-static unsigned long
-spl_kmem_cache_shrinker_scan(struct shrinker *shrink,
-    struct shrink_control *sc)
-{
-        spl_kmem_cache_t *skc = NULL;
-        int alloc = 0;
-
-        /*
-         * No shrinking in a transaction context.  Can cause deadlocks.
-         */
-        if (spl_fstrans_check())
-                return (SHRINK_STOP);
-
-        down_read(&spl_kmem_cache_sem);
-        list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
-                uint64_t oldalloc = skc->skc_obj_alloc;
-                spl_kmem_cache_reap_now(skc);
-                if (oldalloc > skc->skc_obj_alloc)
-                        alloc += oldalloc - skc->skc_obj_alloc;
-        }
-        up_read(&spl_kmem_cache_sem);
-
-        /*
-         * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
-         * This functionality only exists to work around a rare issue where
-         * shrink_slabs() is repeatedly invoked by many cores causing the
-         * system to thrash.
-         */
-        if (spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE)
-                return (SHRINK_STOP);
-
-        return (MAX(alloc, 0));
-}
-
-SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
-    spl_kmem_cache_shrinker_count, spl_kmem_cache_shrinker_scan,
-    KMC_DEFAULT_SEEKS);
-
 /*
- * Call the registered reclaim function for a cache.  Depending on how
- * many and which objects are released it may simply repopulate the
- * local magazine which will then need to age-out.  Objects which cannot
- * fit in the magazine we will be released back to their slabs which will
- * also need to age out before being release.  This is all just best
- * effort and we do not want to thrash creating and destroying slabs.
+ * Depending on how many and which objects are released it may simply
+ * repopulate the local magazine which will then need to age-out.  Objects
+ * which cannot fit in the magazine will be released back to their slabs
+ * which will also need to age out before being released.  This is all just
+ * best effort and we do not want to thrash creating and destroying slabs.
  */
 void
 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
@@ -1685,16 +1617,10 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
         ASSERT(skc->skc_magic == SKC_MAGIC);
         ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
-        atomic_inc(&skc->skc_ref);
+        if (skc->skc_flags & KMC_SLAB)
+                return;
 
-        /*
-         * Execute the registered reclaim callback if it exists.
-         */
-        if (skc->skc_flags & KMC_SLAB) {
-                if (skc->skc_reclaim)
-                        skc->skc_reclaim(skc->skc_private);
-                goto out;
-        }
+        atomic_inc(&skc->skc_ref);
 
         /*
          * Prevent concurrent cache reaping when contended.
@@ -1702,39 +1628,6 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
         if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
                 goto out;
 
-        /*
-         * When a reclaim function is available it may be invoked repeatedly
-         * until at least a single slab can be freed.  This ensures that we
-         * do free memory back to the system.  This helps minimize the chance
-         * of an OOM event when the bulk of memory is used by the slab.
-         *
-         * When free slabs are already available the reclaim callback will be
-         * skipped.  Additionally, if no forward progress is detected despite
-         * a reclaim function the cache will be skipped to avoid deadlock.
-         *
-         * Longer term this would be the correct place to add the code which
-         * repacks the slabs in order minimize fragmentation.
-         */
-        if (skc->skc_reclaim) {
-                uint64_t objects = UINT64_MAX;
-                int do_reclaim;
-
-                do {
-                        spin_lock(&skc->skc_lock);
-                        do_reclaim =
-                            (skc->skc_slab_total > 0) &&
-                            ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
-                            (skc->skc_obj_alloc < objects);
-
-                        objects = skc->skc_obj_alloc;
-                        spin_unlock(&skc->skc_lock);
-
-                        if (do_reclaim)
-                                skc->skc_reclaim(skc->skc_private);
-
-                } while (do_reclaim);
-        }
-
         /* Reclaim from the magazine and free all now empty slabs. */
         if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
                 spl_kmem_magazine_t *skm;
@@ -1773,12 +1666,13 @@ EXPORT_SYMBOL(spl_kmem_cache_reap_active);
 void
 spl_kmem_reap(void)
 {
-        struct shrink_control sc;
+        spl_kmem_cache_t *skc = NULL;
 
-        sc.nr_to_scan = KMC_REAP_CHUNK;
-        sc.gfp_mask = GFP_KERNEL;
-
-        (void) spl_kmem_cache_shrinker_scan(NULL, &sc);
+        down_read(&spl_kmem_cache_sem);
+        list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+                spl_kmem_cache_reap_now(skc);
+        }
+        up_read(&spl_kmem_cache_sem);
 }
 EXPORT_SYMBOL(spl_kmem_reap);
@@ -1791,7 +1685,6 @@ spl_kmem_cache_init(void)
             spl_kmem_cache_kmem_threads, maxclsyspri,
             spl_kmem_cache_kmem_threads * 8, INT_MAX,
             TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
-        spl_register_shrinker(&spl_kmem_cache_shrinker);
 
         return (0);
 }
@@ -1799,6 +1692,5 @@ spl_kmem_cache_init(void)
 void
 spl_kmem_cache_fini(void)
 {
-        spl_unregister_shrinker(&spl_kmem_cache_shrinker);
         taskq_destroy(spl_kmem_cache_taskq);
 }

@@ -380,11 +380,6 @@ static int arc_min_prescient_prefetch_ms;
  */
 int arc_lotsfree_percent = 10;
 
-/*
- * hdr_recl() uses this to determine if the arc is up and running.
- */
-static boolean_t arc_initialized;
-
 /*
  * The arc has filled available memory and has now warmed up.
  */
@@ -1198,22 +1193,6 @@ buf_dest(void *vbuf, void *unused)
         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
-/*
- * Reclaim callback -- invoked when memory is low.
- */
-/* ARGSUSED */
-static void
-hdr_recl(void *unused)
-{
-        dprintf("hdr_recl called\n");
-        /*
-         * umem calls the reclaim func when we destroy the buf cache,
-         * which is after we do arc_fini().
-         */
-        if (arc_initialized)
-                zthr_wakeup(arc_reap_zthr);
-}
-
 static void
 buf_init(void)
 {
@@ -1249,12 +1228,12 @@ retry:
         }
 
         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
-            0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+            0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
         hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
             HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
-            hdr_recl, NULL, NULL, 0);
+            NULL, NULL, NULL, 0);
         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
-            HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+            HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
             NULL, NULL, 0);
         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
@@ -4688,9 +4667,6 @@ arc_kmem_reap_soon(void)
 static boolean_t
 arc_adjust_cb_check(void *arg, zthr_t *zthr)
 {
-        if (!arc_initialized)
-                return (B_FALSE);
-
         /*
          * This is necessary so that any changes which may have been made to
          * many of the zfs_arc_* module parameters will be propagated to
@@ -4778,9 +4754,6 @@ arc_adjust_cb(void *arg, zthr_t *zthr)
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
-        if (!arc_initialized)
-                return (B_FALSE);
-
         int64_t free_memory = arc_available_memory();
 
         /*
@@ -7348,12 +7321,6 @@ arc_init(void)
         arc_state_init();
 
-        /*
-         * The arc must be "uninitialized", so that hdr_recl() (which is
-         * registered by buf_init()) will not access arc_reap_zthr before
-         * it is created.
-         */
-        ASSERT(!arc_initialized);
         buf_init();
 
         list_create(&arc_prune_list, sizeof (arc_prune_t),
@@ -7377,7 +7344,6 @@ arc_init(void)
         arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
             arc_reap_cb, NULL, SEC2NSEC(1));
-        arc_initialized = B_TRUE;
         arc_warm = B_FALSE;
 
         /*
@@ -7412,8 +7378,6 @@ arc_fini(void)
         /* Use B_TRUE to ensure *all* buffers are evicted */
         arc_flush(NULL, B_TRUE);
 
-        arc_initialized = B_FALSE;
-
         if (arc_ksp != NULL) {
                 kstat_delete(arc_ksp);
                 arc_ksp = NULL;