Add KMC_SLAB cache type

For small objects the Linux slab allocator has several advantages
over its counterpart in the SPL.  These include:

1) It is more memory-efficient and packs objects more tightly.
2) It is continually tuned to maximize performance.

Therefore it makes sense to layer the SPL's slab allocator on top
of the Linux slab allocator.  This allows us to leverage the
advantages above while preserving the Illumos semantics we depend
on.  However, there are some things we need to be careful of:

1) The Linux slab allocator was never designed to work well with
   large objects.  Because the SPL slab must still handle this use
   case a cutoff limit was added to transition from Linux slab
   backed objects to kmem or vmem backed slabs (see the sketch
   after this list).

   spl_kmem_cache_slab_limit - Objects less than or equal to this
   size in bytes will be backed by the Linux slab.  By default
   this value is zero which disables the Linux slab functionality.
   Reasonable values for this cutoff limit are in the range of
   4096-16384 bytes.

   spl_kmem_cache_kmem_limit - Objects less than or equal to this
   size in bytes will be backed by a kmem slab.  Objects over this
   size will be vmem backed instead.  This value defaults to
   1/4 of a page, or 1024 bytes on an x86_64 architecture.

2) Be aware that using the Linux slab may inadvertently introduce
   new deadlocks.  Care has been taken previously to ensure that
   all allocations which occur in the write path use GFP_NOIO.
   However, there may be internal allocations performed in the
   Linux slab which do not honor these flags.  If this is the case
   a deadlock may occur.
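
In code, the cutoff described in 1) reduces to a couple of size
checks.  The sketch below is only a condensed restatement of the
logic this patch adds to spl_kmem_cache_create() ('size' is the
caller-supplied object size, spl_obj_size() the aligned size with
any debug padding); it introduces no behavior beyond the patch
itself:

  if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
          if (spl_kmem_cache_slab_limit &&
              size <= (size_t)spl_kmem_cache_slab_limit)
                  skc->skc_flags |= KMC_SLAB;   /* Linux slab backed */
          else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
                  skc->skc_flags |= KMC_KMEM;   /* kmalloc backed */
          else
                  skc->skc_flags |= KMC_VMEM;   /* vmalloc backed */
  }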

The path forward is definitely to start relying on the Linux slab.
But for that to happen we need to build confidence that there
aren't any unexpected surprises lurking for us, and ideally we
need to move completely away from using the SPL's slab for large
memory allocations.  This patch is a first step.

NOTES:
1) The KMC_NOMAGAZINE flag was leveraged to support the Linux slab
   backed caches, but it is not supported for kmem/vmem backed caches.

2) Regardless of the spl_kmem_cache_*_limit settings a cache may
   be explicitly set to a given type by passing the KMC_KMEM,
   KMC_VMEM, or KMC_SLAB flag during cache creation (see the
   example after this list).

3) The constructors, destructors, and reclaim callbacks are all
   functional and will be called regardless of the cache type.

4) KMC_SLAB caches will not appear in /proc/spl/kmem/slab due to
   the issues involved in presenting correct object accounting.
   Instead they will appear in /proc/slabinfo under the same names.

5) Several kmem SPLAT tests needed to be fixed because they relied
   incorrectly on internal kmem slab accounting.  With the updated
   test cases all the SPLAT tests pass as expected.

6) An autoconf test was added to ensure that the __GFP_COMP flag
   was correctly added to the default flags used when allocating
   a slab.  This is required to ensure all pages in higher order
   slabs are properly refcounted, see ae16ed9.

7) When using the SLUB allocator there is no need to attempt to
   set the __GFP_COMP flag.  This has been the default behavior
   for the SLUB since Linux 2.6.25.

8) When using the SLUB it may be desirable to set the slub_nomerge
   kernel parameter to prevent caches from being merged.
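
As an illustration of note 2), a caller can bypass the size based
selection entirely by passing an explicit type flag.  The snippet
below is hypothetical (the cache name, object type, and callbacks
do not exist in this patch) and uses the nine-argument
kmem_cache_create() interface provided by the SPL, not the native
Linux API:

  kmem_cache_t *my_cache;

  my_cache = kmem_cache_create("my_obj_cache",  /* cache name */
      sizeof (my_obj_t),                        /* object size */
      0,                                        /* default alignment */
      my_obj_ctor, my_obj_dtor, NULL,           /* ctor, dtor, no reclaim */
      NULL, NULL,                               /* private, vmp (unused) */
      KMC_SLAB);                                /* force Linux slab backing */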

Original-patch-by: DHE <git@dehacked.net>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Signed-off-by: DHE <git@dehacked.net>
Signed-off-by: Chunwei Chen <tuxoko@gmail.com>
Closes #356
commit a073aeb060 (parent ad3412efd7)
Brian Behlendorf, 2013-12-08 17:01:45 -05:00
5 changed files with 241 additions and 37 deletions

config/spl-build.m4

@@ -93,6 +93,7 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [
SPL_AC_SCHED_RT_HEADER
SPL_AC_2ARGS_VFS_GETATTR
SPL_AC_USLEEP_RANGE
SPL_AC_KMEM_CACHE_ALLOCFLAGS
])
AC_DEFUN([SPL_AC_MODULE_SYMVERS], [
@@ -2532,3 +2533,40 @@ AC_DEFUN([SPL_AC_USLEEP_RANGE], [
AC_MSG_RESULT(no)
])
])
dnl #
dnl # 2.6.35 API change,
dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are
dnl # private allocation flags which are applied when allocating a new slab
dnl # in kmem_getpages(). Unfortunately there is no public API for setting
dnl # non-default flags.
dnl #
AC_DEFUN([SPL_AC_KMEM_CACHE_ALLOCFLAGS], [
AC_MSG_CHECKING([whether struct kmem_cache has allocflags])
SPL_LINUX_TRY_COMPILE([
#include <linux/slab.h>
],[
struct kmem_cache cachep __attribute__ ((unused));
cachep.allocflags = GFP_KERNEL;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1,
[struct kmem_cache has allocflags])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether struct kmem_cache has gfpflags])
SPL_LINUX_TRY_COMPILE([
#include <linux/slab.h>
],[
struct kmem_cache cachep __attribute__ ((unused));
cachep.gfpflags = GFP_KERNEL;
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1,
[struct kmem_cache has gfpflags])
],[
AC_MSG_RESULT(no)
])
])
])

include/sys/kmem.h

@@ -340,8 +340,9 @@ enum {
KMC_BIT_QCACHE = 4, /* XXX: Unsupported */
KMC_BIT_KMEM = 5, /* Use kmem cache */
KMC_BIT_VMEM = 6, /* Use vmem cache */
KMC_BIT_OFFSLAB = 7, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 8, /* Disable emergency objects */
KMC_BIT_SLAB = 7, /* Use Linux slab cache */
KMC_BIT_OFFSLAB = 8, /* Objects not on slab */
KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */
KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
KMC_BIT_GROWING = 15, /* Growing in progress */
KMC_BIT_REAPING = 16, /* Reaping in progress */
@@ -367,6 +368,7 @@ typedef enum kmem_cbrc {
#define KMC_QCACHE (1 << KMC_BIT_QCACHE)
#define KMC_KMEM (1 << KMC_BIT_KMEM)
#define KMC_VMEM (1 << KMC_BIT_VMEM)
#define KMC_SLAB (1 << KMC_BIT_SLAB)
#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
@@ -456,6 +458,7 @@ typedef struct spl_kmem_cache {
spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
void *skc_private; /* Private data */
void *skc_vmp; /* Unused */
struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */
unsigned long skc_flags; /* Flags */
uint32_t skc_obj_size; /* Object size */
uint32_t skc_obj_align; /* Object alignment */
@@ -513,4 +516,24 @@ void spl_kmem_fini(void);
#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \
((ptr) < (void *)VMALLOC_END))
/*
* Allow custom slab allocation flags to be set for KMC_SLAB based caches.
* One use for this function is to ensure the __GFP_COMP flag is part of
* the default allocation mask which ensures higher order allocations are
* properly refcounted. This flag was added to the default ->allocflags
* as of Linux 3.11.
*/
static inline void
kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
{
if (skc->skc_linux_cache == NULL)
return;
#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
skc->skc_linux_cache->allocflags |= flags;
#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
skc->skc_linux_cache->gfpflags |= flags;
#endif
}
#endif /* _SPL_KMEM_H */

module/spl/spl-kmem.c

@@ -33,6 +33,16 @@
#define SS_DEBUG_SUBSYS SS_KMEM
/*
* Within the scope of the spl-kmem.c file the kmem_cache_* definitions
* are removed to allow access to the real Linux slab allocator.
*/
#undef kmem_cache_destroy
#undef kmem_cache_create
#undef kmem_cache_alloc
#undef kmem_cache_free
/*
* Cache expiration was implemented because it was part of the default Solaris
* kmem_cache behavior. The idea is that per-cpu objects which haven't been
@@ -60,6 +70,16 @@ unsigned int spl_kmem_cache_max_size = 32;
module_param(spl_kmem_cache_max_size, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
unsigned int spl_kmem_cache_slab_limit = 0;
module_param(spl_kmem_cache_slab_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
"Objects less than N bytes use the Linux slab");
unsigned int spl_kmem_cache_kmem_limit = (PAGE_SIZE / 4);
module_param(spl_kmem_cache_kmem_limit, uint, 0644);
MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
"Objects less than N bytes use the kmalloc");
/*
* The minimum amount of memory measured in pages to be free at all
* times on the system. This is similar to Linux's zone->pages_min
@@ -1348,7 +1368,10 @@ spl_cache_age(void *data)
return;
atomic_inc(&skc->skc_ref);
spl_on_each_cpu(spl_magazine_age, skc, 1);
if (!(skc->skc_flags & KMC_NOMAGAZINE))
spl_on_each_cpu(spl_magazine_age, skc, 1);
spl_slab_reclaim(skc, skc->skc_reap, 0);
while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
@@ -1493,6 +1516,9 @@ spl_magazine_create(spl_kmem_cache_t *skc)
int i;
SENTRY;
if (skc->skc_flags & KMC_NOMAGAZINE)
SRETURN(0);
skc->skc_mag_size = spl_magazine_size(skc);
skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
@@ -1519,6 +1545,11 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
int i;
SENTRY;
if (skc->skc_flags & KMC_NOMAGAZINE) {
SEXIT;
return;
}
for_each_online_cpu(i) {
skm = skc->skc_mag[i];
spl_cache_flush(skc, skm, skm->skm_avail);
@@ -1541,11 +1572,12 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
* flags
* KMC_NOTOUCH Disable cache object aging (unsupported)
* KMC_NODEBUG Disable debugging (unsupported)
* KMC_NOMAGAZINE Disable magazine (unsupported)
* KMC_NOHASH Disable hashing (unsupported)
* KMC_QCACHE Disable qcache (unsupported)
* KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
* KMC_KMEM Force kmem backed cache
* KMC_VMEM Force vmem backed cache
* KMC_SLAB Force Linux slab backed cache
* KMC_OFFSLAB Locate objects off the slab
*/
spl_kmem_cache_t *
@@ -1591,6 +1623,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_reclaim = reclaim;
skc->skc_private = priv;
skc->skc_vmp = vmp;
skc->skc_linux_cache = NULL;
skc->skc_flags = flags;
skc->skc_obj_size = size;
skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
@@ -1617,28 +1650,69 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_emergency = 0;
skc->skc_obj_emergency_max = 0;
/*
* Verify the requested alignment restriction is sane.
*/
if (align) {
VERIFY(ISP2(align));
VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); /* Min alignment */
VERIFY3U(align, <=, PAGE_SIZE); /* Max alignment */
VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
VERIFY3U(align, <=, PAGE_SIZE);
skc->skc_obj_align = align;
}
/* If none passed select a cache type based on object size */
if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM))) {
if (spl_obj_size(skc) < (PAGE_SIZE / 8))
/*
* When no specific type of slab is requested (kmem, vmem, or
* linuxslab) then select a cache type based on the object size
* and default tunables.
*/
if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
/*
* Objects smaller than spl_kmem_cache_slab_limit can
* use the Linux slab for better space-efficiency. By
* default this functionality is disabled until its
* performance characteristics are fully understood.
*/
if (spl_kmem_cache_slab_limit &&
size <= (size_t)spl_kmem_cache_slab_limit)
skc->skc_flags |= KMC_SLAB;
/*
* Small objects, less than spl_kmem_cache_kmem_limit per
* object should use kmem because their slabs are small.
*/
else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
skc->skc_flags |= KMC_KMEM;
/*
* All other objects are considered large and are placed
* on vmem backed slabs.
*/
else
skc->skc_flags |= KMC_VMEM;
}
rc = spl_slab_size(skc, &skc->skc_slab_objs, &skc->skc_slab_size);
if (rc)
SGOTO(out, rc);
/*
* Given the type of slab allocate the required resources.
*/
if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
rc = spl_slab_size(skc,
&skc->skc_slab_objs, &skc->skc_slab_size);
if (rc)
SGOTO(out, rc);
rc = spl_magazine_create(skc);
if (rc)
SGOTO(out, rc);
rc = spl_magazine_create(skc);
if (rc)
SGOTO(out, rc);
} else {
skc->skc_linux_cache = kmem_cache_create(
skc->skc_name, size, align, 0, NULL);
if (skc->skc_linux_cache == NULL)
SGOTO(out, rc = ENOMEM);
kmem_cache_set_allocflags(skc, __GFP_COMP);
skc->skc_flags |= KMC_NOMAGAZINE;
}
if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
@@ -1680,6 +1754,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
down_write(&spl_kmem_cache_sem);
list_del_init(&skc->skc_list);
@@ -1699,8 +1774,14 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
* cache reaping action which races with this destroy. */
wait_event(wq, atomic_read(&skc->skc_ref) == 0);
spl_magazine_destroy(skc);
spl_slab_reclaim(skc, 0, 1);
if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
spl_magazine_destroy(skc);
spl_slab_reclaim(skc, 0, 1);
} else {
ASSERT(skc->skc_flags & KMC_SLAB);
kmem_cache_destroy(skc->skc_linux_cache);
}
spin_lock(&skc->skc_lock);
/* Validate there are no objects in use and free all the
@@ -1806,7 +1887,9 @@ spl_cache_reclaim_wait(void *word)
}
/*
* No available objects on any slabs, create a new slab.
* No available objects on any slabs, create a new slab. Note that this
* functionality is disabled for KMC_SLAB caches which are backed by the
* Linux slab.
*/
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
@@ -1815,6 +1898,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
SENTRY;
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT((skc->skc_flags & KMC_SLAB) == 0);
might_sleep();
*obj = NULL;
@@ -2016,7 +2100,28 @@ spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
ASSERT(flags & KM_SLEEP);
atomic_inc(&skc->skc_ref);
/*
* Allocate directly from a Linux slab. All optimizations are left
* to the underlying cache; we only need to guarantee that KM_SLEEP
* callers will never fail.
*/
if (skc->skc_flags & KMC_SLAB) {
struct kmem_cache *slc = skc->skc_linux_cache;
do {
obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
if (obj && skc->skc_ctor)
skc->skc_ctor(obj, skc->skc_private, flags);
} while ((obj == NULL) && !(flags & KM_NOSLEEP));
atomic_dec(&skc->skc_ref);
SRETURN(obj);
}
local_irq_disable();
restart:
@@ -2068,6 +2173,17 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
atomic_inc(&skc->skc_ref);
/*
* Free the object from the underlying Linux slab.
*/
if (skc->skc_flags & KMC_SLAB) {
if (skc->skc_dtor)
skc->skc_dtor(obj, skc->skc_private);
kmem_cache_free(skc->skc_linux_cache, obj);
goto out;
}
/*
* Only virtual slabs may have emergency objects and these objects
* are guaranteed to have physical addresses. They must be removed
@@ -2166,13 +2282,27 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
/* Prevent concurrent cache reaping when contended */
if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
SEXIT;
return;
atomic_inc(&skc->skc_ref);
/*
* Execute the registered reclaim callback if it exists. The
* per-cpu caches will be drained when KMC_EXPIRE_MEM is set.
*/
if (skc->skc_flags & KMC_SLAB) {
if (skc->skc_reclaim)
skc->skc_reclaim(skc->skc_private);
if (spl_kmem_cache_expire & KMC_EXPIRE_MEM)
kmem_cache_shrink(skc->skc_linux_cache);
SGOTO(out, 0);
}
atomic_inc(&skc->skc_ref);
/*
* Prevent concurrent cache reaping when contended.
*/
if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
SGOTO(out, 0);
/*
* When a reclaim function is available it may be invoked repeatedly
@@ -2222,7 +2352,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
smp_mb__after_clear_bit();
wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
out:
atomic_dec(&skc->skc_ref);
SEXIT;

module/spl/spl-proc.c

@@ -646,6 +646,12 @@ slab_seq_show(struct seq_file *f, void *p)
ASSERT(skc->skc_magic == SKC_MAGIC);
/*
* Backed by the Linux slab; see /proc/slabinfo.
*/
if (skc->skc_flags & KMC_SLAB)
return (0);
spin_lock(&skc->skc_lock);
seq_printf(f, "%-36s ", skc->skc_name);
seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "

module/splat/splat-kmem.c

@@ -394,18 +394,25 @@ splat_kmem_cache_test_debug(struct file *file, char *name,
{
int j;
splat_vprint(file, name,
"%s cache objects %d, slabs %u/%u objs %u/%u mags ",
kcp->kcp_cache->skc_name, kcp->kcp_count,
splat_vprint(file, name, "%s cache objects %d",
kcp->kcp_cache->skc_name, kcp->kcp_count);
if (kcp->kcp_cache->skc_flags & (KMC_KMEM | KMC_VMEM)) {
splat_vprint(file, name, ", slabs %u/%u objs %u/%u",
(unsigned)kcp->kcp_cache->skc_slab_alloc,
(unsigned)kcp->kcp_cache->skc_slab_total,
(unsigned)kcp->kcp_cache->skc_obj_alloc,
(unsigned)kcp->kcp_cache->skc_obj_total);
for_each_online_cpu(j)
splat_print(file, "%u/%u ",
kcp->kcp_cache->skc_mag[j]->skm_avail,
kcp->kcp_cache->skc_mag[j]->skm_size);
if (!(kcp->kcp_cache->skc_flags & KMC_NOMAGAZINE)) {
splat_vprint(file, name, "%s", "mags");
for_each_online_cpu(j)
splat_print(file, "%u/%u ",
kcp->kcp_cache->skc_mag[j]->skm_avail,
kcp->kcp_cache->skc_mag[j]->skm_size);
}
}
splat_print(file, "%s\n", "");
}
@@ -900,14 +907,14 @@ splat_kmem_test8(struct file *file, void *arg)
kmem_cache_reap_now(kcp->kcp_cache);
splat_kmem_cache_test_debug(file, SPLAT_KMEM_TEST8_NAME, kcp);
if (kcp->kcp_cache->skc_obj_total == 0)
if (kcp->kcp_count == 0)
break;
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ / 10);
}
if (kcp->kcp_cache->skc_obj_total == 0) {
if (kcp->kcp_count == 0) {
splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
"Successfully created %d objects "
"in cache %s and reclaimed them\n",
@@ -915,7 +922,7 @@ splat_kmem_test8(struct file *file, void *arg)
} else {
splat_vprint(file, SPLAT_KMEM_TEST8_NAME,
"Failed to reclaim %u/%d objects from cache %s\n",
(unsigned)kcp->kcp_cache->skc_obj_total,
(unsigned)kcp->kcp_count,
SPLAT_KMEM_OBJ_COUNT, SPLAT_KMEM_CACHE_NAME);
rc = -ENOMEM;
}
@@ -995,14 +1002,14 @@ splat_kmem_test9(struct file *file, void *arg)
for (i = 0; i < 60; i++) {
splat_kmem_cache_test_debug(file, SPLAT_KMEM_TEST9_NAME, kcp);
if (kcp->kcp_cache->skc_obj_total == 0)
if (kcp->kcp_count == 0)
break;
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ);
}
if (kcp->kcp_cache->skc_obj_total == 0) {
if (kcp->kcp_count == 0) {
splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
"Successfully created %d objects "
"in cache %s and reclaimed them\n",
@@ -1010,7 +1017,7 @@ splat_kmem_test9(struct file *file, void *arg)
} else {
splat_vprint(file, SPLAT_KMEM_TEST9_NAME,
"Failed to reclaim %u/%d objects from cache %s\n",
(unsigned)kcp->kcp_cache->skc_obj_total, count,
(unsigned)kcp->kcp_count, count,
SPLAT_KMEM_CACHE_NAME);
rc = -ENOMEM;
}