kmem slab fixes

- Default SPL_KMEM_CACHE_DELAY changed to 15 to match Solaris.
- Aged out slab checking occurs every SPL_KMEM_CACHE_DELAY / 3.
- skc->skc_reap tunable added whichs allows callers of
  spl_slab_reclaim() to cap the number of slabs reclaimed.
  On Solaris all eligible slabs are always reclaimed, and this
  is still the default behavior.  However, I suspect that is
  not always wise for reasons such as in the next comment.
- spl_slab_reclaim() added cond_resched() while walking the
  slab/object free lists.  Soft lockups were observed when
  freeing large numbers of vmalloc'd slabs/objets.
- spl_slab_reclaim() 'sks->sks_ref > 0' check changes from
  incorrect 'break' to 'continue' to ensure all slabs are
  checked.
- spl_cache_age() reworked to avoid a deadlock with
  do_flush_tlb_all() which occured because we slept waiting
  for completion in spl_cache_age().  To waiting for magazine
  reclamation to finish is not required so we no longer wait.
- spl_magazine_create() and spl_magazine_destroy() shifted
  back to using for_each_online_cpu() instead of the
  spl_on_each_cpu() approach which was of course a bad idea
  due to memory allocations which Ricardo pointed out.
This commit is contained in:
Brian Behlendorf 2009-02-12 13:32:10 -08:00
parent f500ccff35
commit 37db7d8cf9
2 changed files with 54 additions and 37 deletions

View File

@ -239,7 +239,8 @@ extern struct rw_semaphore spl_kmem_cache_sem;
#define SKS_MAGIC 0x22222222
#define SKC_MAGIC 0x2c2c2c2c
#define SPL_KMEM_CACHE_DELAY 5 /* Minimum slab release age */
#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */
#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB 32 /* Target objects per slab */
#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 8 /* Minimum objects per slab */
#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
@ -292,6 +293,7 @@ typedef struct spl_kmem_cache {
uint32_t skc_slab_objs; /* Objects per slab */
uint32_t skc_slab_size; /* Slab size */
uint32_t skc_delay; /* Slab reclaim interval */
uint32_t skc_reap; /* Slab reclaim count */
atomic_t skc_ref; /* Ref count callers */
struct delayed_work skc_work; /* Slab reclaim work */
struct work_struct work;

View File

@ -856,16 +856,19 @@ spl_slab_free(spl_kmem_slab_t *sks,
/*
* Traverses all the partial slabs attached to a cache and free those
* which which are currently empty, and have not been touched for
* skc_delay seconds. This is to avoid thrashing.
* skc_delay seconds to avoid thrashing. The count argument is
* passed to optionally cap the number of slabs reclaimed, a count
* of zero means try and reclaim everything. When flag is set we
* always free an available slab regardless of age.
*/
static void
spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
{
spl_kmem_slab_t *sks, *m;
spl_kmem_obj_t *sko, *n;
LIST_HEAD(sks_list);
LIST_HEAD(sko_list);
int size;
int size, i = 0;
ENTRY;
/*
@ -878,11 +881,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
spin_lock(&skc->skc_lock);
list_for_each_entry_safe_reverse(sks, m, &skc->skc_partial_list,
sks_list) {
if (sks->sks_ref > 0)
break;
/* Release at most count slabs */
if (count && i > count)
break;
if (flag || time_after(jiffies,sks->sks_age+skc->skc_delay*HZ))
/* Skip active slabs */
if (sks->sks_ref > 0)
continue;
if (time_after(jiffies,sks->sks_age+skc->skc_delay*HZ)||flag) {
spl_slab_free(sks, &sks_list, &sko_list);
i++;
}
}
spin_unlock(&skc->skc_lock);
@ -896,12 +906,18 @@ spl_slab_reclaim(spl_kmem_cache_t *skc, int flag)
size = P2ROUNDUP(skc->skc_obj_size, skc->skc_obj_align) +
P2ROUNDUP(sizeof(spl_kmem_obj_t), skc->skc_obj_align);
list_for_each_entry_safe(sko, n, &sko_list, sko_list)
/* To avoid soft lockups conditionally reschedule */
list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
kv_free(skc, sko->sko_addr, size);
cond_resched();
}
}
list_for_each_entry_safe(sks, m, &sks_list, sks_list)
/* To avoid soft lockups conditionally reschedule */
list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
kv_free(skc, sks, skc->skc_slab_size);
cond_resched();
}
EXIT;
}
@ -937,11 +953,11 @@ spl_cache_age(void *data)
spl_get_work_data(data, spl_kmem_cache_t, skc_work.work);
ASSERT(skc->skc_magic == SKC_MAGIC);
spl_on_each_cpu(spl_magazine_age, skc, 1);
spl_slab_reclaim(skc, 0);
spl_slab_reclaim(skc, skc->skc_reap, 0);
spl_on_each_cpu(spl_magazine_age, skc, 0);
if (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags))
schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
}
/*
@ -1057,49 +1073,47 @@ spl_magazine_free(spl_kmem_magazine_t *skm)
EXIT;
}
static void
__spl_magazine_create(void *data)
{
spl_kmem_cache_t *skc = data;
int id = smp_processor_id();
skc->skc_mag[id] = spl_magazine_alloc(skc, cpu_to_node(id));
ASSERT(skc->skc_mag[id]);
}
/*
* Create all pre-cpu magazines of reasonable sizes.
*/
static int
spl_magazine_create(spl_kmem_cache_t *skc)
{
int i;
ENTRY;
skc->skc_mag_size = spl_magazine_size(skc);
skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
spl_on_each_cpu(__spl_magazine_create, skc, 1);
for_each_online_cpu(i) {
skc->skc_mag[i] = spl_magazine_alloc(skc, cpu_to_node(i));
if (!skc->skc_mag[i]) {
for (i--; i >= 0; i--)
spl_magazine_free(skc->skc_mag[i]);
RETURN(-ENOMEM);
}
}
RETURN(0);
}
static void
__spl_magazine_destroy(void *data)
{
spl_kmem_cache_t *skc = data;
spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
(void)spl_cache_flush(skc, skm, skm->skm_avail);
spl_magazine_free(skm);
}
/*
* Destroy all pre-cpu magazines.
*/
static void
spl_magazine_destroy(spl_kmem_cache_t *skc)
{
spl_kmem_magazine_t *skm;
int i;
ENTRY;
spl_on_each_cpu(__spl_magazine_destroy, skc, 1);
for_each_online_cpu(i) {
skm = skc->skc_mag[i];
(void)spl_cache_flush(skc, skm, skm->skm_avail);
spl_magazine_free(skm);
}
EXIT;
}
@ -1168,6 +1182,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
skc->skc_obj_size = size;
skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
skc->skc_delay = SPL_KMEM_CACHE_DELAY;
skc->skc_reap = SPL_KMEM_CACHE_REAP;
atomic_set(&skc->skc_ref, 0);
INIT_LIST_HEAD(&skc->skc_list);
@ -1209,7 +1224,7 @@ spl_kmem_cache_create(char *name, size_t size, size_t align,
GOTO(out, rc);
spl_init_delayed_work(&skc->skc_work, spl_cache_age, skc);
schedule_delayed_work(&skc->skc_work, 2 * skc->skc_delay * HZ);
schedule_delayed_work(&skc->skc_work, skc->skc_delay / 3 * HZ);
down_write(&spl_kmem_cache_sem);
list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
@ -1249,7 +1264,7 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
wait_event(wq, atomic_read(&skc->skc_ref) == 0);
spl_magazine_destroy(skc);
spl_slab_reclaim(skc, 1);
spl_slab_reclaim(skc, 0, 1);
spin_lock(&skc->skc_lock);
/* Validate there are no objects in use and free all the
@ -1654,7 +1669,7 @@ spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
if (skc->skc_reclaim)
skc->skc_reclaim(skc->skc_private);
spl_slab_reclaim(skc, 0);
spl_slab_reclaim(skc, skc->skc_reap, 0);
clear_bit(KMC_BIT_REAPING, &skc->skc_flags);
atomic_dec(&skc->skc_ref);