From 436ad60faa971dc62f30ebd5c79fa55722234147 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 9 Jan 2015 14:00:34 -0800 Subject: [PATCH] Fix kmem cache deadlock logic The kmem cache implementation always adds new slabs by dispatching a task to the spl_kmem_cache taskq to perform the allocation. This is done because large slabs must be allocated using vmalloc(). It is possible these allocations will block on IO because the GFP_NOIO flag is not honored. This can result in a deadlock. Therefore, a deadlock detection strategy was implemented to deal with this case. When it is determined, by timeout, that the spl_kmem_cache thread has deadlocked attempting to add a new slab. Then all callers attempting to allocate from the cache fall back to using kmalloc() which does honor all passed flags. This logic was correct but an optimization in the code allowed for a deadlock. Because only slabs backed by vmalloc() can deadlock in the way described above. An optimization was made to only invoke this deadlock detection code for vmalloc() backed caches. This had the advantage of making it easy to distinguish these objects when they were freed. But this isn't strictly safe. If all the spl_kmem_cache threads end up deadlocked than we can't grow any of the other caches either. This can once again result in a deadlock if memory needs to be allocated from one of these other caches to ensure forward progress. The fix here is to remove the optimization which limits this fall back allocation stratagy to vmalloc() backed caches. Doing this means we may need to take the cache lock in spl_kmem_cache_free() call path. But this small cost can be mitigated by ignoring objects with virtual addresses. For good measure the default number of spl_kmem_cache threads has been increased from 1 to 4, and made tunable. This alone wouldn't resolve the original issue since it's still possible for all the threads to be deadlocked. However, it does help responsiveness by ensuring that a single deadlocked spl_kmem_cache thread doesn't block allocations from other caches until the timeout is reached. Signed-off-by: Brian Behlendorf --- module/spl/spl-kmem-cache.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c index 5160aa1cff..38fee703db 100644 --- a/module/spl/spl-kmem-cache.c +++ b/module/spl/spl-kmem-cache.c @@ -139,6 +139,15 @@ module_param(spl_kmem_cache_kmem_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, "Objects less than N bytes use the kmalloc"); +/* + * The number of threads available to allocate new slabs for caches. This + * should not need to be tuned but it is available for performance analysis. + */ +unsigned int spl_kmem_cache_kmem_threads = 4; +module_param(spl_kmem_cache_kmem_threads, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, + "Number of spl_kmem_cache threads"); + /* * Slab allocation interfaces * @@ -544,14 +553,14 @@ spl_emergency_free(spl_kmem_cache_t *skc, void *obj) spin_lock(&skc->skc_lock); ske = spl_emergency_search(&skc->skc_emergency_tree, obj); - if (likely(ske)) { + if (ske) { rb_erase(&ske->ske_node, &skc->skc_emergency_tree); skc->skc_obj_emergency--; skc->skc_obj_total--; } spin_unlock(&skc->skc_lock); - if (unlikely(ske == NULL)) + if (ske == NULL) return (-ENOENT); kfree(ske->ske_obj); @@ -1237,7 +1246,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) remaining = wait_event_timeout(skc->skc_waitq, spl_cache_grow_wait(skc), HZ / 10); - if (!remaining && test_bit(KMC_BIT_VMEM, &skc->skc_flags)) { + if (!remaining) { spin_lock(&skc->skc_lock); if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); @@ -1464,6 +1473,7 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) spl_kmem_magazine_t *skm; unsigned long flags; int do_reclaim = 0; + int do_emergency = 0; ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); @@ -1484,13 +1494,18 @@ spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) } /* - * Only virtual slabs may have emergency objects and these objects - * are guaranteed to have physical addresses. They must be removed - * from the tree of emergency objects and the freed. + * While a cache has outstanding emergency objects all freed objects + * must be checked. However, since emergency objects will never use + * a virtual address these objects can be safely excluded as an + * optimization. */ - if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) { - spl_emergency_free(skc, obj); - goto out; + if (!is_vmalloc_addr(obj)) { + spin_lock(&skc->skc_lock); + do_emergency = (skc->skc_obj_emergency > 0); + spin_unlock(&skc->skc_lock); + + if (do_emergency && (spl_emergency_free(skc, obj) == 0)) + goto out; } local_irq_save(flags); @@ -1702,7 +1717,7 @@ spl_kmem_cache_init(void) init_rwsem(&spl_kmem_cache_sem); INIT_LIST_HEAD(&spl_kmem_cache_list); spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", - 1, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); + spl_kmem_cache_kmem_threads, maxclsyspri, 1, 32, TASKQ_PREPOPULATE); spl_register_shrinker(&spl_kmem_cache_shrinker); return (0);