Fix vdev_queue_aggregate() deadlock
This deadlock may manifest itself in slightly different ways, but at the core it is caused by a memory allocation blocking on filesystem reclaim in the zio pipeline. This is normally impossible because zio_execute() disables filesystem reclaim by setting PF_FSTRANS on the thread. However, kmem cache allocations may still indirectly block on filesystem reclaim while holding the critical vq->vq_lock as shown below. To resolve this issue, zio_buf_alloc_flags() is introduced, which allows allocation flags to be passed. This can then be used in vdev_queue_aggregate() with KM_NOSLEEP when allocating the aggregate IO buffer. Since aggregating the IO is purely a performance optimization, we want this to either succeed or fail quickly. Trying too hard to allocate this memory under the vq->vq_lock can negatively impact performance and result in this deadlock. * z_wr_iss zio_vdev_io_start vdev_queue_io -> Takes vq->vq_lock vdev_queue_io_to_issue vdev_queue_aggregate zio_buf_alloc -> Waiting on spl_kmem_cache process * z_wr_int zio_vdev_io_done vdev_queue_io_done mutex_lock -> Waiting on vq->vq_lock held by z_wr_iss * txg_sync spa_sync dsl_pool_sync zio_wait -> Waiting on zio being handled by z_wr_int * spl_kmem_cache spl_cache_grow_work kv_alloc spl_vmalloc ... evict zpl_evict_inode zfs_inactive dmu_tx_wait txg_wait_open -> Waiting on txg_sync Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Chunwei Chen <david.chen@osnexus.com> Signed-off-by: Tim Chase <tim@chase2k.com> Closes #3808 Closes #3867
This commit is contained in:
parent
a8ad3bf02c
commit
6fe53787f3
|
@ -525,6 +525,7 @@ extern void *zio_buf_alloc(size_t size);
|
||||||
extern void zio_buf_free(void *buf, size_t size);
|
extern void zio_buf_free(void *buf, size_t size);
|
||||||
extern void *zio_data_buf_alloc(size_t size);
|
extern void *zio_data_buf_alloc(size_t size);
|
||||||
extern void zio_data_buf_free(void *buf, size_t size);
|
extern void zio_data_buf_free(void *buf, size_t size);
|
||||||
|
extern void *zio_buf_alloc_flags(size_t size, int flags);
|
||||||
|
|
||||||
extern void zio_resubmit_stage_async(void *);
|
extern void zio_resubmit_stage_async(void *);
|
||||||
|
|
||||||
|
|
|
@ -502,6 +502,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
|
||||||
boolean_t stretch = B_FALSE;
|
boolean_t stretch = B_FALSE;
|
||||||
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
|
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
|
||||||
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
|
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
|
||||||
|
void *buf;
|
||||||
|
|
||||||
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
|
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
|
||||||
return (NULL);
|
return (NULL);
|
||||||
|
@ -608,8 +609,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
|
||||||
size = IO_SPAN(first, last);
|
size = IO_SPAN(first, last);
|
||||||
ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
|
ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
|
||||||
|
|
||||||
|
buf = zio_buf_alloc_flags(size, KM_NOSLEEP);
|
||||||
|
if (buf == NULL)
|
||||||
|
return (NULL);
|
||||||
|
|
||||||
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
|
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
|
||||||
zio_buf_alloc(size), size, first->io_type, zio->io_priority,
|
buf, size, first->io_type, zio->io_priority,
|
||||||
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
|
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
|
||||||
vdev_queue_agg_io_done, NULL);
|
vdev_queue_agg_io_done, NULL);
|
||||||
aio->io_timestamp = first->io_timestamp;
|
aio->io_timestamp = first->io_timestamp;
|
||||||
|
|
|
@ -248,6 +248,20 @@ zio_data_buf_alloc(size_t size)
|
||||||
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
|
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use zio_buf_alloc_flags when specific allocation flags are needed, e.g.
|
||||||
|
* passing KM_NOSLEEP when it is acceptable for an allocation to fail.
|
||||||
|
*/
|
||||||
|
void *
|
||||||
|
zio_buf_alloc_flags(size_t size, int flags)
|
||||||
|
{
|
||||||
|
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
|
||||||
|
|
||||||
|
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
|
||||||
|
|
||||||
|
return (kmem_cache_alloc(zio_buf_cache[c], flags));
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
zio_buf_free(void *buf, size_t size)
|
zio_buf_free(void *buf, size_t size)
|
||||||
{
|
{
|
||||||
|
@ -3475,6 +3489,7 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
|
||||||
EXPORT_SYMBOL(zio_type_name);
|
EXPORT_SYMBOL(zio_type_name);
|
||||||
EXPORT_SYMBOL(zio_buf_alloc);
|
EXPORT_SYMBOL(zio_buf_alloc);
|
||||||
EXPORT_SYMBOL(zio_data_buf_alloc);
|
EXPORT_SYMBOL(zio_data_buf_alloc);
|
||||||
|
EXPORT_SYMBOL(zio_buf_alloc_flags);
|
||||||
EXPORT_SYMBOL(zio_buf_free);
|
EXPORT_SYMBOL(zio_buf_free);
|
||||||
EXPORT_SYMBOL(zio_data_buf_free);
|
EXPORT_SYMBOL(zio_data_buf_free);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue