Illumos #734: Use taskq_dispatch_ent() interface

It has been observed that some of the hottest locks are those
of the zio taskqs.  Contention on these locks can limit the
rate at which zios are dispatched which limits performance.

This upstream change from Illumos uses a new interface to the
taskqs which allows them to utilize a preallocated taskq_ent_t.
This removes the need to perform an allocation at dispatch
time while holding the contended lock, which has the effect
of improving system performance.

Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Alexey Zaytsev <alexey.zaytsev@nexenta.com>
Reviewed by: Jason Brian King <jason.brian.king@gmail.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Approved by: Gordon Ross <gwr@nexenta.com>

References to Illumos issue:
  https://www.illumos.org/issues/734

Ported-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #482
This commit is contained in:
Garrett D'Amore 2011-11-07 16:26:52 -08:00 committed by Brian Behlendorf
parent 30a9524e45
commit a38718a63d
5 changed files with 134 additions and 41 deletions

View File

@ -22,6 +22,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _SYS_ZFS_CONTEXT_H #ifndef _SYS_ZFS_CONTEXT_H
#define _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H
@ -365,6 +368,16 @@ typedef struct taskq taskq_t;
typedef uintptr_t taskqid_t; typedef uintptr_t taskqid_t;
typedef void (task_func_t)(void *); typedef void (task_func_t)(void *);
/*
 * A single unit of work queued on a taskq.  Entries are either allocated
 * by the taskq itself or embedded in a caller's structure and handed to
 * taskq_dispatch_ent(), in which case TQENT_FLAG_PREALLOC is set in
 * tqent_flags.
 */
typedef struct taskq_ent {
/* Next entry on the taskq's list (also the free-list link); NULL when idle */
struct taskq_ent *tqent_next;
/* Previous entry on the taskq's list; NULL when idle */
struct taskq_ent *tqent_prev;
/* Function invoked by the taskq thread for this entry */
task_func_t *tqent_func;
/* Argument passed to tqent_func */
void *tqent_arg;
/* TQENT_FLAG_* bits */
uintptr_t tqent_flags;
} taskq_ent_t;
#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */
#define TASKQ_PREPOPULATE 0x0001 #define TASKQ_PREPOPULATE 0x0001
#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */
#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */
@ -385,6 +398,10 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ #define taskq_create_sysdc(a, b, d, e, p, dc, f) \
(taskq_create(a, b, maxclsyspri, d, e, f)) (taskq_create(a, b, maxclsyspri, d, e, f))
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
extern void taskq_init_ent(taskq_ent_t *);
extern void taskq_destroy(taskq_t *); extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *); extern void taskq_wait(taskq_t *);
extern int taskq_member(taskq_t *, kthread_t *); extern int taskq_member(taskq_t *, kthread_t *);

View File

@ -22,6 +22,9 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/ */
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _ZIO_H #ifndef _ZIO_H
#define _ZIO_H #define _ZIO_H
@ -423,6 +426,9 @@ struct zio {
/* FMA state */ /* FMA state */
zio_cksum_report_t *io_cksum_report; zio_cksum_report_t *io_cksum_report;
uint64_t io_ena; uint64_t io_ena;
/* Taskq dispatching state */
taskq_ent_t io_tqent;
}; };
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,

View File

@ -22,19 +22,15 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
int taskq_now; int taskq_now;
taskq_t *system_taskq; taskq_t *system_taskq;
typedef struct task {
struct task *task_next;
struct task *task_prev;
task_func_t *task_func;
void *task_arg;
} task_t;
#define TASKQ_ACTIVE 0x00010000 #define TASKQ_ACTIVE 0x00010000
struct taskq { struct taskq {
@ -51,18 +47,19 @@ struct taskq {
int tq_maxalloc; int tq_maxalloc;
kcondvar_t tq_maxalloc_cv; kcondvar_t tq_maxalloc_cv;
int tq_maxalloc_wait; int tq_maxalloc_wait;
task_t *tq_freelist; taskq_ent_t *tq_freelist;
task_t tq_task; taskq_ent_t tq_task;
}; };
static task_t * static taskq_ent_t *
task_alloc(taskq_t *tq, int tqflags) task_alloc(taskq_t *tq, int tqflags)
{ {
task_t *t; taskq_ent_t *t;
int rv; int rv;
again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
tq->tq_freelist = t->task_next; ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
tq->tq_freelist = t->tqent_next;
} else { } else {
if (tq->tq_nalloc >= tq->tq_maxalloc) { if (tq->tq_nalloc >= tq->tq_maxalloc) {
if (!(tqflags & KM_SLEEP)) if (!(tqflags & KM_SLEEP))
@ -87,25 +84,28 @@ again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
} }
mutex_exit(&tq->tq_lock); mutex_exit(&tq->tq_lock);
t = kmem_alloc(sizeof (task_t), tqflags); t = kmem_alloc(sizeof (taskq_ent_t), tqflags);
mutex_enter(&tq->tq_lock); mutex_enter(&tq->tq_lock);
if (t != NULL) if (t != NULL) {
/* Make sure we start without any flags */
t->tqent_flags = 0;
tq->tq_nalloc++; tq->tq_nalloc++;
} }
}
return (t); return (t);
} }
static void static void
task_free(taskq_t *tq, task_t *t) task_free(taskq_t *tq, taskq_ent_t *t)
{ {
if (tq->tq_nalloc <= tq->tq_minalloc) { if (tq->tq_nalloc <= tq->tq_minalloc) {
t->task_next = tq->tq_freelist; t->tqent_next = tq->tq_freelist;
tq->tq_freelist = t; tq->tq_freelist = t;
} else { } else {
tq->tq_nalloc--; tq->tq_nalloc--;
mutex_exit(&tq->tq_lock); mutex_exit(&tq->tq_lock);
kmem_free(t, sizeof (task_t)); kmem_free(t, sizeof (taskq_ent_t));
mutex_enter(&tq->tq_lock); mutex_enter(&tq->tq_lock);
} }
@ -116,7 +116,7 @@ task_free(taskq_t *tq, task_t *t)
taskqid_t taskqid_t
taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
{ {
task_t *t; taskq_ent_t *t;
if (taskq_now) { if (taskq_now) {
func(arg); func(arg);
@ -130,26 +130,77 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
return (0); return (0);
} }
if (tqflags & TQ_FRONT) { if (tqflags & TQ_FRONT) {
t->task_next = tq->tq_task.task_next; t->tqent_next = tq->tq_task.tqent_next;
t->task_prev = &tq->tq_task; t->tqent_prev = &tq->tq_task;
} else { } else {
t->task_next = &tq->tq_task; t->tqent_next = &tq->tq_task;
t->task_prev = tq->tq_task.task_prev; t->tqent_prev = tq->tq_task.tqent_prev;
} }
t->task_next->task_prev = t; t->tqent_next->tqent_prev = t;
t->task_prev->task_next = t; t->tqent_prev->tqent_next = t;
t->task_func = func; t->tqent_func = func;
t->task_arg = arg; t->tqent_arg = arg;
ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
cv_signal(&tq->tq_dispatch_cv); cv_signal(&tq->tq_dispatch_cv);
mutex_exit(&tq->tq_lock); mutex_exit(&tq->tq_lock);
return (1); return (1);
} }
/*
 * Report whether an entry is currently unlinked from any taskq.
 *
 * Returns 1 when the entry's next pointer is NULL (the state left by
 * taskq_init_ent() and by the taskq thread after it dequeues the entry),
 * and 0 otherwise.
 */
int
taskq_empty_ent(taskq_ent_t *t)
{
	if (t->tqent_next != NULL)
		return (0);

	return (1);
}
/*
 * Reset a caller-owned entry to its idle state: no list linkage, no
 * function or argument, and no flags.  An entry in this state is
 * reported as empty by taskq_empty_ent().
 */
void
taskq_init_ent(taskq_ent_t *t)
{
	t->tqent_flags = 0;
	t->tqent_arg = NULL;
	t->tqent_func = NULL;
	t->tqent_prev = t->tqent_next = NULL;
}
/*
 * Queue func(arg) on tq using the caller-supplied entry t, so that no
 * allocation happens at dispatch time.
 *
 *   tq    - target taskq; must not be a TASKQ_DYNAMIC queue
 *   func  - function to run; must not be NULL
 *   arg   - argument handed to func
 *   flags - TQ_FRONT inserts at the head of the queue, otherwise the
 *           entry is appended at the tail
 *   t     - caller-owned entry; it stays owned by the caller and is not
 *           freed by the taskq thread
 *
 * Nothing is returned.  Unlike taskq_dispatch(), this path does not
 * consult taskq_now.
 */
void
taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
    taskq_ent_t *t)
{
	/* Sentinel node of the taskq's circular doubly-linked list. */
	taskq_ent_t *head = &tq->tq_task;
	int front = (flags & TQ_FRONT) != 0;

	ASSERT(func != NULL);
	ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));

	/*
	 * Tag the entry as preallocated so the taskq thread leaves its
	 * storage alone once the task has run.
	 */
	t->tqent_flags |= TQENT_FLAG_PREALLOC;

	mutex_enter(&tq->tq_lock);

	/* Pick neighbours: just after the sentinel, or just before it. */
	t->tqent_next = front ? head->tqent_next : head;
	t->tqent_prev = front ? head : head->tqent_prev;

	/* Splice the entry in between its chosen neighbours. */
	t->tqent_next->tqent_prev = t;
	t->tqent_prev->tqent_next = t;

	t->tqent_func = func;
	t->tqent_arg = arg;

	/* Wake a worker waiting for work. */
	cv_signal(&tq->tq_dispatch_cv);
	mutex_exit(&tq->tq_lock);
}
void void
taskq_wait(taskq_t *tq) taskq_wait(taskq_t *tq)
{ {
mutex_enter(&tq->tq_lock); mutex_enter(&tq->tq_lock);
while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0) while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
cv_wait(&tq->tq_wait_cv, &tq->tq_lock); cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
mutex_exit(&tq->tq_lock); mutex_exit(&tq->tq_lock);
} }
@ -158,26 +209,31 @@ static void
taskq_thread(void *arg) taskq_thread(void *arg)
{ {
taskq_t *tq = arg; taskq_t *tq = arg;
task_t *t; taskq_ent_t *t;
boolean_t prealloc;
mutex_enter(&tq->tq_lock); mutex_enter(&tq->tq_lock);
while (tq->tq_flags & TASKQ_ACTIVE) { while (tq->tq_flags & TASKQ_ACTIVE) {
if ((t = tq->tq_task.task_next) == &tq->tq_task) { if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
if (--tq->tq_active == 0) if (--tq->tq_active == 0)
cv_broadcast(&tq->tq_wait_cv); cv_broadcast(&tq->tq_wait_cv);
cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
tq->tq_active++; tq->tq_active++;
continue; continue;
} }
t->task_prev->task_next = t->task_next; t->tqent_prev->tqent_next = t->tqent_next;
t->task_next->task_prev = t->task_prev; t->tqent_next->tqent_prev = t->tqent_prev;
t->tqent_next = NULL;
t->tqent_prev = NULL;
prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
mutex_exit(&tq->tq_lock); mutex_exit(&tq->tq_lock);
rw_enter(&tq->tq_threadlock, RW_READER); rw_enter(&tq->tq_threadlock, RW_READER);
t->task_func(t->task_arg); t->tqent_func(t->tqent_arg);
rw_exit(&tq->tq_threadlock); rw_exit(&tq->tq_threadlock);
mutex_enter(&tq->tq_lock); mutex_enter(&tq->tq_lock);
if (!prealloc)
task_free(tq, t); task_free(tq, t);
} }
tq->tq_nthreads--; tq->tq_nthreads--;
@ -217,8 +273,8 @@ taskq_create(const char *name, int nthreads, pri_t pri,
tq->tq_nthreads = nthreads; tq->tq_nthreads = nthreads;
tq->tq_minalloc = minalloc; tq->tq_minalloc = minalloc;
tq->tq_maxalloc = maxalloc; tq->tq_maxalloc = maxalloc;
tq->tq_task.task_next = &tq->tq_task; tq->tq_task.tqent_next = &tq->tq_task;
tq->tq_task.task_prev = &tq->tq_task; tq->tq_task.tqent_prev = &tq->tq_task;
tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP); tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP);
if (flags & TASKQ_PREPOPULATE) { if (flags & TASKQ_PREPOPULATE) {

View File

@ -22,6 +22,9 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/ */
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
/* /*
* This file contains all the routines used when modifying on-disk SPA state. * This file contains all the routines used when modifying on-disk SPA state.
@ -665,7 +668,7 @@ spa_create_zio_taskqs(spa_t *spa)
const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
enum zti_modes mode = ztip->zti_mode; enum zti_modes mode = ztip->zti_mode;
uint_t value = ztip->zti_value; uint_t value = ztip->zti_value;
uint_t flags = TASKQ_PREPOPULATE; uint_t flags = 0;
char name[32]; char name[32];
if (t == ZIO_TYPE_WRITE) if (t == ZIO_TYPE_WRITE)

View File

@ -21,6 +21,7 @@
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -570,6 +571,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio_add_child(pio, zio); zio_add_child(pio, zio);
} }
taskq_init_ent(&zio->io_tqent);
return (zio); return (zio);
} }
@ -1073,7 +1076,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{ {
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type; zio_type_t t = zio->io_type;
int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0); int flags = (cutinline ? TQ_FRONT : 0);
/* /*
* If we're a config writer or a probe, the normal issue and * If we're a config writer or a probe, the normal issue and
@ -1098,8 +1101,14 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
ASSERT3U(q, <, ZIO_TASKQ_TYPES); ASSERT3U(q, <, ZIO_TASKQ_TYPES);
while (taskq_dispatch(spa->spa_zio_taskq[t][q], /*
(task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */ * NB: We are assuming that the zio can only be dispatched
* to a single taskq at a time. It would be a grievous error
* to dispatch the zio to another taskq at the same time.
*/
ASSERT(taskq_empty_ent(&zio->io_tqent));
taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
(task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
} }
static boolean_t static boolean_t
@ -2947,9 +2956,11 @@ zio_done(zio_t *zio)
* Reexecution is potentially a huge amount of work. * Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq. * Hand it off to the otherwise-unused claim taskq.
*/ */
(void) taskq_dispatch( ASSERT(taskq_empty_ent(&zio->io_tqent));
(void) taskq_dispatch_ent(
zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
(task_func_t *)zio_reexecute, zio, TQ_SLEEP); (task_func_t *)zio_reexecute, zio, 0,
&zio->io_tqent);
} }
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_STOP);
} }