From 1229323d5f82bcb0525b312c80330968cd41466e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 24 Jul 2015 10:08:31 -0700 Subject: [PATCH] Align thread priority with Linux defaults Under Linux, filesystem threads responsible for handling I/O are normally created with the maximum priority. Non-I/O filesystem processes run with the default priority. ZFS should adopt the same priority scheme under Linux to maintain good performance and so that it will compete fairly when other Linux filesystems are active. The priorities have been updated to the following: $ ps -eLo rtprio,cls,pid,pri,nice,cmd | egrep 'z_|spl_|zvol|arc|dbu|meta' - TS 10743 19 -20 [spl_kmem_cache] - TS 10744 19 -20 [spl_system_task] - TS 10745 19 -20 [spl_dynamic_tas] - TS 10764 19 0 [dbu_evict] - TS 10765 19 0 [arc_prune] - TS 10766 19 0 [arc_reclaim] - TS 10767 19 0 [arc_user_evicts] - TS 10768 19 0 [l2arc_feed] - TS 10769 39 0 [z_unmount] - TS 10770 39 -20 [zvol] - TS 11011 39 -20 [z_null_iss] - TS 11012 39 -20 [z_null_int] - TS 11013 39 -20 [z_rd_iss] - TS 11014 39 -20 [z_rd_int_0] - TS 11022 38 -19 [z_wr_iss] - TS 11023 39 -20 [z_wr_iss_h] - TS 11024 39 -20 [z_wr_int_0] - TS 11032 39 -20 [z_wr_int_h] - TS 11033 39 -20 [z_fr_iss_0] - TS 11041 39 -20 [z_fr_int] - TS 11042 39 -20 [z_cl_iss] - TS 11043 39 -20 [z_cl_int] - TS 11044 39 -20 [z_ioctl_iss] - TS 11045 39 -20 [z_ioctl_int] - TS 11046 39 -20 [metaslab_group_] - TS 11050 19 0 [z_iput] - TS 11121 38 -19 [z_wr_iss] Note that under Linux the meaning of a process's priority is inverted with respect to illumos. High values on Linux indicate a _low_ priority, while high values on illumos indicate a _high_ priority. In order to preserve the logical meaning of the minclsyspri and maxclsyspri macros when they are used by the illumos wrapper functions, their values have been inverted. This way, when changes are merged from upstream illumos, we won't need to remember to invert the macros at each call site; having to do so by hand would be error-prone and could also lead to confusion.
This patch depends on https://github.com/zfsonlinux/spl/pull/466. Signed-off-by: Brian Behlendorf Signed-off-by: Ned Bass Closes #3607 --- include/sys/zfs_context.h | 9 +++++++-- lib/libzpool/kernel.c | 2 ++ lib/libzpool/taskq.c | 4 ++-- module/zfs/arc.c | 8 ++++---- module/zfs/dbuf.c | 2 +- module/zfs/dmu_objset.c | 2 +- module/zfs/dsl_pool.c | 2 +- module/zfs/metaslab.c | 2 +- module/zfs/spa.c | 8 +++++--- module/zfs/txg.c | 6 +++--- module/zfs/zfs_ctldir.c | 2 +- module/zfs/zil.c | 2 +- 12 files changed, 29 insertions(+), 20 deletions(-) diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 7652a9cae1..4f7e3287f3 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -233,6 +233,7 @@ typedef struct kthread { kt_did_t t_tid; thread_func_t t_func; void * t_arg; + pri_t t_pri; } kthread_t; #define curthread zk_thread_current() @@ -615,8 +616,12 @@ extern void delay(clock_t ticks); #define max_ncpus 64 #define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) -#define minclsyspri 60 -#define maxclsyspri 99 +/* + * Process priorities as defined by setpriority(2) and getpriority(2). 
+ */ +#define minclsyspri 19 +#define maxclsyspri -20 +#define defclsyspri 0 #define CPU_SEQID (pthread_self() & (max_ncpus - 1)) diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 80da41151c..a451026999 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -128,6 +128,7 @@ zk_thread_helper(void *arg) VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); kthread_nr++; VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); + (void) setpriority(PRIO_PROCESS, 0, kt->t_pri); kt->t_tid = pthread_self(); ((thread_func_arg_t) kt->t_func)(kt->t_arg); @@ -151,6 +152,7 @@ zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL); kt->t_func = func; kt->t_arg = arg; + kt->t_pri = pri; VERIFY0(pthread_attr_init(&attr)); VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 59b4c7c05f..bd92e61ab0 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -308,7 +308,7 @@ taskq_create(const char *name, int nthreads, pri_t pri, for (t = 0; t < nthreads; t++) VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, - taskq_thread, tq, TS_RUN, NULL, 0, 0)) != NULL); + taskq_thread, tq, TS_RUN, NULL, 0, pri)) != NULL); return (tq); } @@ -371,7 +371,7 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) void system_taskq_init(void) { - system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, + system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 66e67795ff..c2fdf16309 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5431,7 +5431,7 @@ arc_init(void) mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); - arc_prune_taskq = taskq_create("arc_prune", max_ncpus, minclsyspri, + arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri, max_ncpus, 
INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, @@ -5444,10 +5444,10 @@ arc_init(void) } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); + TS_RUN, defclsyspri); (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); + TS_RUN, defclsyspri); arc_dead = FALSE; arc_warm = B_FALSE; @@ -6954,7 +6954,7 @@ l2arc_start(void) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, - TS_RUN, minclsyspri); + TS_RUN, defclsyspri); } void diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index c7b6a5d9aa..3807418c97 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -413,7 +413,7 @@ retry: * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ - dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); } void diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 56a804dfb6..4d5baf8da8 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1839,7 +1839,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, ntasks = dmu_find_threads; if (ntasks == 0) ntasks = vdev_count_leaves(dp->dp_spa) * 4; - tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, + tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks, INT_MAX, 0); if (tq == NULL) { kmem_free(dcp, sizeof (*dcp)); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 23cf438628..ada0eac63e 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -170,7 +170,7 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, minclsyspri, + dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri, max_ncpus * 
8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); return (dp); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5544859b68..b328cbb0a1 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -492,7 +492,7 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_activation_count = 0; mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, - minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); + maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); return (mg); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d8eaf9979f..2e23a341fb 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -898,11 +898,13 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU - * intensive. Run it at slightly lower priority - * than the other taskqs. + * intensive. Run it at slightly less important + * priority than the other taskqs. Under Linux this + * means incrementing the priority value; on platforms + * like illumos it should be decremented. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) - pri--; + pri++; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 7681856ea1..1d5ee97b13 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -205,7 +205,7 @@ txg_sync_start(dsl_pool_t *dp) tx->tx_threads = 2; tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, - dp, 0, &p0, TS_RUN, minclsyspri); + dp, 0, &p0, TS_RUN, defclsyspri); /* * The sync thread can need a larger-than-default stack size on @@ -213,7 +213,7 @@ txg_sync_start(dsl_pool_t *dp) * scrub_visitbp() recursion. 
*/ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, - dp, 0, &p0, TS_RUN, minclsyspri); + dp, 0, &p0, TS_RUN, defclsyspri); mutex_exit(&tx->tx_sync_lock); } @@ -445,7 +445,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) * Commit callback taskq hasn't been created yet. */ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, + max_ncpus, defclsyspri, max_ncpus, max_ncpus * 2, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); } diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index 59405de82b..efa6cfa0a4 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -1009,7 +1009,7 @@ zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, void zfsctl_init(void) { - zfs_expire_taskq = taskq_create("z_unmount", 1, maxclsyspri, + zfs_expire_taskq = taskq_create("z_unmount", 1, defclsyspri, 1, 8, TASKQ_PREPOPULATE); } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 6a3885816c..289b23c7f4 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1888,7 +1888,7 @@ zil_open(objset_t *os, zil_get_data_t *get_data) ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; - zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, + zilog->zl_clean_taskq = taskq_create("zil_clean", 1, defclsyspri, 2, 2, TASKQ_PREPOPULATE); return (zilog);