Commit 06305d7

Scale worker threads and taskqs with number of CPUs.
While the use of dynamic taskqs reduces the number of idle threads, the hardcoded 8 taskqs of each kind are overkill for small systems: they complicate CPU scheduling and increase I/O reordering while providing no real locking benefit, which simply is not needed there. On the other hand, 12*8 worker threads per kind can overload almost any system today. For example, a pool of several fast SSDs with SHA256 checksums makes the system barely responsive during a scrub, or, with dedup enabled, barely responsive during large file deletions.

To address both problems, this patch introduces the ZTI_SCALE macro, similar to ZTI_BATCH but with multiple taskqs depending on the number of CPUs, for use where lock scalability is needed but request ordering is less important. The code now creates a new taskq for roughly every 6 worker threads (fewer on small systems, more on very large ones), with total threads capped at 80% of CPU cores (the previous 75% rounded down poorly). Both the number of threads and the number of threads per taskq are now tunable, in case somebody really wants to use all of the system's power for ZFS.

While some benchmarks obviously show a small reduction in peak performance (not really large, especially on systems with SMT, where the second threads do not add as much performance as the first), they also show a dramatic reduction in latency and much smoother user-space operation under high CPU usage by ZFS.

Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
1 parent 056a658 commit 06305d7
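
To make the scaling concrete, here is a small standalone sketch (not code from the commit) that models the heuristic described above: the old layout always created 12 threads in each of 8 taskqs per kind, while the new ZTI_SCALE default targets about 80% of the CPUs split into taskqs of roughly 6 threads. The CPU counts and constants below are illustrative assumptions mirroring the message, not the exact kernel code.

#include <stdio.h>

int
main(void)
{
	int cpu_counts[] = { 4, 16, 64, 256 };
	int n = sizeof (cpu_counts) / sizeof (cpu_counts[0]);

	for (int i = 0; i < n; i++) {
		int ncpus = cpu_counts[i];
		int old_threads = 12 * 8;	/* old ZTI_P(12, 8): fixed 96 */
		int cpus = ncpus * 80 / 100;	/* ~zio_taskq_batch_pct */
		if (cpus < 1)
			cpus = 1;
		int count = 1 + cpus / 6;	/* prefer ~6 threads per taskq */
		while (count * count > cpus)	/* but no more taskqs than */
			count--;		/* threads in each of them */
		printf("%3d CPUs: old %3d threads in 8 taskqs, "
		    "new ~%3d threads in %d taskq(s)\n",
		    ncpus, old_threads, cpus, count);
	}
	return (0);
}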

2 files changed: 71 additions & 25 deletions

man/man5/zfs-module-parameters.5

Lines changed: 18 additions & 4 deletions
@@ -4079,11 +4079,25 @@ Percentage of online CPUs (or CPU cores, etc) which will run a worker thread
 for I/O. These workers are responsible for I/O work such as compression and
 checksum calculations. Fractional number of CPUs will be rounded down.
 .sp
-The default value of 75 was chosen to avoid using all CPUs which can result in
-latency issues and inconsistent application performance, especially when high
-compression is enabled.
+The default value of 80 was chosen to avoid using all CPUs which can result in
+latency issues and inconsistent application performance, especially when slower
+compression and/or checksumming is enabled.
 .sp
-Default value: \fB75\fR.
+Default value: \fB80\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_taskq_batch_tpq\fR (uint)
+.ad
+.RS 12n
+Number of worker threads per taskq. Lower value improves I/O ordering and
+CPU utilization, while higher reduces lock contention.
+.sp
+By default about 6 worker threads per taskq, depending on system size.
+.sp
+Default value: \fB0\fR.
 .RE
 
 .sp
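
Both tunables documented above are ZFS module parameters, so on Linux they are typically pinned at module load time. A minimal illustrative example follows; the value 4 is just an example, not a recommendation from the commit:

# /etc/modprobe.d/zfs.conf (illustrative)
options zfs zio_taskq_batch_pct=80 zio_taskq_batch_tpq=4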

module/zfs/spa.c

Lines changed: 53 additions & 21 deletions
@@ -108,13 +108,15 @@ int zfs_ccw_retry_interval = 300;
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,		/* value is # of threads (min 1) */
 	ZTI_MODE_BATCH,		/* cpu-intensive; value is ignored */
+	ZTI_MODE_SCALE,		/* Taskqs scale with CPUs. */
 	ZTI_MODE_NULL,		/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
 #define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
+#define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 #define	ZTI_N(n)	ZTI_P(n, 1)
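
For context, the zio_taskq_info_t that these ZTI_* macros initialize is, roughly, a triple of mode, value, and taskq count (paraphrased from the surrounding spa.c; not part of this diff):

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;	/* one of the ZTI_MODE_* values above */
	uint_t zti_value;	/* thread count or CPU percentage */
	uint_t zti_count;	/* number of taskqs of this kind */
} zio_taskq_info_t;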
@@ -141,7 +143,8 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
- * particular taskq is chosen at random.
+ * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH,
+ * but with number of taskqs also scaling with number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
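
To illustrate the "chosen at random" dispatch the comment above describes, a minimal sketch of how one of the stqs_count taskqs might be picked per dispatch (illustrative only, not the actual OpenZFS dispatch routine):

/* Illustrative sketch: spread dispatches across the per-type taskqs. */
static taskq_t *
example_pick_taskq(spa_taskqs_t *tqs)
{
	if (tqs->stqs_count == 1)
		return (tqs->stqs_taskq[0]);
	/* Cheap pseudo-random pick, e.g. keyed off a high-resolution timer. */
	return (tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]);
}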
@@ -150,9 +153,9 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
-	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
-	{ ZTI_BATCH,	ZTI_N(5),	ZTI_P(12, 8),	ZTI_N(5) }, /* WRITE */
-	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
+	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
+	{ ZTI_BATCH,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
+	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
@@ -164,7 +167,8 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
-uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
+uint_t		zio_taskq_batch_pct = 80;	/* 1 thread per cpu in pset */
+uint_t		zio_taskq_batch_tpq;		/* threads per taskq */
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
@@ -957,24 +961,12 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
-	uint_t flags = 0;
+	uint_t cpus, flags = 0;
 	boolean_t batch = B_FALSE;
 
-	if (mode == ZTI_MODE_NULL) {
-		tqs->stqs_count = 0;
-		tqs->stqs_taskq = NULL;
-		return;
-	}
-
-	ASSERT3U(count, >, 0);
-
-	tqs->stqs_count = count;
-	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
-
 	switch (mode) {
 	case ZTI_MODE_FIXED:
-		ASSERT3U(value, >=, 1);
-		value = MAX(value, 1);
+		ASSERT3U(value, >, 0);
 		flags |= TASKQ_DYNAMIC;
 		break;
 
@@ -984,19 +976,56 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 		value = MIN(zio_taskq_batch_pct, 100);
 		break;
 
+	case ZTI_MODE_SCALE:
+		flags |= TASKQ_THREADS_CPU_PCT;
+		/*
+		 * We want more taskqs to reduce lock contention, but we want
+		 * less for better request ordering and CPU utilization.
+		 */
+		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+		if (zio_taskq_batch_tpq > 0) {
+			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
+			    zio_taskq_batch_tpq);
+		} else {
+			/*
+			 * Prefer 6 threads per taskq, but no more taskqs
+			 * than threads in them on large systems.
+			 */
+			count = 1 + cpus / 6;
+			while (count * count > cpus)
+				count--;
+		}
+		/* Limit each taskq within 100% to not trigger assertion. */
+		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
+		value = (zio_taskq_batch_pct + count / 2) / count;
+		break;
+
+	case ZTI_MODE_NULL:
+		tqs->stqs_count = 0;
+		tqs->stqs_taskq = NULL;
+		return;
+
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_activate()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
+	ASSERT3U(count, >, 0);
+	tqs->stqs_count = count;
+	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 		char name[32];
 
-		(void) snprintf(name, sizeof (name), "%s_%s",
-		    zio_type_name[t], zio_taskq_types[q]);
+		if (count > 1)
+			(void) snprintf(name, sizeof (name), "%s_%s_%u",
+			    zio_type_name[t], zio_taskq_types[q], i);
+		else
+			(void) snprintf(name, sizeof (name), "%s_%s",
+			    zio_type_name[t], zio_taskq_types[q]);
 
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			if (batch)
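
As a worked example of the ZTI_MODE_SCALE branch above, assuming the defaults (zio_taskq_batch_pct = 80, zio_taskq_batch_tpq = 0) on a hypothetical 32-CPU machine:

/*
 * cpus  = MAX(1, 32 * 80 / 100)     = 25
 * count = 1 + 25 / 6                = 5    (5 * 5 = 25 <= 25, so it stays)
 * count = MAX(5, (80 + 99) / 100)   = 5    taskqs
 * value = (80 + 5 / 2) / 5          = 16   (% of CPUs per taskq)
 *
 * Result: 5 taskqs (named with the "%s_%s_%u" format above), each sized
 * at 16% of 32 CPUs (~5 threads) via TASKQ_THREADS_CPU_PCT, for roughly
 * 25 worker threads total instead of the previous fixed 12 * 8 = 96.
 */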
@@ -9863,6 +9892,9 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
 	"Percentage of CPUs to run an IO worker thread");
 
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+	"Number of threads per IO worker taskqueue");
+
 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
 	"Allow importing pool with up to this number of missing top-level "
 	"vdevs (in read-only mode)");
