Skip to content

Commit 99741bd

Browse files
authored
zvol: use multiple taskq
Currently, zvol uses a single taskq, resulting in a throughput bottleneck under heavy load due to lock contention on the single taskq. This patch addresses the performance bottleneck under heavy load conditions by utilizing multiple taskqs, thus mitigating lock contention. The number of taskqs scales dynamically based on the available CPUs in the system, as illustrated below:

                taskq   total
cpus    taskqs  threads threads
------- ------- ------- -------
1       1       32      32
2       1       32      32
4       1       32      32
8       2       16      32
16      3       11      33
32      5       7       35
64      8       8       64
128     11      12      132
256     16      16      256

Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Tony Nguyen <[email protected]>
Signed-off-by: Ameer Hamza <[email protected]>
Closes openzfs#15992
1 parent 30c4eba commit 99741bd

File tree

2 files changed

+99
-10
lines changed

2 files changed

+99
-10
lines changed

man/man4/zfs.4

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2387,6 +2387,13 @@ The number of requests which can be handled concurrently is controlled by
23872387
is ignored when running on a kernel that supports block multiqueue
23882388
.Pq Li blk-mq .
23892389
.
2390+
.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint
2391+
Number of zvol taskqs.
2392+
If
2393+
.Sy 0
2394+
(the default) then scaling is done internally to prefer 6 threads per taskq.
2395+
This only applies on Linux.
2396+
.
23902397
.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
23912398
The number of system wide threads to use for processing zvol block IOs.
23922399
If

module/os/linux/zfs/zvol_os.c

Lines changed: 92 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <sys/spa_impl.h>
3838
#include <sys/zvol.h>
3939
#include <sys/zvol_impl.h>
40+
#include <cityhash.h>
4041

4142
#include <linux/blkdev_compat.h>
4243
#include <linux/task_io_accounting_ops.h>
@@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0;
5354
static unsigned int zvol_prefetch_bytes = (128 * 1024);
5455
static unsigned long zvol_max_discard_blocks = 16384;
5556

57+
/*
58+
* Switch taskq at multiples of 512 MB offset. This can be set to a lower value
59+
* to utilize more threads for small files but may affect prefetch hits.
60+
*/
61+
#define ZVOL_TASKQ_OFFSET_SHIFT 29
62+
5663
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
5764
static unsigned int zvol_open_timeout_ms = 1000;
5865
#endif
@@ -74,6 +81,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE;
7481
* read and write tests to a zvol in an NVMe pool (with 16 CPUs).
7582
*/
7683
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
84+
static unsigned int zvol_num_taskqs = 0;
7785
#endif
7886

7987
#ifndef BLKDEV_DEFAULT_RQ
@@ -114,7 +122,11 @@ struct zvol_state_os {
114122
boolean_t use_blk_mq;
115123
};
116124

117-
static taskq_t *zvol_taskq;
125+
typedef struct zv_taskq {
126+
uint_t tqs_cnt;
127+
taskq_t **tqs_taskq;
128+
} zv_taskq_t;
129+
static zv_taskq_t zvol_taskqs;
118130
static struct ida zvol_ida;
119131

120132
typedef struct zv_request_stack {
@@ -532,6 +544,17 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
532544
}
533545

534546
zv_request_task_t *task;
547+
zv_taskq_t *ztqs = &zvol_taskqs;
548+
uint_t blk_mq_hw_queue = 0;
549+
uint_t tq_idx;
550+
uint_t taskq_hash;
551+
#ifdef HAVE_BLK_MQ
552+
if (rq)
553+
blk_mq_hw_queue = rq->mq_hctx->queue_num;
554+
#endif
555+
taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
556+
blk_mq_hw_queue, 0);
557+
tq_idx = taskq_hash % ztqs->tqs_cnt;
535558

536559
if (rw == WRITE) {
537560
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
@@ -601,15 +624,15 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
601624
zvol_discard(&zvr);
602625
} else {
603626
task = zv_request_task_create(zvr);
604-
taskq_dispatch_ent(zvol_taskq,
627+
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
605628
zvol_discard_task, task, 0, &task->ent);
606629
}
607630
} else {
608631
if (force_sync) {
609632
zvol_write(&zvr);
610633
} else {
611634
task = zv_request_task_create(zvr);
612-
taskq_dispatch_ent(zvol_taskq,
635+
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
613636
zvol_write_task, task, 0, &task->ent);
614637
}
615638
}
@@ -631,7 +654,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
631654
zvol_read(&zvr);
632655
} else {
633656
task = zv_request_task_create(zvr);
634-
taskq_dispatch_ent(zvol_taskq,
657+
taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
635658
zvol_read_task, task, 0, &task->ent);
636659
}
637660
}
@@ -1598,8 +1621,40 @@ zvol_init(void)
15981621
zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
15991622
}
16001623

1624+
/*
1625+
* Use at least 32 zvol_threads, but for many-core systems,
1626+
* prefer 6 threads per taskq, but no more taskqs
1627+
* than threads in them on large systems.
1628+
*
1629+
* taskq total
1630+
* cpus taskqs threads threads
1631+
* ------- ------- ------- -------
1632+
* 1 1 32 32
1633+
* 2 1 32 32
1634+
* 4 1 32 32
1635+
* 8 2 16 32
1636+
* 16 3 11 33
1637+
* 32 5 7 35
1638+
* 64 8 8 64
1639+
* 128 11 12 132
1640+
* 256 16 16 256
1641+
*/
1642+
zv_taskq_t *ztqs = &zvol_taskqs;
1643+
uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
1644+
if (num_tqs == 0) {
1645+
num_tqs = 1 + num_online_cpus() / 6;
1646+
while (num_tqs * num_tqs > zvol_actual_threads)
1647+
num_tqs--;
1648+
}
1649+
uint_t per_tq_thread = zvol_actual_threads / num_tqs;
1650+
if (per_tq_thread * num_tqs < zvol_actual_threads)
1651+
per_tq_thread++;
1652+
ztqs->tqs_cnt = num_tqs;
1653+
ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
16011654
error = register_blkdev(zvol_major, ZVOL_DRIVER);
16021655
if (error) {
1656+
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
1657+
ztqs->tqs_taskq = NULL;
16031658
printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
16041659
return (error);
16051660
}
@@ -1619,11 +1674,22 @@ zvol_init(void)
16191674
1024);
16201675
}
16211676
#endif
1622-
zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri,
1623-
zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1624-
if (zvol_taskq == NULL) {
1625-
unregister_blkdev(zvol_major, ZVOL_DRIVER);
1626-
return (-ENOMEM);
1677+
for (uint_t i = 0; i < num_tqs; i++) {
1678+
char name[32];
1679+
(void) snprintf(name, sizeof (name), "%s_tq-%u",
1680+
ZVOL_DRIVER, i);
1681+
ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
1682+
maxclsyspri, per_tq_thread, INT_MAX,
1683+
TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
1684+
if (ztqs->tqs_taskq[i] == NULL) {
1685+
for (int j = i - 1; j >= 0; j--)
1686+
taskq_destroy(ztqs->tqs_taskq[j]);
1687+
unregister_blkdev(zvol_major, ZVOL_DRIVER);
1688+
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1689+
sizeof (taskq_t *));
1690+
ztqs->tqs_taskq = NULL;
1691+
return (-ENOMEM);
1692+
}
16271693
}
16281694

16291695
zvol_init_impl();
@@ -1634,9 +1700,22 @@ zvol_init(void)
16341700
void
16351701
zvol_fini(void)
16361702
{
1703+
zv_taskq_t *ztqs = &zvol_taskqs;
16371704
zvol_fini_impl();
16381705
unregister_blkdev(zvol_major, ZVOL_DRIVER);
1639-
taskq_destroy(zvol_taskq);
1706+
1707+
if (ztqs->tqs_taskq == NULL) {
1708+
ASSERT3U(ztqs->tqs_cnt, ==, 0);
1709+
} else {
1710+
for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
1711+
ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
1712+
taskq_destroy(ztqs->tqs_taskq[i]);
1713+
}
1714+
kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
1715+
sizeof (taskq_t *));
1716+
ztqs->tqs_taskq = NULL;
1717+
}
1718+
16401719
ida_destroy(&zvol_ida);
16411720
}
16421721

@@ -1657,6 +1736,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
16571736
module_param(zvol_max_discard_blocks, ulong, 0444);
16581737
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
16591738

1739+
module_param(zvol_num_taskqs, uint, 0444);
1740+
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
1741+
16601742
module_param(zvol_prefetch_bytes, uint, 0644);
16611743
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
16621744

0 commit comments

Comments
 (0)