Skip to content

Commit 844ae58

Browse files
committed
Reduce ZIO io_lock contention on sorted scrub
During a sorted scrub, multiple threads (one per vdev) are issuing many ZIOs at the same time, all using the same scn->scn_zio_root ZIO as parent. This causes huge lock contention on the single global lock on that ZIO. Improve it by introducing per-queue null ZIOs, children of that one, and using them instead as proxies. For a 12-SSD pool storing 1.5TB of 4KB blocks on an 80-core system this dramatically reduces lock contention and reduces scrub time from 21 minutes down to 12.5, while the actual read stages (not scan) are about 3x faster, reaching 100K blocks per second per vdev. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc. Closes openzfs#13553
1 parent 3358f2d commit 844ae58

File tree

1 file changed

+15
-4
lines changed

1 file changed

+15
-4
lines changed

module/zfs/dsl_scan.c

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ typedef struct scan_io {
279279
struct dsl_scan_io_queue {
280280
dsl_scan_t *q_scn; /* associated dsl_scan_t */
281281
vdev_t *q_vd; /* top-level vdev that this queue represents */
282+
zio_t *q_zio; /* scn_zio_root child for waiting on IO */
282283

283284
/* trees used for sorting I/Os and extents of I/Os */
284285
range_tree_t *q_exts_by_addr;
@@ -3021,15 +3022,19 @@ scan_io_queues_run_one(void *arg)
30213022
dsl_scan_io_queue_t *queue = arg;
30223023
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
30233024
boolean_t suspended = B_FALSE;
3024-
range_seg_t *rs = NULL;
3025-
scan_io_t *sio = NULL;
3025+
range_seg_t *rs;
3026+
scan_io_t *sio;
3027+
zio_t *zio;
30263028
list_t sio_list;
30273029

30283030
ASSERT(queue->q_scn->scn_is_sorted);
30293031

30303032
list_create(&sio_list, sizeof (scan_io_t),
30313033
offsetof(scan_io_t, sio_nodes.sio_list_node));
3034+
zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
3035+
NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
30323036
mutex_enter(q_lock);
3037+
queue->q_zio = zio;
30333038

30343039
/* Calculate maximum in-flight bytes for this vdev. */
30353040
queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
@@ -3096,7 +3101,9 @@ scan_io_queues_run_one(void *arg)
30963101
scan_io_queue_insert_impl(queue, sio);
30973102
}
30983103

3104+
queue->q_zio = NULL;
30993105
mutex_exit(q_lock);
3106+
zio_nowait(zio);
31003107
list_destroy(&sio_list);
31013108
}
31023109

@@ -4052,6 +4059,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
40524059
dsl_scan_t *scn = dp->dp_scan;
40534060
size_t size = BP_GET_PSIZE(bp);
40544061
abd_t *data = abd_alloc_for_io(size, B_FALSE);
4062+
zio_t *pio;
40554063

40564064
if (queue == NULL) {
40574065
ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
@@ -4060,6 +4068,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
40604068
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
40614069
spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
40624070
mutex_exit(&spa->spa_scrub_lock);
4071+
pio = scn->scn_zio_root;
40634072
} else {
40644073
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
40654074

@@ -4068,12 +4077,14 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
40684077
while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
40694078
cv_wait(&queue->q_zio_cv, q_lock);
40704079
queue->q_inflight_bytes += BP_GET_PSIZE(bp);
4080+
pio = queue->q_zio;
40714081
mutex_exit(q_lock);
40724082
}
40734083

4084+
ASSERT(pio != NULL);
40744085
count_block(scn, dp->dp_blkstats, bp);
4075-
zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
4076-
dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
4086+
zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done,
4087+
queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
40774088
}
40784089

40794090
/*

0 commit comments

Comments
 (0)