Skip to content

Commit 2041d6e

Browse files
authored
Improve scrub maxinflight_bytes math.
Previously, ZFS scaled maxinflight_bytes based on the total number of disks in the pool. A 3-wide mirror was receiving a queue depth of 3 disks, which it should not, since it reads from all the disks inside. For wide raidz the situation was slightly better, but a 3-wide raidz1 still received a depth of 3 disks instead of 2. The new code counts only unique data disks, i.e. 1 disk for mirrors and non-parity disks for raidz/draid. For draid the math is still imperfect, since vdev_get_nparity() returns the number of parity disks per group, not per vdev, but it is still better than it was. This should slightly reduce scrub influence on payload for some pool topologies by avoiding excessive queuing. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc. Closes #12046
1 parent ba646e3 commit 2041d6e

File tree

2 files changed

+16
-26
lines changed

2 files changed

+16
-26
lines changed

man/man5/zfs-module-parameters.5

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3326,7 +3326,7 @@ Default value: \fB0\fR.
 Maximum amount of data that can be concurrently issued at once for scrubs and
 resilvers per leaf device, given in bytes.
 .sp
-Default value: \fB41943040\fR.
+Default value: \fB4194304\fR.
 .RE

 .sp

module/zfs/dsl_scan.c

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
-static uint64_t dsl_scan_count_leaves(vdev_t *vd);
+static uint64_t dsl_scan_count_data_disks(vdev_t *vd);

 extern int zfs_vdev_async_write_active_min_dirty_percent;

@@ -451,7 +451,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
	 * phase are done per top-level vdev and are handled separately.
	 */
	scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
-	    dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
+	    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);

	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
	    offsetof(scan_ds_t, sds_node));
@@ -2759,22 +2759,16 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 }

 static uint64_t
-dsl_scan_count_leaves(vdev_t *vd)
+dsl_scan_count_data_disks(vdev_t *rvd)
 {
	uint64_t i, leaves = 0;

-	/* we only count leaves that belong to the main pool and are readable */
-	if (vd->vdev_islog || vd->vdev_isspare ||
-	    vd->vdev_isl2cache || !vdev_readable(vd))
-		return (0);
-
-	if (vd->vdev_ops->vdev_op_leaf)
-		return (1);
-
-	for (i = 0; i < vd->vdev_children; i++) {
-		leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
+	for (i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+		if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
+			continue;
+		leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd);
	}
-
	return (leaves);
 }


@@ -3017,18 +3011,16 @@ scan_io_queues_run_one(void *arg)
	range_seg_t *rs = NULL;
	scan_io_t *sio = NULL;
	list_t sio_list;
-	uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
-	uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);

	ASSERT(queue->q_scn->scn_is_sorted);

	list_create(&sio_list, sizeof (scan_io_t),
	    offsetof(scan_io_t, sio_nodes.sio_list_node));
	mutex_enter(q_lock);

-	/* calculate maximum in-flight bytes for this txg (min 1MB) */
-	queue->q_maxinflight_bytes =
-	    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+	/* Calculate maximum in-flight bytes for this vdev. */
+	queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
+	    (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));

	/* reset per-queue scan statistics for this txg */
	queue->q_total_seg_size_this_txg = 0;
@@ -3665,16 +3657,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
		/* Need to scan metadata for more blocks to scrub */
		dsl_scan_phys_t *scnp = &scn->scn_phys;
		taskqid_t prefetch_tqid;
-		uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
-		uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);

		/*
		 * Recalculate the max number of in-flight bytes for pool-wide
		 * scanning operations (minimum 1MB). Limits for the issuing
		 * phase are done per top-level vdev and are handled separately.
		 */
-		scn->scn_maxinflight_bytes =
-		    MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+		scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
+		    dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);

		if (scnp->scn_ddt_bookmark.ddb_class <=
		    scnp->scn_ddt_class_max) {
@@ -4050,9 +4040,8 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
	size_t size = BP_GET_PSIZE(bp);
	abd_t *data = abd_alloc_for_io(size, B_FALSE);

-	ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
-
	if (queue == NULL) {
+		ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
@@ -4061,6 +4050,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
	} else {
		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;

+		ASSERT3U(queue->q_maxinflight_bytes, >, 0);
		mutex_enter(q_lock);
		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
			cv_wait(&queue->q_zio_cv, q_lock);

0 commit comments

Comments (0)