Skip to content

Commit cb758c7

Browse files
committed
Fix concurrent resilvers initiated at same time
For draid vdevs it was possible to initiate both the sequential and
healing resilver at the same time. This fixes the following two scenarios.

1) There's a window where a sequential rebuild can be started via ZED
   even if a healing resilver has been scheduled.
   - This is fixed by adding an additional check in spa_vdev_attach() for
     any scheduled resilver and returning the appropriate error code when
     a resilver is already in progress.

2) It was possible for zpool clear to start a healing resilver when it
   wasn't needed at all. This occurs because during a vdev_open() the
   device is presumed to be healthy until the device is validated by
   vdev_validate() and it's set unavailable. However, by this point an
   async resilver will have already been requested if the DTL isn't empty.
   - This is fixed by cancelling the SPA_ASYNC_RESILVER request
     immediately at the end of vdev_reopen() when a resilver is unneeded.

Finally, added a testcase in ZTS for verification.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Dipak Ghosh <[email protected]>
Signed-off-by: Akash B <[email protected]>
Issue #14881
1 parent ad0a554 commit cb758c7

File tree

5 files changed

+120
-3
lines changed

5 files changed

+120
-3
lines changed

module/zfs/spa.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
* Copyright 2017 Joyent, Inc.
3434
* Copyright (c) 2017, Intel Corporation.
3535
* Copyright (c) 2021, Colm Buckley <[email protected]>
36+
* Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
3637
*/
3738

3839
/*
@@ -6874,9 +6875,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
68746875
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
68756876
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
68766877

6877-
if (dsl_scan_resilvering(spa_get_dsl(spa)))
6878+
if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
6879+
dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
68786880
return (spa_vdev_exit(spa, NULL, txg,
68796881
ZFS_ERR_RESILVER_IN_PROGRESS));
6882+
}
68806883
} else {
68816884
if (vdev_rebuild_active(rvd))
68826885
return (spa_vdev_exit(spa, NULL, txg,

module/zfs/vdev.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
* Copyright (c) 2017, Intel Corporation.
3030
* Copyright (c) 2019, Datto Inc. All rights reserved.
3131
* Copyright (c) 2021, Klara Inc.
32-
* Copyright [2021] Hewlett Packard Enterprise Development LP
32+
* Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
3333
*/
3434

3535
#include <sys/zfs_context.h>
@@ -2699,6 +2699,17 @@ vdev_reopen(vdev_t *vd)
26992699
(void) vdev_validate(vd);
27002700
}
27012701

2702+
/*
2703+
* Recheck if resilver is still needed and cancel any
2704+
* scheduled resilver if resilver is unneeded.
2705+
*/
2706+
if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
2707+
spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
2708+
mutex_enter(&spa->spa_async_lock);
2709+
spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
2710+
mutex_exit(&spa->spa_async_lock);
2711+
}
2712+
27022713
/*
27032714
* Reassess parent vdev's health.
27042715
*/

tests/runfiles/common.run

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,8 @@ tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
472472
tags = ['functional', 'cli_root', 'zpool_replace']
473473

474474
[tests/functional/cli_root/zpool_resilver]
475-
tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart']
475+
tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart',
476+
'zpool_resilver_concurrent']
476477
tags = ['functional', 'cli_root', 'zpool_resilver']
477478

478479
[tests/functional/cli_root/zpool_scrub]

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
11421142
functional/cli_root/zpool_resilver/setup.ksh \
11431143
functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
11441144
functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
1145+
functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
11451146
functional/cli_root/zpool_scrub/cleanup.ksh \
11461147
functional/cli_root/zpool_scrub/setup.ksh \
11471148
functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#!/bin/ksh -p
2+
#
3+
# CDDL HEADER START
4+
#
5+
# The contents of this file are subject to the terms of the
6+
# Common Development and Distribution License (the "License").
7+
# You may not use this file except in compliance with the License.
8+
#
9+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10+
# or http://www.opensolaris.org/os/licensing.
11+
# See the License for the specific language governing permissions
12+
# and limitations under the License.
13+
#
14+
# When distributing Covered Code, include this CDDL HEADER in each
15+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16+
# If applicable, add the following below this CDDL HEADER, with the
17+
# fields enclosed by brackets "[]" replaced with your own identifying
18+
# information: Portions Copyright [yyyy] [name of copyright owner]
19+
#
20+
# CDDL HEADER END
21+
#
22+
23+
#
24+
# Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
25+
#
26+
27+
. $STF_SUITE/include/libtest.shlib
28+
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
29+
30+
#
31+
# DESCRIPTION:
32+
# Verify 'zpool clear' doesn't cause concurrent resilvers
33+
#
34+
# STRATEGY:
35+
# 1. Create N(10) virtual disk files.
36+
# 2. Create draid pool based on the virtual disk files.
37+
# 3. Fill the filesystem with directories and files.
38+
# 4. Force-fault 2 vdevs and verify distributed spare is kicked in.
39+
# 5. Free the distributed spare by replacing the faulty drive.
40+
# 6. Run zpool clear and verify that it does not initiate 2 resilvers
41+
# concurrently while distributed spare gets kicked in.
42+
#
43+
44+
verify_runnable "global"
45+
46+
typeset -ir devs=10
47+
typeset -ir nparity=1
48+
typeset -ir ndata=8
49+
typeset -ir dspare=1
50+
51+
function cleanup
52+
{
53+
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
54+
55+
for i in {0..$devs}; do
56+
log_must rm -f "$BASEDIR/vdev$i"
57+
done
58+
59+
for dir in $BASEDIR; do
60+
if [[ -d $dir ]]; then
61+
log_must rm -rf $dir
62+
fi
63+
done
64+
65+
zed_stop
66+
zed_cleanup
67+
}
68+
69+
log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers"
70+
log_onexit cleanup
71+
72+
setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs
73+
74+
# ZED needed for sequential resilver
75+
zed_setup
76+
log_must zed_start
77+
78+
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5
79+
log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
80+
log_must zpool wait -t resilver $TESTPOOL
81+
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6
82+
83+
log_must zpool labelclear -f $BASEDIR/vdev5
84+
log_must zpool labelclear -f $BASEDIR/vdev6
85+
86+
log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5
87+
sync_pool $TESTPOOL
88+
89+
log_must zpool events -c
90+
log_must zpool clear $TESTPOOL
91+
log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
92+
log_must zpool wait -t resilver $TESTPOOL
93+
log_must zpool wait -t scrub $TESTPOOL
94+
95+
nof_resilver=$(zpool events | grep -c resilver_start)
96+
if [ $nof_resilver = 1 ] ; then
97+
log_must verify_pool $TESTPOOL
98+
log_pass "zpool clear on draid pool doesn't cause concurrent resilvers"
99+
else
100+
log_fail "FAIL: sequential and healing resilver initiated concurrently"
101+
fi

0 commit comments

Comments
 (0)