Skip to content

Commit 7cdaa4d

Browse files
committed
Too many vdev probe errors should suspend pool
Similar to what we saw in #16569, we need to consider that a replacing vdev should not be considered as fully contributing to the redundancy of a raidz vdev even though current IO has enough redundancy. When a failed vdev_probe() is faulting a disk, it now checks if that disk is required, and if so it suspends the pool until the admin can return the missing disks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady <[email protected]>
1 parent 6c9b4f1 commit 7cdaa4d

File tree

4 files changed

+169
-8
lines changed

4 files changed

+169
-8
lines changed

module/zfs/spa.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
89488948
}
89498949

89508950
static void
8951-
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
8951+
spa_async_fault_vdev(vdev_t *vd, int *suspend)
89528952
{
89538953
if (vd->vdev_fault_wanted) {
8954+
vdev_state_t newstate = VDEV_STATE_FAULTED;
89548955
vd->vdev_fault_wanted = B_FALSE;
8955-
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
8956-
VDEV_AUX_ERR_EXCEEDED);
8957-
}
89588956

8957+
/*
8958+
* If this device has the only valid copy of the data, then
8959+
* back off and simply mark the vdev as degraded instead.
8960+
*/
8961+
if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL &&
8962+
vdev_dtl_required(vd)) {
8963+
newstate = VDEV_STATE_DEGRADED;
8964+
/* A required disk is missing so suspend the pool */
8965+
*suspend |= 1;
8966+
}
8967+
vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
8968+
}
89598969
for (int c = 0; c < vd->vdev_children; c++)
8960-
spa_async_fault_vdev(spa, vd->vdev_child[c]);
8970+
spa_async_fault_vdev(vd->vdev_child[c], suspend);
89618971
}
89628972

89638973
static void
@@ -9049,8 +9059,13 @@ spa_async_thread(void *arg)
90499059
*/
90509060
if (tasks & SPA_ASYNC_FAULT_VDEV) {
90519061
spa_vdev_state_enter(spa, SCL_NONE);
9052-
spa_async_fault_vdev(spa, spa->spa_root_vdev);
9062+
int suspend = 0;
9063+
spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
90539064
(void) spa_vdev_state_exit(spa, NULL, 0);
9065+
if (suspend) {
9066+
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
9067+
zio_resume_wait(spa);
9068+
}
90549069
}
90559070

90569071
/*

tests/runfiles/linux.run

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
125125
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
126126
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
127127
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
128-
'fault_limits', 'scrub_after_resilver', 'suspend_resume_single',
129-
'zpool_status_-s']
128+
'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors',
129+
'suspend_resume_single', 'zpool_status_-s']
130130
tags = ['functional', 'fault']
131131

132132
[tests/functional/features/large_dnode:Linux]

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,6 +1531,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
15311531
functional/fault/decrypt_fault.ksh \
15321532
functional/fault/fault_limits.ksh \
15331533
functional/fault/scrub_after_resilver.ksh \
1534+
functional/fault/suspend_on_probe_errors.ksh \
15341535
functional/fault/suspend_resume_single.ksh \
15351536
functional/fault/setup.ksh \
15361537
functional/fault/zpool_status_-s.ksh \
tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#!/bin/ksh -p
2+
#
3+
# CDDL HEADER START
4+
#
5+
# The contents of this file are subject to the terms of the
6+
# Common Development and Distribution License (the "License").
7+
# You may not use this file except in compliance with the License.
8+
#
9+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10+
# or https://opensource.org/licenses/CDDL-1.0.
11+
# See the License for the specific language governing permissions
12+
# and limitations under the License.
13+
#
14+
# When distributing Covered Code, include this CDDL HEADER in each
15+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16+
# If applicable, add the following below this CDDL HEADER, with the
17+
# fields enclosed by brackets "[]" replaced with your own identifying
18+
# information: Portions Copyright [yyyy] [name of copyright owner]
19+
#
20+
# CDDL HEADER END
21+
#
22+
23+
#
24+
# Copyright (c) 2024, Klara Inc.
25+
#
26+
27+
. $STF_SUITE/include/libtest.shlib
28+
. $STF_SUITE/include/blkdev.shlib
29+
30+
#
31+
# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool
32+
#
33+
# STRATEGY:
34+
# 1. Disable ZED -- this test is focused on vdev_probe errors
35+
# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug)
36+
# 3. Add some data to it for a resilver workload
37+
# 4. Replace one of the child vdevs to start a replacing vdev
38+
# 5. During the resilver, remove 4 disks including one from the replacing vdev
39+
# 6. Verify that the pool is suspended (it used to remain online)
40+
#
41+
42+
DEV_SIZE_MB=1024
43+
44+
FILE_VDEV_CNT=8
45+
FILE_VDEV_SIZ=256M
46+
47+
function cleanup
48+
{
49+
destroy_pool $TESTPOOL
50+
unload_scsi_debug
51+
rm -f $DATA_FILE
52+
for i in {0..$((FILE_VDEV_CNT - 1))}; do
53+
log_must rm -f "$TEST_BASE_DIR/dev-$i"
54+
done
55+
zed_start
56+
}
57+
58+
log_onexit cleanup
59+
60+
log_assert "VDEV probe errors for more disks than parity should suspend a pool"
61+
62+
log_note "Stoping ZED process"
63+
zed_stop
64+
zpool events -c
65+
66+
# Make a debug device that we can "unplug" and lose 4 drives at once
67+
unload_scsi_debug
68+
load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b'
69+
sd=$(get_debug_device)
70+
71+
# Create 4 partitions that match the FILE_VDEV_SIZ
72+
parted "/dev/${sd}" --script mklabel gpt
73+
parted "/dev/${sd}" --script mkpart primary 0% 25%
74+
parted "/dev/${sd}" --script mkpart primary 25% 50%
75+
parted "/dev/${sd}" --script mkpart primary 50% 75%
76+
parted "/dev/${sd}" --script mkpart primary 75% 100%
77+
block_device_wait "/dev/${sd}"
78+
blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4"
79+
80+
# Create 8 file vdevs
81+
typeset -a filedevs
82+
for i in {0..$((FILE_VDEV_CNT - 1))}; do
83+
device=$TEST_BASE_DIR/dev-$i
84+
log_must truncate -s $FILE_VDEV_SIZ $device
85+
# Use all but the last one for pool create
86+
if [[ $i -lt "7" ]]; then
87+
filedevs[${#filedevs[*]}+1]=$device
88+
fi
89+
done
90+
91+
# Create a raidz-3 pool that we can pull 4 disks from
92+
log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs
93+
sync_pool $TESTPOOL
94+
95+
# Add some data to the pool
96+
log_must zfs create $TESTPOOL/fs
97+
MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)"
98+
SECONDS=0
99+
log_must fill_fs $MNTPOINT 3 200 32768 300 Z
100+
log_note "fill_fs took $SECONDS seconds"
101+
sync_pool $TESTPOOL
102+
103+
# Start a replacing vdev
104+
log_must zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7
105+
106+
# Remove 4 disks all at once
107+
log_must eval "echo offline > /sys/block/${sd}/device/state"
108+
109+
# Add some writes to drive the vdev probe errors
110+
log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1
111+
112+
# Wait until sync starts, and the pool suspends
113+
log_note "waiting for pool to suspend"
114+
typeset -i tries=10
115+
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
116+
if ((tries-- == 0)); then
117+
zpool status -s
118+
log_fail "UNEXPECTED -- pool did not suspend"
119+
fi
120+
sleep 1
121+
done
122+
123+
zpool status $TESTPOOL
124+
125+
# Put the missing disks back into service
126+
log_must eval "echo running > /sys/block/$sd/device/state"
127+
128+
# Clear the vdev error states, which will reopen the vdevs and resume the pool
129+
log_must zpool clear $TESTPOOL
130+
131+
# Wait until the pool resumes
132+
log_note "waiting for pool to resume"
133+
tries=10
134+
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do
135+
if ((tries-- == 0)); then
136+
log_fail "pool did not resume"
137+
fi
138+
sleep 1
139+
done
140+
141+
# Make sure a pool scrub comes back clean
142+
log_must zpool scrub -w $TESTPOOL
143+
log_must check_pool_status $TESTPOOL "errors" "No known data errors"
144+
145+
log_pass "VDEV probe errors for more disks than parity should suspend a pool"

0 commit comments

Comments
 (0)