Skip to content

Commit 5645a2f

Browse files
committed
Too many vdev probe errors should suspend pool
Similar to what we saw in #16569, a replacing vdev should not be treated as fully contributing to the redundancy of a raidz vdev, even though the current IO has enough redundancy. When a failed vdev_probe() is faulting a disk, it now checks whether that disk is required, and if so it suspends the pool until the admin can return the missing disks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Don Brady <[email protected]>
1 parent 6c9b4f1 commit 5645a2f

File tree

4 files changed

+173
-8
lines changed

4 files changed

+173
-8
lines changed

module/zfs/spa.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8948,16 +8948,26 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
89488948
}
89498949

89508950
static void
8951-
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
8951+
spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend)
89528952
{
89538953
if (vd->vdev_fault_wanted) {
8954+
vdev_state_t newstate = VDEV_STATE_FAULTED;
89548955
vd->vdev_fault_wanted = B_FALSE;
8955-
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
8956-
VDEV_AUX_ERR_EXCEEDED);
8957-
}
89588956

8957+
/*
8958+
* If this device has the only valid copy of the data, then
8959+
* back off and simply mark the vdev as degraded instead.
8960+
*/
8961+
if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL &&
8962+
vdev_dtl_required(vd)) {
8963+
newstate = VDEV_STATE_DEGRADED;
8964+
/* A required disk is missing so suspend the pool */
8965+
*suspend = B_TRUE;
8966+
}
8967+
vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
8968+
}
89598969
for (int c = 0; c < vd->vdev_children; c++)
8960-
spa_async_fault_vdev(spa, vd->vdev_child[c]);
8970+
spa_async_fault_vdev(vd->vdev_child[c], suspend);
89618971
}
89628972

89638973
static void
@@ -9049,8 +9059,13 @@ spa_async_thread(void *arg)
90499059
*/
90509060
if (tasks & SPA_ASYNC_FAULT_VDEV) {
90519061
spa_vdev_state_enter(spa, SCL_NONE);
9052-
spa_async_fault_vdev(spa, spa->spa_root_vdev);
9062+
boolean_t suspend = B_FALSE;
9063+
spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
90539064
(void) spa_vdev_state_exit(spa, NULL, 0);
9065+
if (suspend) {
9066+
zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
9067+
zio_resume_wait(spa);
9068+
}
90549069
}
90559070

90569071
/*

tests/runfiles/linux.run

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,8 +125,8 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
125125
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
126126
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
127127
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
128-
'fault_limits', 'scrub_after_resilver', 'suspend_resume_single',
129-
'zpool_status_-s']
128+
'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors',
129+
'suspend_resume_single', 'zpool_status_-s']
130130
tags = ['functional', 'fault']
131131

132132
[tests/functional/features/large_dnode:Linux]

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,6 +1531,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
15311531
functional/fault/decrypt_fault.ksh \
15321532
functional/fault/fault_limits.ksh \
15331533
functional/fault/scrub_after_resilver.ksh \
1534+
functional/fault/suspend_on_probe_errors.ksh \
15341535
functional/fault/suspend_resume_single.ksh \
15351536
functional/fault/setup.ksh \
15361537
functional/fault/zpool_status_-s.ksh \
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024, Klara Inc.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/include/blkdev.shlib

#
# DESCRIPTION: Verify that 4 disks removed from a raidz3 will suspend the pool
#
# STRATEGY:
# 1. Disable ZED -- this test is focused on vdev_probe errors
# 2. Create a raidz3 pool where 4 disks can be removed (i.e., using scsi_debug)
# 3. Add some data to it for a resilver workload
# 4. Replace one of the child vdevs to start a replacing vdev
# 5. During the resilver, remove 4 disks including one from the replacing vdev
# 6. Verify that the pool is suspended (it used to remain online)
#

DEV_SIZE_MB=1024

FILE_VDEV_CNT=8
FILE_VDEV_SIZ=256M

function cleanup
{
	destroy_pool $TESTPOOL
	# Bring the scsi_debug device back online before unloading it
	if [[ "$(cat /sys/block/$sd/device/state)" == "offline" ]]; then
		log_must eval "echo running > /sys/block/$sd/device/state"
	fi
	unload_scsi_debug
	rm -f $DATA_FILE
	for i in {0..$((FILE_VDEV_CNT - 1))}; do
		log_must rm -f "$TEST_BASE_DIR/dev-$i"
	done
	zed_start
}

log_onexit cleanup

log_assert "VDEV probe errors for more disks than parity should suspend a pool"

log_note "Stopping ZED process"
zed_stop
zpool events -c

# Make a debug device that we can "unplug" and lose 4 drives at once
unload_scsi_debug
load_scsi_debug $DEV_SIZE_MB 1 1 1 '512b'
sd=$(get_debug_device)

# Create 4 partitions that match the FILE_VDEV_SIZ
parted "/dev/${sd}" --script mklabel gpt
parted "/dev/${sd}" --script mkpart primary 0% 25%
parted "/dev/${sd}" --script mkpart primary 25% 50%
parted "/dev/${sd}" --script mkpart primary 50% 75%
parted "/dev/${sd}" --script mkpart primary 75% 100%
block_device_wait "/dev/${sd}"
blkdevs="/dev/${sd}1 /dev/${sd}2 /dev/${sd}3 /dev/${sd}4"

# Create 8 file vdevs
typeset -a filedevs
for i in {0..$((FILE_VDEV_CNT - 1))}; do
	device=$TEST_BASE_DIR/dev-$i
	log_must truncate -s $FILE_VDEV_SIZ $device
	# Use all but the last one for pool create
	if [[ $i -lt "7" ]]; then
		filedevs[${#filedevs[*]}+1]=$device
	fi
done

# Create a raidz-3 pool that we can pull 4 disks from
log_must zpool create -f $TESTPOOL raidz3 ${filedevs[@]} $blkdevs
sync_pool $TESTPOOL

# Add some data to the pool
log_must zfs create $TESTPOOL/fs
MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)"
SECONDS=0
log_must fill_fs $MNTPOINT 3 200 32768 300 Z
log_note "fill_fs took $SECONDS seconds"
sync_pool $TESTPOOL

# Start a replacing vdev
log_must zpool replace -f $TESTPOOL /dev/${sd}4 $TEST_BASE_DIR/dev-7

# Remove 4 disks all at once
log_must eval "echo offline > /sys/block/${sd}/device/state"

# Add some writes to drive the vdev probe errors
log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1

# Wait until sync starts, and the pool suspends
log_note "waiting for pool to suspend"
typeset -i tries=30
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
	if ((tries-- == 0)); then
		zpool status -s
		log_fail "UNEXPECTED -- pool did not suspend"
	fi
	sleep 1
done

log_must zpool status $TESTPOOL

# Put the missing disks back into service
log_must eval "echo running > /sys/block/$sd/device/state"

# Clear the vdev error states, which will reopen the vdevs and resume the pool
log_must zpool clear $TESTPOOL

# Wait until the pool resumes
log_note "waiting for pool to resume"
tries=30
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do
	if ((tries-- == 0)); then
		log_fail "pool did not resume"
	fi
	sleep 1
done

# Make sure a pool scrub comes back clean
log_must zpool scrub -w $TESTPOOL
log_must zpool status -v $TESTPOOL
log_must check_pool_status $TESTPOOL "errors" "No known data errors"

log_pass "VDEV probe errors for more disks than parity should suspend a pool"

0 commit comments

Comments
 (0)