Skip to content

Commit f0f330e

Browse files
author
Don Brady
authored
Fix ZED auto-replace for VDEVs using by-id paths
The change is simple -- restore the original code so that the VDEV path is updated when using by-id paths. The more challenging part was to devise a second ZTS test, that would test auto-replace for 'by-id' and help prevent a future regression. With that new test, we can now do an A|B test with , and without, the fix to confirm that auto-replace for by-id paths works. The existing auto-replace test, functional/fault/auto_replace_001_pos, will confirm that we didn't break auto-replace for 'by-vdev' paths. In the original functional/fault/auto_replace_001_pos test, the disk wipe (using dd) was not effective in removing the partitioning since the kernel was never informed of the wipe. Added a call to wipefs(8) so that the kernel is informed and ZED will re-partition the device. Added a validation step that the re-partitioning occurred by confirming that the GPT partition UUID changes. Sponsored-By: OpenDrives Inc. Sponsored-By: Klara Inc. Reviewed-by: Rob Norris <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Don Brady <[email protected]> Closes #15363
1 parent c0e5899 commit f0f330e

File tree

9 files changed

+279
-35
lines changed

9 files changed

+279
-35
lines changed

cmd/zed/agents/zfs_mod.c

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
2525
* Copyright (c) 2016, 2017, Intel Corporation.
2626
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
27+
* Copyright (c) 2023, Klara Inc.
2728
*/
2829

2930
/*
@@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
204205
uint64_t is_spare = 0;
205206
const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
206207
char rawpath[PATH_MAX], fullpath[PATH_MAX];
207-
char devpath[PATH_MAX];
208+
char pathbuf[PATH_MAX];
208209
int ret;
209210
int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
210211
boolean_t is_sd = B_FALSE;
@@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
214215
char **lines = NULL;
215216
int lines_cnt = 0;
216217

218+
/*
219+
* Get the persistent path, typically under the '/dev/disk/by-id' or
220+
* '/dev/disk/by-vdev' directories. Note that this path can change
221+
* when a vdev is replaced with a new disk.
222+
*/
217223
if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
218224
return;
219225

@@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
370376
(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
371377
is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
372378

373-
if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
379+
if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) {
374380
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
375381
rawpath, strerror(errno));
376382

377-
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
378-
&newstate);
383+
int err = zpool_vdev_online(zhp, fullpath,
384+
ZFS_ONLINE_FORCEFAULT, &newstate);
379385

380-
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
381-
fullpath, libzfs_error_description(g_zfshdl));
386+
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) "
387+
"err %d, new state %d",
388+
fullpath, libzfs_error_description(g_zfshdl), err,
389+
err ? (int)newstate : 0);
382390
return;
383391
}
384392

@@ -428,15 +436,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
428436
* to trigger a ZFS fault for the device (and any hot spare
429437
* replacement).
430438
*/
431-
leafname = strrchr(devpath, '/') + 1;
439+
leafname = strrchr(pathbuf, '/') + 1;
432440

433441
/*
434442
* If this is a request to label a whole disk, then attempt to
435443
* write out the label.
436444
*/
437445
if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname,
438446
vdev, "autoreplace", &lines, &lines_cnt) != 0) {
439-
zed_log_msg(LOG_INFO,
447+
zed_log_msg(LOG_WARNING,
440448
" zpool_prepare_and_label_disk: could not "
441449
"label '%s' (%s)", leafname,
442450
libzfs_error_description(g_zfshdl));
@@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
468476
sizeof (device->pd_physpath));
469477
list_insert_tail(&g_device_list, device);
470478

471-
zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
479+
zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)",
472480
leafname, (u_longlong_t)guid);
473481

474482
return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
@@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
491499
}
492500
if (!found) {
493501
/* unexpected partition slice encountered */
494-
zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
495-
fullpath);
502+
zed_log_msg(LOG_WARNING, "labeled disk %s was "
503+
"unexpected here", fullpath);
496504
(void) zpool_vdev_online(zhp, fullpath,
497505
ZFS_ONLINE_FORCEFAULT, &newstate);
498506
return;
@@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
501509
zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
502510
physpath, (u_longlong_t)guid);
503511

504-
(void) snprintf(devpath, sizeof (devpath), "%s%s",
505-
DEV_BYID_PATH, new_devid);
512+
/*
513+
* Paths that begin with '/dev/disk/by-id/' will change and so
514+
* they must be updated before calling zpool_vdev_attach().
515+
*/
516+
if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) {
517+
(void) snprintf(pathbuf, sizeof (pathbuf), "%s%s",
518+
DEV_BYID_PATH, new_devid);
519+
zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' "
520+
"replaced by '%s'", path, pathbuf);
521+
path = pathbuf;
522+
}
506523
}
507524

508525
libzfs_free_str_array(lines, lines_cnt);
@@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
545562
* Wait for udev to verify the links exist, then auto-replace
546563
* the leaf disk at same physical location.
547564
*/
548-
if (zpool_label_disk_wait(path, 3000) != 0) {
549-
zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
550-
"disk %s is missing", path);
565+
if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) {
566+
zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling "
567+
"replacement disk, the expected disk partition link '%s' "
568+
"is missing after waiting %u ms",
569+
zpool_get_name(zhp), path, DISK_LABEL_WAIT);
551570
nvlist_free(nvroot);
552571
return;
553572
}
@@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
562581
B_TRUE, B_FALSE);
563582
}
564583

565-
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
584+
zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)",
566585
fullpath, path, (ret == 0) ? "no errors" :
567586
libzfs_error_description(g_zfshdl));
568587

@@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
660679
dp->dd_prop, path);
661680
dp->dd_found = B_TRUE;
662681

663-
/* pass the new devid for use by replacing code */
682+
/* pass the new devid for use by auto-replacing code */
664683
if (dp->dd_new_devid != NULL) {
665684
(void) nvlist_add_string(nvl, "new_devid",
666685
dp->dd_new_devid);

include/libzutil.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ extern "C" {
3434
#endif
3535

3636
/*
37-
* Default wait time for a device name to be created.
37+
* Default wait time in milliseconds for a device name to be created.
3838
*/
3939
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */
4040

lib/libzutil/os/linux/zutil_import_os.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
582582
* Wait up to timeout_ms for udev to set up the device node. The device is
583583
* considered ready when libudev determines it has been initialized, all of
584584
* the device links have been verified to exist, and it has been allowed to
585-
* settle. At this point the device the device can be accessed reliably.
586-
* Depending on the complexity of the udev rules this process could take
587-
* several seconds.
585+
* settle. At this point the device can be accessed reliably. Depending on
586+
* the complexity of the udev rules this process could take several seconds.
588587
*/
589588
int
590589
zpool_label_disk_wait(const char *path, int timeout_ms)

tests/runfiles/linux.run

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,10 @@ tags = ['functional', 'fallocate']
122122

123123
[tests/functional/fault:Linux]
124124
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
125-
'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
126-
'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
127-
'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
128-
'zpool_status_-s']
125+
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
126+
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
127+
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
128+
'scrub_after_resilver', 'zpool_status_-s']
129129
tags = ['functional', 'fault']
130130

131131
[tests/functional/features/large_dnode:Linux]

tests/test-runner/bin/zts-report.py.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true':
328328
'fault/auto_online_001_pos': ['SKIP', ci_reason],
329329
'fault/auto_online_002_pos': ['SKIP', ci_reason],
330330
'fault/auto_replace_001_pos': ['SKIP', ci_reason],
331+
'fault/auto_replace_002_pos': ['SKIP', ci_reason],
331332
'fault/auto_spare_ashift': ['SKIP', ci_reason],
332333
'fault/auto_spare_shared': ['SKIP', ci_reason],
333334
'procfs/pool_state': ['SKIP', ci_reason],

tests/zfs-tests/include/commands.cfg

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr
130130
chattr
131131
exportfs
132132
fallocate
133+
flock
133134
free
134135
getfattr
135136
groupadd
136137
groupdel
137138
groupmod
138139
hostid
140+
logger
139141
losetup
140142
lsattr
141143
lsblk
@@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr
145147
md5sum
146148
mkswap
147149
modprobe
150+
mountpoint
148151
mpstat
149152
nsenter
150153
parted
151154
perf
152155
setfattr
156+
setpriv
153157
sha256sum
154158
udevadm
155159
unshare
156160
useradd
157161
userdel
158162
usermod
159-
setpriv
160-
mountpoint
161-
flock
162-
logger'
163+
wipefs'
163164

164165
export ZFS_FILES='zdb
165166
zfs

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
14311431
functional/fault/auto_online_001_pos.ksh \
14321432
functional/fault/auto_online_002_pos.ksh \
14331433
functional/fault/auto_replace_001_pos.ksh \
1434+
functional/fault/auto_replace_002_pos.ksh \
14341435
functional/fault/auto_spare_001_pos.ksh \
14351436
functional/fault/auto_spare_002_pos.ksh \
14361437
functional/fault/auto_spare_ashift.ksh \

tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@
3434
# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
3535
# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
3636
# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
37-
# 3. Export a pool
37+
# 3. Export the pool
3838
# 4. Wipe and offline the scsi_debug disk
39-
# 5. Import pool with missing disk
39+
# 5. Import the pool with missing disk
4040
# 6. Re-online the wiped scsi_debug disk
41-
# 7. Verify the ZED detects the new unused disk and adds it back to the pool
41+
# 7. Verify ZED detects the new blank disk and replaces the missing vdev
42+
# 8. Verify that the scsi_debug disk was re-partitioned
4243
#
43-
# Creates a raidz1 zpool using persistent disk path names
44+
# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names
4445
# (ie not /dev/sdc)
4546
#
4647
# Auto-replace is opt in, and matches by phys_path.
@@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3
8384
log_must zpool set autoreplace=on $TESTPOOL
8485

8586
# Add some data to the pool
86-
log_must mkfile $FSIZE /$TESTPOOL/data
87+
log_must zfs create $TESTPOOL/fs
88+
log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
8789
log_must zpool export $TESTPOOL
8890

91+
# Record the partition UUID for later comparison
92+
part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
93+
--value /dev/disk/by-id/$SD_DEVICE_ID)
94+
[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
95+
96+
#
8997
# Wipe and offline the disk
98+
#
99+
# Note that it is not enough to zero the disk to expunge the partitions.
100+
# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
101+
#
102+
# Using partprobe is overkill and hdparm is not as common as wipefs. So
103+
# we use wipefs which lets the kernel know the partition was removed
104+
# from the device (i.e., calls BLKRRPART ioctl).
105+
#
90106
log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
107+
log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
91108
remove_disk $SD
92109
block_device_wait
93110

@@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60
106123
# Validate auto-replace was successful
107124
log_must check_state $TESTPOOL "" "ONLINE"
108125

126+
#
127+
# Confirm the partition UUID changed so we know the new disk was relabeled
128+
#
129+
# Note: some older versions of udevadm don't support "--property" option so
130+
# we'll # skip this test when it is not supported
131+
#
132+
if [ ! -z "$part_uuid" ]; then
133+
new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
134+
--value /dev/disk/by-id/$SD_DEVICE_ID)
135+
log_note new disk GPT uuid ${new_uuid}
136+
[[ "$part_uuid" = "$new_uuid" ]] && \
137+
log_fail "The new disk was not relabeled as expected"
138+
fi
139+
109140
log_pass "Auto-replace test successful"

0 commit comments

Comments
 (0)