Skip to content

Commit 501fff8

Browse files
ahrensgamanakis
authored andcommitted
deadlock between spa_errlog_lock and dp_config_rwlock
There is a lock order inversion deadlock between `spa_errlog_lock` and `dp_config_rwlock`: A thread in `spa_delete_dataset_errlog()` is running from a sync task. It is holding the `dp_config_rwlock` for writer (see `dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`. A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock` (see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as reader). Note that this was introduced by openzfs#12812. This commit address this by defining the lock ordering to be dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock. spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this order, and then process_error_block() and get_head_and_birth_txg() can verify that the dp_config_rwlock is already held. Additionally, a buffer overrun in `spa_get_errlog()` is corrected. Many code paths didn't check if `*count` got to zero, instead continuing to overwrite past the beginning of the userspace buffer at `uaddr`. Tested by having some errors in the pool (via `zinject -t data /path/to/file`), one thread running `zpool iostat 0.001`, and another thread runs `zfs destroy` (in a loop, although it hits the first time). This reproduces the problem easily without the fix, and works with the fix. Closes openzfs#14239 Signed-off-by: Matthew Ahrens <[email protected]>
1 parent cb8aaaa commit 501fff8

File tree

9 files changed

+138
-249
lines changed

9 files changed

+138
-249
lines changed

cmd/zpool/zpool_main.c

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8691,37 +8691,17 @@ status_callback(zpool_handle_t *zhp, void *data)
86918691

86928692
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
86938693
&nerr) == 0) {
8694-
nvlist_t *nverrlist = NULL;
8695-
8696-
/*
8697-
* If the approximate error count is small, get a
8698-
* precise count by fetching the entire log and
8699-
* uniquifying the results.
8700-
*/
8701-
if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
8702-
zpool_get_errlog(zhp, &nverrlist) == 0) {
8703-
nvpair_t *elem;
8704-
8705-
elem = NULL;
8706-
nerr = 0;
8707-
while ((elem = nvlist_next_nvpair(nverrlist,
8708-
elem)) != NULL) {
8709-
nerr++;
8710-
}
8711-
}
8712-
nvlist_free(nverrlist);
8713-
87148694
(void) printf("\n");
8715-
8716-
if (nerr == 0)
8717-
(void) printf(gettext("errors: No known data "
8718-
"errors\n"));
8719-
else if (!cbp->cb_verbose)
8695+
if (nerr == 0) {
8696+
(void) printf(gettext(
8697+
"errors: No known data errors\n"));
8698+
} else if (!cbp->cb_verbose) {
87208699
(void) printf(gettext("errors: %llu data "
87218700
"errors, use '-v' for a list\n"),
87228701
(u_longlong_t)nerr);
8723-
else
8702+
} else {
87248703
print_error_log(zhp);
8704+
}
87258705
}
87268706

87278707
if (cbp->cb_dedup_stats)

cmd/ztest.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6313,7 +6313,7 @@ ztest_scrub_impl(spa_t *spa)
63136313
while (dsl_scan_scrubbing(spa_get_dsl(spa)))
63146314
txg_wait_synced(spa_get_dsl(spa), 0);
63156315

6316-
if (spa_get_errlog_size(spa) > 0)
6316+
if (spa_approx_errlog_size(spa) > 0)
63176317
return (ECKSUM);
63186318

63196319
ztest_pool_scrubbed = B_TRUE;

include/sys/spa.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1148,7 +1148,7 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
11481148
extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
11491149
extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
11501150
extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
1151-
extern uint64_t spa_get_errlog_size(spa_t *spa);
1151+
extern uint64_t spa_approx_errlog_size(spa_t *spa);
11521152
extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
11531153
extern uint64_t spa_get_last_errlog_size(spa_t *spa);
11541154
extern void spa_errlog_rotate(spa_t *spa);

lib/libzfs/libzfs_pool.c

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4206,33 +4206,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
42064206
{
42074207
zfs_cmd_t zc = {"\0"};
42084208
libzfs_handle_t *hdl = zhp->zpool_hdl;
4209-
uint64_t count;
4210-
zbookmark_phys_t *zb = NULL;
4211-
int i;
4209+
zbookmark_phys_t *buf;
4210+
uint64_t buflen = 10000; /* approx. 1MB of RAM */
4211+
4212+
if (fnvlist_lookup_uint64(zhp->zpool_config,
4213+
ZPOOL_CONFIG_ERRCOUNT) == 0)
4214+
return (0);
42124215

42134216
/*
4214-
* Retrieve the raw error list from the kernel. If the number of errors
4215-
* has increased, allocate more space and continue until we get the
4216-
* entire list.
4217+
* Retrieve the raw error list from the kernel. If it doesn't fit,
4218+
* allocate a larger buffer and retry.
42174219
*/
4218-
count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT);
4219-
if (count == 0)
4220-
return (0);
4221-
zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
4222-
count * sizeof (zbookmark_phys_t));
4223-
zc.zc_nvlist_dst_size = count;
42244220
(void) strcpy(zc.zc_name, zhp->zpool_name);
42254221
for (;;) {
4222+
buf = zfs_alloc(zhp->zpool_hdl,
4223+
buflen * sizeof (zbookmark_phys_t));
4224+
zc.zc_nvlist_dst = (uintptr_t)buf;
4225+
zc.zc_nvlist_dst_size = buflen;
42264226
if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
42274227
&zc) != 0) {
4228-
free((void *)(uintptr_t)zc.zc_nvlist_dst);
4228+
free(buf);
42294229
if (errno == ENOMEM) {
4230-
void *dst;
4231-
4232-
count = zc.zc_nvlist_dst_size;
4233-
dst = zfs_alloc(zhp->zpool_hdl, count *
4234-
sizeof (zbookmark_phys_t));
4235-
zc.zc_nvlist_dst = (uintptr_t)dst;
4230+
buflen *= 2;
42364231
} else {
42374232
return (zpool_standard_error_fmt(hdl, errno,
42384233
dgettext(TEXT_DOMAIN, "errors: List of "
@@ -4250,18 +4245,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
42504245
* _not_ copied as part of the process. So we point the start of our
42514246
* array appropriate and decrement the total number of elements.
42524247
*/
4253-
zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
4254-
zc.zc_nvlist_dst_size;
4255-
count -= zc.zc_nvlist_dst_size;
4248+
zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size;
4249+
uint64_t zblen = buflen - zc.zc_nvlist_dst_size;
42564250

4257-
qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
4251+
qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
42584252

42594253
verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
42604254

42614255
/*
42624256
* Fill in the nverrlistp with nvlist's of dataset and object numbers.
42634257
*/
4264-
for (i = 0; i < count; i++) {
4258+
for (uint64_t i = 0; i < zblen; i++) {
42654259
nvlist_t *nv;
42664260

42674261
/* ignoring zb_blkid and zb_level for now */
@@ -4288,11 +4282,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
42884282
nvlist_free(nv);
42894283
}
42904284

4291-
free((void *)(uintptr_t)zc.zc_nvlist_dst);
4285+
free(buf);
42924286
return (0);
42934287

42944288
nomem:
4295-
free((void *)(uintptr_t)zc.zc_nvlist_dst);
4289+
free(buf);
42964290
return (no_memory(zhp->zpool_hdl));
42974291
}
42984292

lib/libzfs/libzfs_status.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport,
222222
{
223223
pool_scan_stat_t *ps = NULL;
224224
uint_t vsc, psc;
225-
uint64_t nerr;
226225
uint64_t suspended;
227226
uint64_t hostid = 0;
228227
uint64_t errata = 0;
@@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport,
392391
* Persistent data errors.
393392
*/
394393
if (!isimport) {
394+
uint64_t nerr;
395395
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
396396
&nerr) == 0 && nerr != 0)
397397
return (ZPOOL_STATUS_CORRUPT_DATA);

module/zfs/dsl_scan.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,13 +1093,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
10931093

10941094
if (dsl_scan_restarting(scn, tx))
10951095
spa_history_log_internal(spa, "scan aborted, restarting", tx,
1096-
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
1096+
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
10971097
else if (!complete)
10981098
spa_history_log_internal(spa, "scan cancelled", tx,
1099-
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
1099+
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
11001100
else
11011101
spa_history_log_internal(spa, "scan done", tx,
1102-
"errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
1102+
"errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
11031103

11041104
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
11051105
spa->spa_scrub_active = B_FALSE;
@@ -1162,7 +1162,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
11621162
vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
11631163
spa_history_log_internal(spa,
11641164
"starting deferred resilver", tx, "errors=%llu",
1165-
(u_longlong_t)spa_get_errlog_size(spa));
1165+
(u_longlong_t)spa_approx_errlog_size(spa));
11661166
spa_async_request(spa, SPA_ASYNC_RESILVER);
11671167
}
11681168

module/zfs/spa.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5543,7 +5543,7 @@ spa_get_stats(const char *name, nvlist_t **config,
55435543

55445544
fnvlist_add_uint64(*config,
55455545
ZPOOL_CONFIG_ERRCOUNT,
5546-
spa_get_errlog_size(spa));
5546+
spa_approx_errlog_size(spa));
55475547

55485548
if (spa_suspended(spa)) {
55495549
fnvlist_add_uint64(*config,

0 commit comments

Comments
 (0)