Skip to content

Commit 5dbfb0b

Browse files
committed
arc: avoid possible deadlock in arc_read
In l2arc_evict(), the config lock may be acquired in reverse order (e.g., first the config lock (writer), then a hash lock) unlike in arc_read() during scenarios like L2ARC device removal. To avoid deadlocks, if the attempt to acquire the config lock (reader) fails in arc_read(), release the hash lock, wait for the config lock, and retry from the beginning. Signed-off-by: Ameer Hamza <[email protected]>
1 parent 6a2f7b3 commit 5dbfb0b

File tree

6 files changed

+40
-16
lines changed

6 files changed

+40
-16
lines changed

cmd/zdb/zdb.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9043,7 +9043,7 @@ zdb_read_block(char *thing, spa_t *spa)
90439043
const blkptr_t *b = (const blkptr_t *)(void *)
90449044
((uintptr_t)buf + (uintptr_t)blkptr_offset);
90459045
if (zfs_blkptr_verify(spa, b,
9046-
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
9046+
BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) {
90479047
abd_return_buf_copy(pabd, buf, lsize);
90489048
borrowed = B_FALSE;
90499049
buf = lbuf;
@@ -9052,7 +9052,7 @@ zdb_read_block(char *thing, spa_t *spa)
90529052
b = (const blkptr_t *)(void *)
90539053
((uintptr_t)buf + (uintptr_t)blkptr_offset);
90549054
if (lsize == -1 || zfs_blkptr_verify(spa, b,
9055-
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
9055+
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
90569056
printf("invalid block pointer at this DVA\n");
90579057
goto out;
90589058
}

include/sys/zio.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ enum blk_verify_flag {
546546
enum blk_config_flag {
547547
BLK_CONFIG_HELD, // SCL_VDEV held for writer
548548
BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
549+
BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader
549550
BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV
550551
};
551552

@@ -663,7 +664,7 @@ extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
663664
extern int zio_resume(spa_t *spa);
664665
extern void zio_resume_wait(spa_t *spa);
665666

666-
extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
667+
extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
667668
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
668669

669670
/*

module/zfs/arc.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5683,6 +5683,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
56835683
boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
56845684
arc_buf_t *buf = NULL;
56855685
int rc = 0;
5686+
boolean_t bp_validation = B_FALSE;
56865687

56875688
ASSERT(!embedded_bp ||
56885689
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@@ -5725,7 +5726,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
57255726
* should always be the case since the blkptr is protected by
57265727
* a checksum.
57275728
*/
5728-
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
5729+
if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
57295730
BLK_VERIFY_LOG)) {
57305731
mutex_exit(hash_lock);
57315732
rc = SET_ERROR(ECKSUM);
@@ -5877,6 +5878,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
58775878
abd_t *hdr_abd;
58785879
int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
58795880
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
5881+
int config_lock;
5882+
int error;
58805883

58815884
if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
58825885
if (hash_lock != NULL)
@@ -5885,16 +5888,31 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
58855888
goto done;
58865889
}
58875890

5891+
if (zio_flags & ZIO_FLAG_CONFIG_WRITER) {
5892+
config_lock = BLK_CONFIG_HELD;
5893+
} else if (hash_lock != NULL) {
5894+
/*
5895+
* Prevent lock order reversal
5896+
*/
5897+
config_lock = BLK_CONFIG_NEEDED_TRY;
5898+
} else {
5899+
config_lock = BLK_CONFIG_NEEDED;
5900+
}
5901+
58885902
/*
58895903
* Verify the block pointer contents are reasonable. This
58905904
* should always be the case since the blkptr is protected by
58915905
* a checksum.
58925906
*/
5893-
if (!zfs_blkptr_verify(spa, bp,
5894-
(zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
5895-
BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
5907+
if (!bp_validation && (error = zfs_blkptr_verify(spa, bp,
5908+
config_lock, BLK_VERIFY_LOG))) {
58965909
if (hash_lock != NULL)
58975910
mutex_exit(hash_lock);
5911+
if (error == EBUSY && !zfs_blkptr_verify(spa, bp,
5912+
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
5913+
bp_validation = B_TRUE;
5914+
goto top;
5915+
}
58985916
rc = SET_ERROR(ECKSUM);
58995917
goto done;
59005918
}

module/zfs/dsl_scan.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2305,7 +2305,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
23052305
DMU_USERUSED_OBJECT, tx);
23062306
}
23072307
arc_buf_destroy(buf, &buf);
2308-
} else if (!zfs_blkptr_verify(spa, bp,
2308+
} else if (zfs_blkptr_verify(spa, bp,
23092309
BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
23102310
/*
23112311
* Sanity check the block pointer contents, this is handled

module/zfs/spa.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2778,7 +2778,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
27782778
* When damaged consider it to be a metadata error since we cannot
27792779
* trust the BP_GET_TYPE and BP_GET_LEVEL values.
27802780
*/
2781-
if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
2781+
if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
27822782
atomic_inc_64(&sle->sle_meta_count);
27832783
return (0);
27842784
}

module/zfs/zio.c

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,7 +1164,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
11641164
* it only contains known object types, checksum/compression identifiers,
11651165
* block sizes within the maximum allowed limits, valid DVAs, etc.
11661166
*
1167-
* If everything checks out B_TRUE is returned. The zfs_blkptr_verify
1167+
* If everything checks out 0 is returned. The zfs_blkptr_verify
11681168
* argument controls the behavior when an invalid field is detected.
11691169
*
11701170
* Values for blk_verify_flag:
@@ -1179,7 +1179,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
11791179
* BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
11801180
* performance
11811181
*/
1182-
boolean_t
1182+
int
11831183
zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
11841184
enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
11851185
{
@@ -1211,7 +1211,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
12111211
"blkptr at %px has invalid PSIZE %llu",
12121212
bp, (longlong_t)BPE_GET_PSIZE(bp));
12131213
}
1214-
return (errors == 0);
1214+
return ((errors) ? ECKSUM : 0);
12151215
}
12161216
if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) {
12171217
errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
@@ -1229,7 +1229,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
12291229
* will be done once the zio is executed in vdev_mirror_map_alloc.
12301230
*/
12311231
if (unlikely(!spa->spa_trust_config))
1232-
return (errors == 0);
1232+
return ((errors) ? ECKSUM : 0);
12331233

12341234
switch (blk_config) {
12351235
case BLK_CONFIG_HELD:
@@ -1238,8 +1238,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
12381238
case BLK_CONFIG_NEEDED:
12391239
spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
12401240
break;
1241+
case BLK_CONFIG_NEEDED_TRY:
1242+
if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER))
1243+
return (EBUSY);
1244+
break;
12411245
case BLK_CONFIG_SKIP:
1242-
return (errors == 0);
1246+
return ((errors) ? ECKSUM : 0);
12431247
default:
12441248
panic("invalid blk_config %u", blk_config);
12451249
}
@@ -1294,10 +1298,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
12941298
bp, i, (longlong_t)offset);
12951299
}
12961300
}
1297-
if (blk_config == BLK_CONFIG_NEEDED)
1301+
if (blk_config == BLK_CONFIG_NEEDED || blk_config ==
1302+
BLK_CONFIG_NEEDED_TRY)
12981303
spa_config_exit(spa, SCL_VDEV, bp);
12991304

1300-
return (errors == 0);
1305+
return ((errors) ? ECKSUM : 0);
13011306
}
13021307

13031308
boolean_t

0 commit comments

Comments
 (0)