Skip to content

Commit af5dbed

Browse files
authored
Fix scn_queue races on very old pools
Code for pools before version 11 uses dmu_objset_find_dp() to scan for child datasets/clones. It invokes the enqueue_clones_cb() and enqueue_cb() callbacks in parallel from multiple taskq threads. As a result, those callbacks call scan_ds_queue_insert() concurrently without synchronization, which corrupts the scn_queue AVL tree. Fix it by introducing a mutex to protect those two scan_ds_queue_insert() calls. All other calls are made from the sync thread and are therefore already serialized.
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Brian Atkinson <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes #16162
1 parent a0f3c8a commit af5dbed

File tree

2 files changed

+7
-0
lines changed

2 files changed

+7
-0
lines changed

include/sys/dsl_scan.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ typedef struct dsl_scan {
173173
dsl_scan_phys_t scn_phys; /* on disk representation of scan */
174174
dsl_scan_phys_t scn_phys_cached;
175175
avl_tree_t scn_queue; /* queue of datasets to scan */
176+
kmutex_t scn_queue_lock; /* serializes scn_queue inserts */
176177
uint64_t scn_queues_pending; /* outstanding data to issue */
177178
/* members needed for syncing error scrub status to disk */
178179
dsl_errorscrub_phys_t errorscrub_phys;

module/zfs/dsl_scan.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
491491

492492
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
493493
offsetof(scan_ds_t, sds_node));
494+
mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
494495
avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
495496
sizeof (scan_prefetch_issue_ctx_t),
496497
offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
@@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp)
646647

647648
scan_ds_queue_clear(scn);
648649
avl_destroy(&scn->scn_queue);
650+
mutex_destroy(&scn->scn_queue_lock);
649651
scan_ds_prefetch_queue_clear(scn);
650652
avl_destroy(&scn->scn_prefetch_queue);
651653

@@ -2723,8 +2725,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
27232725
return (err);
27242726
ds = prev;
27252727
}
2728+
mutex_enter(&scn->scn_queue_lock);
27262729
scan_ds_queue_insert(scn, ds->ds_object,
27272730
dsl_dataset_phys(ds)->ds_prev_snap_txg);
2731+
mutex_exit(&scn->scn_queue_lock);
27282732
dsl_dataset_rele(ds, FTAG);
27292733
return (0);
27302734
}
@@ -2915,8 +2919,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
29152919
ds = prev;
29162920
}
29172921

2922+
mutex_enter(&scn->scn_queue_lock);
29182923
scan_ds_queue_insert(scn, ds->ds_object,
29192924
dsl_dataset_phys(ds)->ds_prev_snap_txg);
2925+
mutex_exit(&scn->scn_queue_lock);
29202926
dsl_dataset_rele(ds, FTAG);
29212927
return (0);
29222928
}

0 commit comments

Comments
 (0)