Skip to content

Commit 6a18dd9

Browse files
committed
Guarantee that spa_load_guid is unique
The zpool reguid feature introduced the spa_load_guid, which is a transient value used for runtime identification purposes in the ARC. This value is not the same as the spa's persistent pool guid. However, the value was seeded from spa_generate_guid(), which does not check for uniqueness against the spa_load_guid of other pools. Although extremely rare, you can end up with two different pools sharing the same spa_load_guid value! This change guarantees that the value is always unique and, additionally, that it is not still in use by an async ARC flush task.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Don Brady <[email protected]>
1 parent fa43c22 commit 6a18dd9

File tree

5 files changed

+134
-17
lines changed

5 files changed

+134
-17
lines changed

include/sys/arc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
332332
void arc_flush_async(spa_t *spa);
333333
void arc_tempreserve_clear(uint64_t reserve);
334334
int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
335+
boolean_t arc_async_flush_guid_inuse(uint64_t load_guid);
335336

336337
uint64_t arc_all_memory(void);
337338
uint64_t arc_default_max(uint64_t min, uint64_t allmem);

include/sys/spa.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1103,6 +1103,7 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
11031103
extern char *spa_strdup(const char *);
11041104
extern void spa_strfree(char *);
11051105
extern uint64_t spa_generate_guid(spa_t *spa);
1106+
extern uint64_t spa_generate_load_guid(void);
11061107
extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
11071108
extern void spa_freeze(spa_t *spa);
11081109
extern int spa_change_guid(spa_t *spa, const uint64_t *guidp);

module/zfs/arc.c

Lines changed: 103 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,23 @@ static buf_hash_table_t buf_hash_table;
776776

777777
uint64_t zfs_crc64_table[256];
778778

779+
/*
780+
* Asynchronous ARC flush
781+
*
782+
* We track these in a list for arc_async_flush_guid_inuse().
783+
* Used for both L1 and L2 async teardown.
784+
*/
785+
static list_t arc_async_flush_list;
786+
static kmutex_t arc_async_flush_lock;
787+
788+
typedef struct arc_async_flush {
789+
uint64_t af_spa_guid;
790+
taskq_ent_t af_tqent;
791+
uint_t af_cache_level; /* 1 or 2 to differentiate node */
792+
list_node_t af_node;
793+
} arc_async_flush_t;
794+
795+
779796
/*
780797
* Level 2 ARC
781798
*/
@@ -4419,19 +4436,52 @@ arc_flush(spa_t *spa, boolean_t retry)
44194436
arc_flush_impl(spa != NULL ? spa_load_guid(spa) : 0, retry);
44204437
}
44214438

4439+
static arc_async_flush_t *
4440+
arc_async_flush_add(uint64_t spa_guid, uint_t level)
4441+
{
4442+
arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP);
4443+
af->af_spa_guid = spa_guid;
4444+
af->af_cache_level = level;
4445+
taskq_init_ent(&af->af_tqent);
4446+
list_link_init(&af->af_node);
4447+
4448+
mutex_enter(&arc_async_flush_lock);
4449+
list_insert_tail(&arc_async_flush_list, af);
4450+
mutex_exit(&arc_async_flush_lock);
4451+
4452+
return (af);
4453+
}
4454+
4455+
static void
4456+
arc_async_flush_remove(uint64_t spa_guid, uint_t level)
4457+
{
4458+
mutex_enter(&arc_async_flush_lock);
4459+
for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
4460+
af != NULL; af = list_next(&arc_async_flush_list, af)) {
4461+
if (af->af_spa_guid == spa_guid &&
4462+
af->af_cache_level == level) {
4463+
list_remove(&arc_async_flush_list, af);
4464+
kmem_free(af, sizeof (*af));
4465+
break;
4466+
}
4467+
}
4468+
mutex_exit(&arc_async_flush_lock);
4469+
}
4470+
44224471
static void
44234472
arc_flush_task(void *arg)
44244473
{
4425-
uint64_t guid = *((uint64_t *)arg);
4474+
arc_async_flush_t *af = arg;
44264475
hrtime_t start_time = gethrtime();
4476+
uint64_t spa_guid = af->af_spa_guid;
44274477

4428-
arc_flush_impl(guid, B_FALSE);
4429-
kmem_free(arg, sizeof (uint64_t *));
4478+
arc_flush_impl(spa_guid, B_FALSE);
4479+
arc_async_flush_remove(spa_guid, af->af_cache_level);
44304480

44314481
uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time);
44324482
if (elaspsed > 0) {
44334483
zfs_dbgmsg("spa %llu arc flushed in %llu ms",
4434-
(u_longlong_t)guid, (u_longlong_t)elaspsed);
4484+
(u_longlong_t)spa_guid, (u_longlong_t)elaspsed);
44354485
}
44364486
}
44374487

@@ -4448,15 +4498,36 @@ arc_flush_task(void *arg)
44484498
void
44494499
arc_flush_async(spa_t *spa)
44504500
{
4451-
uint64_t *guidp = kmem_alloc(sizeof (uint64_t *), KM_SLEEP);
4501+
uint64_t spa_guid = spa_load_guid(spa);
4502+
arc_async_flush_t *af = arc_async_flush_add(spa_guid, 1);
44524503

4453-
*guidp = spa_load_guid(spa);
4504+
/*
4505+
* Note that arc_flush_task() needs arc_async_flush_lock to remove af
4506+
* list node. So by holding the lock we avoid a race for af removal
4507+
* with our use here.
4508+
*/
4509+
mutex_enter(&arc_async_flush_lock);
4510+
taskq_dispatch_ent(arc_flush_taskq, arc_flush_task,
4511+
af, TQ_SLEEP, &af->af_tqent);
4512+
mutex_exit(&arc_async_flush_lock);
4513+
}
44544514

4455-
if (taskq_dispatch(arc_flush_taskq, arc_flush_task, guidp,
4456-
TQ_SLEEP) == TASKQID_INVALID) {
4457-
arc_flush_impl(*guidp, B_FALSE);
4458-
kmem_free(guidp, sizeof (uint64_t *));
4515+
/*
4516+
* Check if a guid is still in-use as part of an async teardown task
4517+
*/
4518+
boolean_t
4519+
arc_async_flush_guid_inuse(uint64_t spa_guid)
4520+
{
4521+
mutex_enter(&arc_async_flush_lock);
4522+
for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
4523+
af != NULL; af = list_next(&arc_async_flush_list, af)) {
4524+
if (af->af_spa_guid == spa_guid) {
4525+
mutex_exit(&arc_async_flush_lock);
4526+
return (B_TRUE);
4527+
}
44594528
}
4529+
mutex_exit(&arc_async_flush_lock);
4530+
return (B_FALSE);
44604531
}
44614532

44624533
uint64_t
@@ -7802,6 +7873,9 @@ arc_init(void)
78027873
arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
78037874
defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
78047875

7876+
list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
7877+
offsetof(arc_async_flush_t, af_node));
7878+
mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
78057879
arc_flush_taskq = taskq_create("arc_flush", 75, defclsyspri,
78067880
1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
78077881

@@ -7885,6 +7959,9 @@ arc_fini(void)
78857959
taskq_wait(arc_prune_taskq);
78867960
taskq_destroy(arc_prune_taskq);
78877961

7962+
list_destroy(&arc_async_flush_list);
7963+
mutex_destroy(&arc_async_flush_lock);
7964+
78887965
mutex_enter(&arc_prune_mtx);
78897966
while ((p = list_remove_head(&arc_prune_list)) != NULL) {
78907967
(void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
@@ -9797,6 +9874,8 @@ typedef struct {
97979874
l2arc_dev_t *rva_l2arc_dev;
97989875
uint64_t rva_spa_gid;
97999876
uint64_t rva_vdev_gid;
9877+
boolean_t rva_async;
9878+
98009879
} remove_vdev_args_t;
98019880

98029881
static void
@@ -9827,6 +9906,9 @@ l2arc_device_teardown(void *arg)
98279906
(u_longlong_t)rva->rva_vdev_gid,
98289907
(u_longlong_t)elaspsed);
98299908
}
9909+
9910+
if (rva->rva_async)
9911+
arc_async_flush_remove(rva->rva_spa_gid, 2);
98309912
kmem_free(rva, sizeof (remove_vdev_args_t));
98319913
}
98329914

@@ -9852,7 +9934,7 @@ l2arc_remove_vdev(vdev_t *vd)
98529934
remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t),
98539935
KM_SLEEP);
98549936
rva->rva_l2arc_dev = remdev;
9855-
rva->rva_spa_gid = spa_guid(remdev->l2ad_spa);
9937+
rva->rva_spa_gid = spa_load_guid(spa);
98569938
rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid;
98579939

98589940
/*
@@ -9868,6 +9950,7 @@ l2arc_remove_vdev(vdev_t *vd)
98689950
asynchronous = B_FALSE;
98699951
}
98709952
mutex_exit(&l2arc_rebuild_thr_lock);
9953+
rva->rva_async = asynchronous;
98719954

98729955
/*
98739956
* Remove device from global list
@@ -9885,13 +9968,17 @@ l2arc_remove_vdev(vdev_t *vd)
98859968
}
98869969
mutex_exit(&l2arc_dev_mtx);
98879970

9888-
/*
9889-
* If possible, the teardown is completed asynchronously
9890-
*/
9891-
if (!asynchronous || taskq_dispatch(arc_flush_taskq,
9892-
l2arc_device_teardown, rva, TQ_SLEEP) == TASKQID_INVALID) {
9971+
if (!asynchronous) {
98939972
l2arc_device_teardown(rva);
9973+
return;
98949974
}
9975+
9976+
arc_async_flush_t *af = arc_async_flush_add(rva->rva_spa_gid, 2);
9977+
9978+
mutex_enter(&arc_async_flush_lock);
9979+
taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva,
9980+
TQ_SLEEP, &af->af_tqent);
9981+
mutex_exit(&arc_async_flush_lock);
98959982
}
98969983

98979984
void

module/zfs/spa_misc.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1588,6 +1588,34 @@ spa_generate_guid(spa_t *spa)
15881588
return (guid);
15891589
}
15901590

1591+
static boolean_t
1592+
spa_load_guid_exists(uint64_t guid)
1593+
{
1594+
avl_tree_t *t = &spa_namespace_avl;
1595+
1596+
ASSERT(MUTEX_HELD(&spa_namespace_lock));
1597+
1598+
for (spa_t *spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1599+
if (spa_load_guid(spa) == guid)
1600+
return (B_TRUE);
1601+
}
1602+
1603+
return (arc_async_flush_guid_inuse(guid));
1604+
}
1605+
1606+
uint64_t
1607+
spa_generate_load_guid(void)
1608+
{
1609+
uint64_t guid;
1610+
1611+
do {
1612+
(void) random_get_pseudo_bytes((void *)&guid,
1613+
sizeof (guid));
1614+
} while (guid == 0 || spa_load_guid_exists(guid));
1615+
1616+
return (guid);
1617+
}
1618+
15911619
void
15921620
snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
15931621
{

module/zfs/vdev.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
647647
if (spa->spa_root_vdev == NULL) {
648648
ASSERT(ops == &vdev_root_ops);
649649
spa->spa_root_vdev = vd;
650-
spa->spa_load_guid = spa_generate_guid(NULL);
650+
spa->spa_load_guid = spa_generate_load_guid();
651651
}
652652

653653
if (guid == 0 && ops != &vdev_hole_ops) {

0 commit comments

Comments
 (0)