
Commit d6449cb

amotin authored and datacore-rm committed
Optimize allocation throttling
Remove mc_lock use from metaslab_class_throttle_*(). The math there is
based on refcounts and is therefore atomic, so the only possible race is
between zfs_refcount_count() and zfs_refcount_add(). But in most cases
metaslab_class_throttle_reserve() is called with the allocator lock held,
which covers the race. In cases where the lock is not held,
GANG_ALLOCATION() or METASLAB_MUST_RESERVE are set, so we do not use
zfs_refcount_count(). And even if some other scenario were possible, the
worst that could happen from this race is that a few more I/Os reach
allocation slightly earlier, which is not a problem.

Move the locks and data of different allocators into different cache
lines to avoid false sharing. Group the spa_alloc_* arrays together into
a single array of aligned struct spa_alloc, spa_allocs. Align struct
metaslab_class_allocator.

Reviewed-by: Paul Dagnelie <[email protected]>
Reviewed-by: Ryan Moeller <[email protected]>
Reviewed-by: Don Brady <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
Closes openzfs#12314
1 parent 661b1c8 commit d6449cb
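
To make the race argument in the commit message concrete, here is a
minimal user-space model, not ZFS code: a C11 atomic counter stands in
for zfs_refcount_t (an assumption; the real type has different
internals), and try_reserve()/unreserve() mirror the shape of
metaslab_class_throttle_reserve()/_unreserve() after this commit.

/*
 * Minimal model of the unlocked reserve path.  Illustration only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static atomic_uint_fast64_t reserved;   /* models mca_alloc_slots */

static bool
try_reserve(uint64_t slots, uint64_t max, bool must_reserve)
{
        /*
         * Race window: another thread may add between this load and
         * our adds below, so two threads can both pass the check and
         * briefly exceed max.
         */
        if (must_reserve ||
            atomic_load(&reserved) + slots <= max) {
                for (uint64_t d = 0; d < slots; d++)
                        atomic_fetch_add(&reserved, 1);
                return (true);
        }
        return (false);
}

static void
unreserve(uint64_t slots)
{
        for (uint64_t d = 0; d < slots; d++)
                atomic_fetch_sub(&reserved, 1);
}

Two threads can both pass the count check before either adds, so the
total may briefly overshoot max; the overshoot is bounded by the slots
of the racing callers, which is exactly the "few more I/Os reach
allocation earlier" case the message dismisses as harmless.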

File tree

6 files changed: +45 −58 lines

include/sys/metaslab_impl.h

Lines changed: 1 addition & 1 deletion

@@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator {
 	 */
 	uint64_t		mca_alloc_max_slots;
 	zfs_refcount_t		mca_alloc_slots;
-} metaslab_class_allocator_t;
+} ____cacheline_aligned metaslab_class_allocator_t;
 
 /*
  * A metaslab class encompasses a category of allocatable top-level vdevs.
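
____cacheline_aligned is not a compiler keyword; it is a macro defined
elsewhere in the tree. A plausible definition, assumed here rather than
quoted from the source, follows the Linux kernel's convention:

/* Assumed definition: GCC/Clang attribute, 64-byte line assumed. */
#define CACHE_LINE_SIZE         64
#define ____cacheline_aligned   __attribute__((aligned(CACHE_LINE_SIZE)))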

include/sys/spa_impl.h

Lines changed: 9 additions & 6 deletions

@@ -57,6 +57,11 @@
 extern "C" {
 #endif
 
+typedef struct spa_alloc {
+	kmutex_t	spaa_lock;
+	avl_tree_t	spaa_tree;
+} ____cacheline_aligned spa_alloc_t;
+
 typedef struct spa_error_entry {
 	zbookmark_phys_t	se_bookmark;
 	char			*se_name;
@@ -250,13 +255,11 @@ struct spa {
 	list_t		spa_config_dirty_list;	/* vdevs with dirty config */
 	list_t		spa_state_dirty_list;	/* vdevs with dirty state */
 	/*
-	 * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are
-	 * stored in spa_alloc_count. There is one tree and one lock for each
-	 * allocator, to help improve allocation performance in write-heavy
-	 * workloads.
+	 * spa_allocs is an array, whose length is stored in spa_alloc_count.
+	 * There is one tree and one lock for each allocator, to help improve
+	 * allocation performance in write-heavy workloads.
 	 */
-	kmutex_t	*spa_alloc_locks;
-	avl_tree_t	*spa_alloc_trees;
+	spa_alloc_t	*spa_allocs;
 	int		spa_alloc_count;
 
 	spa_aux_vdev_t	spa_spares;	/* hot spares */
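
The point of the new struct: with the old parallel arrays, several
allocators' mutexes packed into one cache line, so CPUs working on
different allocators still invalidated each other's lines (false
sharing). One aligned element per allocator keeps each allocator's lock
and tree on its own line(s). A self-contained sketch of that guarantee,
with pthread_mutex_t standing in for kmutex_t and a 64-byte line assumed:

#include <pthread.h>

#define CACHE_LINE      64      /* assumed line size */

/*
 * One element per allocator; the alignment pads sizeof up to a full
 * line, so slot[i] and slot[i + 1] can never share a cache line.
 */
struct alloc_slot {
        pthread_mutex_t lock;   /* stand-in for kmutex_t spaa_lock */
        void            *tree;  /* stand-in for avl_tree_t spaa_tree */
} __attribute__((aligned(CACHE_LINE)));

_Static_assert(sizeof (struct alloc_slot) % CACHE_LINE == 0,
    "each allocator occupies whole cache lines");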

module/zfs/metaslab.c

Lines changed: 4 additions & 16 deletions

@@ -5611,31 +5611,21 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
     zio_t *zio, int flags)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
-	uint64_t available_slots = 0;
-	boolean_t slot_reserved = B_FALSE;
 	uint64_t max = mca->mca_alloc_max_slots;
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	mutex_enter(&mc->mc_lock);
-
-	uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
-	if (reserved_slots < max)
-		available_slots = max - reserved_slots;
-
-	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
-	    flags & METASLAB_MUST_RESERVE) {
+	if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
+	    zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
 		/*
 		 * We reserve the slots individually so that we can unreserve
 		 * them individually when an I/O completes.
 		 */
 		for (int d = 0; d < slots; d++)
 			zfs_refcount_add(&mca->mca_alloc_slots, zio);
 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
-		slot_reserved = B_TRUE;
+		return (B_TRUE);
 	}
-
-	mutex_exit(&mc->mc_lock);
-	return (slot_reserved);
+	return (B_FALSE);
 }
 
@@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	mutex_enter(&mc->mc_lock);
 	for (int d = 0; d < slots; d++)
 		zfs_refcount_remove(&mca->mca_alloc_slots, zio);
-	mutex_exit(&mc->mc_lock);
 }
 
 static int
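
As a usage sketch for the simplified throttle path, building on the
atomic model shown after the commit message (compile the two blocks
together): many threads reserve and unreserve concurrently, and the
counter drains back to zero because slots are added and removed
individually. NTHREADS and MAX_SLOTS are arbitrary.

#include <pthread.h>
#include <stdio.h>

#define NTHREADS        8
#define MAX_SLOTS       4

static void *
worker(void *arg)
{
        (void) arg;
        for (int i = 0; i < 100000; i++) {
                if (try_reserve(1, MAX_SLOTS, false))
                        unreserve(1);
        }
        return (NULL);
}

int
main(void)
{
        pthread_t tids[NTHREADS];

        for (int i = 0; i < NTHREADS; i++)
                pthread_create(&tids[i], NULL, worker, NULL);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(tids[i], NULL);
        /* Every reserve was paired with an unreserve: prints 0. */
        printf("reserved = %llu\n",
            (unsigned long long)atomic_load(&reserved));
        return (0);
}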

module/zfs/spa.c

Lines changed: 6 additions & 6 deletions

@@ -9313,9 +9313,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 	spa->spa_sync_pass = 0;
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		mutex_enter(&spa->spa_alloc_locks[i]);
-		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
-		mutex_exit(&spa->spa_alloc_locks[i]);
+		mutex_enter(&spa->spa_allocs[i].spaa_lock);
+		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
+		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
@@ -9426,9 +9426,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 	dsl_pool_sync_done(dp, txg);
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		mutex_enter(&spa->spa_alloc_locks[i]);
-		VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
-		mutex_exit(&spa->spa_alloc_locks[i]);
+		mutex_enter(&spa->spa_allocs[i].spaa_lock);
+		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
+		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*

module/zfs/spa_misc.c

Lines changed: 9 additions & 12 deletions

@@ -703,13 +703,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		spa->spa_root = spa_strdup(altroot);
 
 	spa->spa_alloc_count = spa_allocators;
-	spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
-	    sizeof (kmutex_t), KM_SLEEP);
-	spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
-	    sizeof (avl_tree_t), KM_SLEEP);
+	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
+	    sizeof (spa_alloc_t), KM_SLEEP);
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
-		avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
+		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
 	}
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
@@ -802,13 +801,11 @@ spa_remove(spa_t *spa)
 	}
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
-		avl_destroy(&spa->spa_alloc_trees[i]);
-		mutex_destroy(&spa->spa_alloc_locks[i]);
+		avl_destroy(&spa->spa_allocs[i].spaa_tree);
+		mutex_destroy(&spa->spa_allocs[i].spaa_lock);
 	}
-	kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
-	    sizeof (kmutex_t));
-	kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
-	    sizeof (avl_tree_t));
+	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
+	    sizeof (spa_alloc_t));
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
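
A small detail behind the single kmem_zalloc(): for an aligned struct,
C rounds sizeof up to a multiple of the alignment, so spa_alloc_count *
sizeof (spa_alloc_t) automatically reserves whole cache lines per
allocator. A standalone check of that guarantee, 64-byte alignment
assumed:

#include <stdio.h>

struct padded {
        char payload;   /* 1 byte of real data */
} __attribute__((aligned(64)));

int
main(void)
{
        /* sizeof is rounded up to the alignment: prints 64. */
        printf("%zu\n", sizeof (struct padded));
        return (0);
}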

module/zfs/zio.c

Lines changed: 16 additions & 17 deletions

@@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 	zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
-		if (zio->io_metaslab_class == NULL)
-			zio->io_metaslab_class = pio->io_metaslab_class;
+		zio->io_metaslab_class = pio->io_metaslab_class;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -3384,9 +3383,9 @@ zio_io_to_allocate(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
-	ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
+	ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
 
-	zio = avl_first(&spa->spa_alloc_trees[allocator]);
+	zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
 	if (zio == NULL)
 		return (NULL);
 
@@ -3398,11 +3397,11 @@
 	 */
 	ASSERT3U(zio->io_allocator, ==, allocator);
 	if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
-	    zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
+	    zio->io_prop.zp_copies, allocator, zio, 0)) {
 		return (NULL);
 	}
 
-	avl_remove(&spa->spa_alloc_trees[allocator], zio);
+	avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
 	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 	return (zio);
@@ -3426,8 +3425,8 @@ zio_dva_throttle(zio_t *zio)
 		return (zio);
 	}
 
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
-
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
@@ -3439,14 +3438,14 @@ zio_dva_throttle(zio_t *zio)
 	 * into 2^20 block regions, and then hash based on the objset, object,
 	 * level, and region to accomplish both of these goals.
 	 */
-	zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+	int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
 	    bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
-	mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
-	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_allocator = allocator;
 	zio->io_metaslab_class = mc;
-	avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
-	nio = zio_io_to_allocate(spa, zio->io_allocator);
-	mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
+	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
+	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
+	nio = zio_io_to_allocate(spa, allocator);
+	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 	return (nio);
 }
 
@@ -3455,9 +3454,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator)
 {
 	zio_t *zio;
 
-	mutex_enter(&spa->spa_alloc_locks[allocator]);
+	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	zio = zio_io_to_allocate(spa, allocator);
-	mutex_exit(&spa->spa_alloc_locks[allocator]);
+	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 	if (zio == NULL)
 		return;
 
@@ -3647,8 +3646,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
 	 * some parallelism.
 	 */
 	int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
-	int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
-	    spa->spa_alloc_count;
+	int allocator = (uint_t)cityhash4(0, 0, 0,
+	    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, flags, &io_alloc_list, NULL, allocator);
 	*slog = (error == 0);
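
Both cityhash4() call sites now cast the 64-bit hash to uint_t before
the modulo, which likely keeps the division a cheap 32-bit unsigned
operation and guarantees a non-negative result before it is stored in
an int. A toy illustration of the selection scheme, with a
splitmix64-style mixer standing in as a hypothetical replacement for
cityhash4() (the real function hashes its four arguments properly):

#include <stdint.h>
#include <stdio.h>

typedef unsigned int uint_t;

static uint64_t
mix64(uint64_t x)       /* stand-in for cityhash4() */
{
        x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
        x ^= x >> 27; x *= 0x94d049bb133111ebULL;
        x ^= x >> 31;
        return (x);
}

int
main(void)
{
        int spa_alloc_count = 4;        /* number of allocators */
        uint64_t objset = 21, object = 7, level = 0, blkid = 123456789;

        /*
         * Same shape as zio_dva_throttle(): hash (objset, object,
         * level, 2^20-block region) so nearby blocks share an
         * allocator while large streams still spread across all of
         * them.
         */
        int allocator = (uint_t)mix64(objset ^ object ^ level ^
            (blkid >> 20)) % spa_alloc_count;
        printf("allocator %d\n", allocator);
        return (0);
}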
