Skip to content

Commit a34161c

Browse files
committed
Remove ARC/ZIO physdone callbacks.
Those callbacks were introduced many years ago as part of a bigger patch to smoothen the write throttling within a txg. They allow to account completion of individual physical writes within a logical one, improving cases when some of physical writes complete much sooner than others, gradually opening the write throttle. Few years after that ZFS got allocation throttling, working on a level of logical writes and limiting number of writes queued to vdevs at any point, and so limiting latency distribution between the physical writes and especially writes of multiple copies. The addition of scheduling deadline I proposed in openzfs#14925 should further reduce the latency distribution. Grown memory sizes over the past 10 years should also reduce importance of the smoothing. While the use of physdone callback may still in theory provide some smoother throttling, there are cases where we simply can not afford it. Since dirty data accounting is protected by pool-wide lock, in case of 6-wide RAIDZ, for example, it requires us to take it 8 times per logical block write, creating huge lock contention. My tests of this patch show radical reduction of the lock spinning time on workloads when smaller blocks are written to RAIDZ pools, when each of the disks receives 8-16KB chunks, but the total rate reaching 100K+ blocks per second. Same time attempts to measure any write time fluctuations didn't show anything noticeable. While there, remove also io_child_count/io_parent_count counters. They are used only for couple assertions that can be avoided. Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc.
1 parent 8653f1d commit a34161c

File tree

8 files changed

+26
-143
lines changed

8 files changed

+26
-143
lines changed

include/sys/arc.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -304,9 +304,8 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
304304
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
305305
arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
306306
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
307-
arc_write_done_func_t *physdone, arc_write_done_func_t *done,
308-
void *priv, zio_priority_t priority, int zio_flags,
309-
const zbookmark_phys_t *zb);
307+
arc_write_done_func_t *done, void *priv, zio_priority_t priority,
308+
int zio_flags, const zbookmark_phys_t *zb);
310309

311310
arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv);
312311
void arc_remove_prune_callback(arc_prune_t *p);

include/sys/arc_impl.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,6 @@ struct arc_write_callback {
123123
void *awcb_private;
124124
arc_write_done_func_t *awcb_ready;
125125
arc_write_done_func_t *awcb_children_ready;
126-
arc_write_done_func_t *awcb_physdone;
127126
arc_write_done_func_t *awcb_done;
128127
arc_buf_t *awcb_buf;
129128
};

include/sys/zio.h

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,6 @@ struct zio {
461461
/* Callback info */
462462
zio_done_func_t *io_ready;
463463
zio_done_func_t *io_children_ready;
464-
zio_done_func_t *io_physdone;
465464
zio_done_func_t *io_done;
466465
void *io_private;
467466
int64_t io_prev_space_delta; /* DMU private */
@@ -504,9 +503,6 @@ struct zio {
504503
int io_error;
505504
int io_child_error[ZIO_CHILD_TYPES];
506505
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
507-
uint64_t io_child_count;
508-
uint64_t io_phys_children;
509-
uint64_t io_parent_count;
510506
uint64_t *io_stall;
511507
zio_t *io_gang_leader;
512508
zio_gang_node_t *io_gang_tree;
@@ -554,9 +550,8 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
554550
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
555551
struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
556552
zio_done_func_t *ready, zio_done_func_t *children_ready,
557-
zio_done_func_t *physdone, zio_done_func_t *done,
558-
void *priv, zio_priority_t priority, zio_flag_t flags,
559-
const zbookmark_phys_t *zb);
553+
zio_done_func_t *done, void *priv, zio_priority_t priority,
554+
zio_flag_t flags, const zbookmark_phys_t *zb);
560555

561556
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
562557
struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,

module/zfs/arc.c

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6676,18 +6676,6 @@ arc_write_children_ready(zio_t *zio)
66766676
callback->awcb_children_ready(zio, buf, callback->awcb_private);
66776677
}
66786678

6679-
/*
6680-
* The SPA calls this callback for each physical write that happens on behalf
6681-
* of a logical write. See the comment in dbuf_write_physdone() for details.
6682-
*/
6683-
static void
6684-
arc_write_physdone(zio_t *zio)
6685-
{
6686-
arc_write_callback_t *cb = zio->io_private;
6687-
if (cb->awcb_physdone != NULL)
6688-
cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
6689-
}
6690-
66916679
static void
66926680
arc_write_done(zio_t *zio)
66936681
{
@@ -6777,9 +6765,9 @@ zio_t *
67776765
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
67786766
blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
67796767
const zio_prop_t *zp, arc_write_done_func_t *ready,
6780-
arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
6781-
arc_write_done_func_t *done, void *private, zio_priority_t priority,
6782-
int zio_flags, const zbookmark_phys_t *zb)
6768+
arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
6769+
void *private, zio_priority_t priority, int zio_flags,
6770+
const zbookmark_phys_t *zb)
67836771
{
67846772
arc_buf_hdr_t *hdr = buf->b_hdr;
67856773
arc_write_callback_t *callback;
@@ -6826,7 +6814,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
68266814
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
68276815
callback->awcb_ready = ready;
68286816
callback->awcb_children_ready = children_ready;
6829-
callback->awcb_physdone = physdone;
68306817
callback->awcb_done = done;
68316818
callback->awcb_private = private;
68326819
callback->awcb_buf = buf;
@@ -6863,8 +6850,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
68636850
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
68646851
HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
68656852
(children_ready != NULL) ? arc_write_children_ready : NULL,
6866-
arc_write_physdone, arc_write_done, callback,
6867-
priority, zio_flags, zb);
6853+
arc_write_done, callback, priority, zio_flags, zb);
68686854

68696855
return (zio);
68706856
}

module/zfs/dbuf.c

Lines changed: 9 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -4369,22 +4369,6 @@ dbuf_lightweight_ready(zio_t *zio)
43694369
rw_exit(&parent_db->db_rwlock);
43704370
}
43714371

4372-
static void
4373-
dbuf_lightweight_physdone(zio_t *zio)
4374-
{
4375-
dbuf_dirty_record_t *dr = zio->io_private;
4376-
dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
4377-
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
4378-
4379-
/*
4380-
* The callback will be called io_phys_children times. Retire one
4381-
* portion of our dirty space each time we are called. Any rounding
4382-
* error will be cleaned up by dbuf_lightweight_done().
4383-
*/
4384-
int delta = dr->dr_accounted / zio->io_phys_children;
4385-
dsl_pool_undirty_space(dp, delta, zio->io_txg);
4386-
}
4387-
43884372
static void
43894373
dbuf_lightweight_done(zio_t *zio)
43904374
{
@@ -4403,16 +4387,8 @@ dbuf_lightweight_done(zio_t *zio)
44034387
dsl_dataset_block_born(ds, zio->io_bp, tx);
44044388
}
44054389

4406-
/*
4407-
* See comment in dbuf_write_done().
4408-
*/
4409-
if (zio->io_phys_children == 0) {
4410-
dsl_pool_undirty_space(dmu_objset_pool(os),
4411-
dr->dr_accounted, zio->io_txg);
4412-
} else {
4413-
dsl_pool_undirty_space(dmu_objset_pool(os),
4414-
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
4415-
}
4390+
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
4391+
zio->io_txg);
44164392

44174393
abd_free(dr->dt.dll.dr_abd);
44184394
kmem_free(dr, sizeof (*dr));
@@ -4446,8 +4422,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
44464422
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
44474423
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
44484424
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
4449-
dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
4450-
ZIO_PRIORITY_ASYNC_WRITE,
4425+
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
44514426
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
44524427

44534428
zio_nowait(dr->dr_zio);
@@ -4789,37 +4764,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
47894764
DB_DNODE_EXIT(db);
47904765
}
47914766

4792-
/*
4793-
* The SPA will call this callback several times for each zio - once
4794-
* for every physical child i/o (zio->io_phys_children times). This
4795-
* allows the DMU to monitor the progress of each logical i/o. For example,
4796-
* there may be 2 copies of an indirect block, or many fragments of a RAID-Z
4797-
* block. There may be a long delay before all copies/fragments are completed,
4798-
* so this callback allows us to retire dirty space gradually, as the physical
4799-
* i/os complete.
4800-
*/
4801-
static void
4802-
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
4803-
{
4804-
(void) buf;
4805-
dmu_buf_impl_t *db = arg;
4806-
objset_t *os = db->db_objset;
4807-
dsl_pool_t *dp = dmu_objset_pool(os);
4808-
dbuf_dirty_record_t *dr;
4809-
int delta = 0;
4810-
4811-
dr = db->db_data_pending;
4812-
ASSERT3U(dr->dr_txg, ==, zio->io_txg);
4813-
4814-
/*
4815-
* The callback will be called io_phys_children times. Retire one
4816-
* portion of our dirty space each time we are called. Any rounding
4817-
* error will be cleaned up by dbuf_write_done().
4818-
*/
4819-
delta = dr->dr_accounted / zio->io_phys_children;
4820-
dsl_pool_undirty_space(dp, delta, zio->io_txg);
4821-
}
4822-
48234767
static void
48244768
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
48254769
{
@@ -4894,27 +4838,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
48944838
db->db_data_pending = NULL;
48954839
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
48964840

4897-
/*
4898-
* If we didn't do a physical write in this ZIO and we
4899-
* still ended up here, it means that the space of the
4900-
* dbuf that we just released (and undirtied) above hasn't
4901-
* been marked as undirtied in the pool's accounting.
4902-
*
4903-
* Thus, we undirty that space in the pool's view of the
4904-
* world here. For physical writes this type of update
4905-
* happens in dbuf_write_physdone().
4906-
*
4907-
* If we did a physical write, cleanup any rounding errors
4908-
* that came up due to writing multiple copies of a block
4909-
* on disk [see dbuf_write_physdone()].
4910-
*/
4911-
if (zio->io_phys_children == 0) {
4912-
dsl_pool_undirty_space(dmu_objset_pool(os),
4913-
dr->dr_accounted, zio->io_txg);
4914-
} else {
4915-
dsl_pool_undirty_space(dmu_objset_pool(os),
4916-
dr->dr_accounted % zio->io_phys_children, zio->io_txg);
4917-
}
4841+
dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
4842+
zio->io_txg);
49184843

49194844
kmem_free(dr, sizeof (dbuf_dirty_record_t));
49204845
}
@@ -5162,7 +5087,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
51625087

51635088
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
51645089
contents, db->db.db_size, db->db.db_size, &zp,
5165-
dbuf_write_override_ready, NULL, NULL,
5090+
dbuf_write_override_ready, NULL,
51665091
dbuf_write_override_done,
51675092
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
51685093
mutex_enter(&db->db_mtx);
@@ -5176,7 +5101,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
51765101
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
51775102
dr->dr_zio = zio_write(pio, os->os_spa, txg,
51785103
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
5179-
dbuf_write_nofill_ready, NULL, NULL,
5104+
dbuf_write_nofill_ready, NULL,
51805105
dbuf_write_nofill_done, db,
51815106
ZIO_PRIORITY_ASYNC_WRITE,
51825107
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@@ -5195,9 +5120,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
51955120
dr->dr_zio = arc_write(pio, os->os_spa, txg,
51965121
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
51975122
dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
5198-
children_ready_cb, dbuf_write_physdone,
5199-
dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
5200-
ZIO_FLAG_MUSTSUCCEED, &zb);
5123+
children_ready_cb, dbuf_write_done, db,
5124+
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
52015125
}
52025126
}
52035127

module/zfs/dmu.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,7 +1698,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
16981698
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
16991699
abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
17001700
zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
1701-
dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
1701+
dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
17021702
dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
17031703

17041704
return (0);
@@ -1864,7 +1864,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
18641864

18651865
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
18661866
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
1867-
&zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
1867+
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
18681868
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
18691869

18701870
return (0);

module/zfs/dmu_objset.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1698,7 +1698,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
16981698

16991699
zio = arc_write(pio, os->os_spa, tx->tx_txg,
17001700
blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
1701-
&zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
1701+
&zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
17021702
os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
17031703

17041704
/*

module/zfs/zio.c

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -650,9 +650,6 @@ zio_add_child(zio_t *pio, zio_t *cio)
650650
list_insert_head(&pio->io_child_list, zl);
651651
list_insert_head(&cio->io_parent_list, zl);
652652

653-
pio->io_child_count++;
654-
cio->io_parent_count++;
655-
656653
mutex_exit(&cio->io_lock);
657654
mutex_exit(&pio->io_lock);
658655
}
@@ -669,9 +666,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
669666
list_remove(&pio->io_child_list, zl);
670667
list_remove(&cio->io_parent_list, zl);
671668

672-
pio->io_child_count--;
673-
cio->io_parent_count--;
674-
675669
mutex_exit(&cio->io_lock);
676670
mutex_exit(&pio->io_lock);
677671
kmem_cache_free(zio_link_cache, zl);
@@ -1162,9 +1156,8 @@ zio_t *
11621156
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
11631157
abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
11641158
zio_done_func_t *ready, zio_done_func_t *children_ready,
1165-
zio_done_func_t *physdone, zio_done_func_t *done,
1166-
void *private, zio_priority_t priority, zio_flag_t flags,
1167-
const zbookmark_phys_t *zb)
1159+
zio_done_func_t *done, void *private, zio_priority_t priority,
1160+
zio_flag_t flags, const zbookmark_phys_t *zb)
11681161
{
11691162
zio_t *zio;
11701163

@@ -1184,7 +1177,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
11841177

11851178
zio->io_ready = ready;
11861179
zio->io_children_ready = children_ready;
1187-
zio->io_physdone = physdone;
11881180
zio->io_prop = *zp;
11891181

11901182
/*
@@ -1517,16 +1509,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
15171509
flags &= ~ZIO_FLAG_IO_ALLOCATING;
15181510
}
15191511

1520-
15211512
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
15221513
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
15231514
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
15241515
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
15251516

1526-
zio->io_physdone = pio->io_physdone;
1527-
if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
1528-
zio->io_logical->io_phys_children++;
1529-
15301517
return (zio);
15311518
}
15321519

@@ -2717,7 +2704,7 @@ zio_gang_tree_assemble_done(zio_t *zio)
27172704
blkptr_t *bp = zio->io_bp;
27182705

27192706
ASSERT(gio == zio_unique_parent(zio));
2720-
ASSERT(zio->io_child_count == 0);
2707+
ASSERT(list_is_empty(&zio->io_child_list));
27212708

27222709
if (zio->io_error)
27232710
return;
@@ -2975,7 +2962,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
29752962
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
29762963
has_data ? abd_get_offset(pio->io_abd, pio->io_size -
29772964
resid) : NULL, lsize, lsize, &zp,
2978-
zio_write_gang_member_ready, NULL, NULL,
2965+
zio_write_gang_member_ready, NULL,
29792966
zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
29802967
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
29812968

@@ -3437,7 +3424,7 @@ zio_ddt_write(zio_t *zio)
34373424
} else {
34383425
cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
34393426
zio->io_orig_size, zio->io_orig_size, zp,
3440-
zio_ddt_child_write_ready, NULL, NULL,
3427+
zio_ddt_child_write_ready, NULL,
34413428
zio_ddt_child_write_done, dde, zio->io_priority,
34423429
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
34433430

@@ -4147,13 +4134,6 @@ zio_vdev_io_assess(zio_t *zio)
41474134
if (zio->io_error)
41484135
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
41494136

4150-
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
4151-
zio->io_physdone != NULL) {
4152-
ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
4153-
ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
4154-
zio->io_physdone(zio->io_logical);
4155-
}
4156-
41574137
return (zio);
41584138
}
41594139

@@ -4903,7 +4883,7 @@ zio_done(zio_t *zio)
49034883
return (NULL);
49044884
}
49054885

4906-
ASSERT(zio->io_child_count == 0);
4886+
ASSERT(list_is_empty(&zio->io_child_list));
49074887
ASSERT(zio->io_reexecute == 0);
49084888
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
49094889

0 commit comments

Comments
 (0)