Skip to content

Commit 8469b5a

Browse files
authored
Another set of vdev queue optimizations.
Switch FIFO queues (SYNC/TRIM) and active queue of vdev queue from time-sorted AVL-trees to simple lists. AVL-trees are too expensive for such a simple task. To change I/O priority without searching through the trees, add io_queue_state field to struct zio. To avoid checking the number of queued I/Os for each priority, add vq_cqueued bitmap to struct vdev_queue. Update it when adding/removing I/Os. Make vq_cactive a separate array instead of struct vdev_queue_class member. Together those allow to avoid lots of cache misses when looking for work in vdev_queue_class_to_issue(). Introduce deadline of ~0.5s for LBA-sorted queues. Before this I saw some I/Os waiting in a queue for up to 8 seconds and possibly more due to starvation. With this change I no longer see it. I had to make the comparison function slightly more complicated, but since it uses all the same cache lines the difference is minimal. For sequential I/Os the new code in vdev_queue_io_to_issue() actually often uses the simpler avl_first(), falling back to avl_find() and avl_nearest() only when needed. Arrange members in struct zio to access only one cache line when searching through vdev queues. While there, remove io_alloc_node, reusing the io_queue_node instead. Those two are never used at the same time. Remove zfs_vdev_aggregate_trim parameter. It was disabled for 4 years since being implemented, while still wasting time maintaining the offset-sorted tree of TRIM requests. Just remove the tree. Remove locking from txg_all_lists_empty(). It is racy by design, while 2 pairs of locks/unlocks take noticeable time under the vdev queue lock. With these changes in my tests with volblocksize=4KB I measure vdev queue lock spin time reduction by 50% on read and 75% on write. Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes #14925
1 parent 35a6247 commit 8469b5a

File tree

8 files changed

+205
-172
lines changed

8 files changed

+205
-172
lines changed

include/sys/vdev.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
164164
extern void vdev_queue_io_done(zio_t *zio);
165165
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
166166

167-
extern int vdev_queue_length(vdev_t *vd);
167+
extern uint32_t vdev_queue_length(vdev_t *vd);
168168
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
169+
extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
169170

170171
extern void vdev_config_dirty(vdev_t *vd);
171172
extern void vdev_config_clean(vdev_t *vd);

include/sys/vdev_impl.h

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
130130
/*
131131
* Virtual device properties
132132
*/
133-
typedef struct vdev_queue_class {
134-
uint32_t vqc_active;
135-
136-
/*
137-
* Sorted by offset or timestamp, depending on if the queue is
138-
* LBA-ordered vs FIFO.
139-
*/
140-
avl_tree_t vqc_queued_tree;
133+
typedef union vdev_queue_class {
134+
list_t vqc_list;
135+
avl_tree_t vqc_tree;
141136
} vdev_queue_class_t;
142137

143138
struct vdev_queue {
144139
vdev_t *vq_vdev;
145140
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
146-
avl_tree_t vq_active_tree;
147141
avl_tree_t vq_read_offset_tree;
148142
avl_tree_t vq_write_offset_tree;
149-
avl_tree_t vq_trim_offset_tree;
150143
uint64_t vq_last_offset;
151144
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
145+
uint32_t vq_cqueued; /* Classes with queued I/Os. */
146+
uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
147+
uint32_t vq_active; /* Number of active I/Os. */
152148
uint32_t vq_ia_active; /* Active interactive I/Os. */
153149
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
150+
list_t vq_active_list; /* List of active I/Os. */
154151
hrtime_t vq_io_complete_ts; /* time last i/o completed */
155152
hrtime_t vq_io_delta_ts;
156153
zio_t vq_io_search; /* used as local for stack reduction */

include/sys/zio.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,12 @@ typedef struct zio_link {
436436
list_node_t zl_child_node;
437437
} zio_link_t;
438438

439+
enum zio_qstate {
440+
ZIO_QS_NONE = 0,
441+
ZIO_QS_QUEUED,
442+
ZIO_QS_ACTIVE,
443+
};
444+
439445
struct zio {
440446
/* Core information about this I/O */
441447
zbookmark_phys_t io_bookmark;
@@ -479,16 +485,19 @@ struct zio {
479485
const zio_vsd_ops_t *io_vsd_ops;
480486
metaslab_class_t *io_metaslab_class; /* dva throttle class */
481487

488+
enum zio_qstate io_queue_state; /* vdev queue state */
489+
union {
490+
list_node_t l;
491+
avl_node_t a;
492+
} io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
493+
avl_node_t io_offset_node; /* vdev offset queues */
482494
uint64_t io_offset;
483495
hrtime_t io_timestamp; /* submitted at */
484496
hrtime_t io_queued_timestamp;
485497
hrtime_t io_target_timestamp;
486498
hrtime_t io_delta; /* vdev queue service delta */
487499
hrtime_t io_delay; /* Device access time (disk or */
488500
/* file). */
489-
avl_node_t io_queue_node;
490-
avl_node_t io_offset_node;
491-
avl_node_t io_alloc_node;
492501
zio_alloc_list_t io_alloc_list;
493502

494503
/* Internal pipeline state */

man/man4/zfs.4

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
20162016
Flush dirty data to disk at least every this many seconds (maximum TXG
20172017
duration).
20182018
.
2019-
.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
2020-
Allow TRIM I/O operations to be aggregated.
2021-
This is normally not helpful because the extents to be trimmed
2022-
will have been already been aggregated by the metaslab.
2023-
This option is provided for debugging and performance analysis.
2024-
.
20252019
.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
20262020
Max vdev I/O aggregation size.
20272021
.

module/zfs/spa_misc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
730730
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
731731
NULL);
732732
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
733-
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
733+
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
734734
}
735735
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
736736
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));

module/zfs/txg.c

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
895895
boolean_t
896896
txg_all_lists_empty(txg_list_t *tl)
897897
{
898-
mutex_enter(&tl->tl_lock);
899-
for (int i = 0; i < TXG_SIZE; i++) {
900-
if (!txg_list_empty_impl(tl, i)) {
901-
mutex_exit(&tl->tl_lock);
902-
return (B_FALSE);
903-
}
904-
}
905-
mutex_exit(&tl->tl_lock);
906-
return (B_TRUE);
898+
boolean_t res = B_TRUE;
899+
for (int i = 0; i < TXG_SIZE; i++)
900+
res &= (tl->tl_head[i] == NULL);
901+
return (res);
907902
}
908903

909904
/*

module/zfs/vdev.c

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
46084608

46094609
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
46104610

4611-
for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
4612-
vsx->vsx_active_queue[t] =
4613-
vd->vdev_queue.vq_class[t].vqc_active;
4614-
vsx->vsx_pend_queue[t] = avl_numnodes(
4615-
&vd->vdev_queue.vq_class[t].vqc_queued_tree);
4611+
for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4612+
vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
4613+
vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
46164614
}
46174615
}
46184616
}
@@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
54705468
vdev_queue_t *vq = &vd->vdev_queue;
54715469

54725470
mutex_enter(&vq->vq_lock);
5473-
if (avl_numnodes(&vq->vq_active_tree) > 0) {
5471+
if (vq->vq_active > 0) {
54745472
spa_t *spa = vd->vdev_spa;
54755473
zio_t *fio;
54765474
uint64_t delta;
54775475

5478-
zfs_dbgmsg("slow vdev: %s has %lu active IOs",
5479-
vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
5476+
zfs_dbgmsg("slow vdev: %s has %u active IOs",
5477+
vd->vdev_path, vq->vq_active);
54805478

54815479
/*
54825480
* Look at the head of all the pending queues,
54835481
* if any I/O has been outstanding for longer than
54845482
* the spa_deadman_synctime invoke the deadman logic.
54855483
*/
5486-
fio = avl_first(&vq->vq_active_tree);
5484+
fio = list_head(&vq->vq_active_list);
54875485
delta = gethrtime() - fio->io_timestamp;
54885486
if (delta > spa_deadman_synctime(spa))
54895487
zio_deadman(fio, tag);

0 commit comments

Comments
 (0)