Skip to content

Commit b85d266

Browse files
committed
Another set of vdev queue optimizations.
Switch FIFO queues (SYNC/TRIM) and the active queue of the vdev queue from time-sorted AVL-trees to simple lists. AVL-trees are too expensive for such a simple task. To change I/O priority without searching through the trees, add an io_queue_state field to struct zio. To avoid checking the number of queued I/Os for each priority, add a vq_cqueued bitmap to struct vdev_queue. Update it when adding/removing I/Os. Make vq_cactive a separate array instead of a struct vdev_queue_class member. Together these changes avoid lots of cache misses when looking for work in vdev_queue_class_to_issue(). Introduce a deadline of ~0.5s for LBA-sorted queues. Before this I saw some I/Os waiting in a queue for up to 8 seconds, and possibly more, due to starvation. With this change I no longer see it. I had to make the comparison function slightly more complicated, but since it uses all the same cache lines the difference is minimal. For sequential I/Os the new code in vdev_queue_io_to_issue() actually often uses the simpler avl_first(), falling back to avl_find() and avl_nearest() only when needed. Arrange members in struct zio to access only one cache line when searching through vdev queues. While there, remove io_alloc_node, reusing io_queue_node instead. Those two are never used at the same time. Remove the zfs_vdev_aggregate_trim parameter. It has been disabled for the 4 years since it was implemented, while time was still being wasted maintaining the offset-sorted tree of TRIM requests. Just remove the tree. Remove locking from txg_all_lists_empty(). It is racy by design, while 2 pairs of locks/unlocks take noticeable time under the vdev queue lock. With these changes, in my tests with volblocksize=4KB I measure a vdev queue lock spin time reduction of 50% on read and 75% on write. Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc.
1 parent 6c96269 commit b85d266

File tree

8 files changed

+205
-172
lines changed

8 files changed

+205
-172
lines changed

include/sys/vdev.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
170170
extern void vdev_queue_io_done(zio_t *zio);
171171
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
172172

173-
extern int vdev_queue_length(vdev_t *vd);
173+
extern uint32_t vdev_queue_length(vdev_t *vd);
174174
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
175+
extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
175176

176177
extern void vdev_config_dirty(vdev_t *vd);
177178
extern void vdev_config_clean(vdev_t *vd);

include/sys/vdev_impl.h

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -149,27 +149,24 @@ struct vdev_cache {
149149
kmutex_t vc_lock;
150150
};
151151

152-
typedef struct vdev_queue_class {
153-
uint32_t vqc_active;
154-
155-
/*
156-
* Sorted by offset or timestamp, depending on if the queue is
157-
* LBA-ordered vs FIFO.
158-
*/
159-
avl_tree_t vqc_queued_tree;
152+
typedef union vdev_queue_class {
153+
list_t vqc_list;
154+
avl_tree_t vqc_tree;
160155
} vdev_queue_class_t;
161156

162157
struct vdev_queue {
163158
vdev_t *vq_vdev;
164159
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
165-
avl_tree_t vq_active_tree;
166160
avl_tree_t vq_read_offset_tree;
167161
avl_tree_t vq_write_offset_tree;
168-
avl_tree_t vq_trim_offset_tree;
169162
uint64_t vq_last_offset;
170163
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
164+
uint32_t vq_cqueued; /* Classes with queued I/Os. */
165+
uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
166+
uint32_t vq_active; /* Number of active I/Os. */
171167
uint32_t vq_ia_active; /* Active interactive I/Os. */
172168
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
169+
list_t vq_active_list; /* List of active I/Os. */
173170
hrtime_t vq_io_complete_ts; /* time last i/o completed */
174171
hrtime_t vq_io_delta_ts;
175172
zio_t vq_io_search; /* used as local for stack reduction */

include/sys/zio.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,12 @@ typedef struct zio_link {
437437
list_node_t zl_child_node;
438438
} zio_link_t;
439439

440+
enum zio_qstate {
441+
ZIO_QS_NONE = 0,
442+
ZIO_QS_QUEUED,
443+
ZIO_QS_ACTIVE,
444+
};
445+
440446
struct zio {
441447
/* Core information about this I/O */
442448
zbookmark_phys_t io_bookmark;
@@ -481,16 +487,19 @@ struct zio {
481487
const zio_vsd_ops_t *io_vsd_ops;
482488
metaslab_class_t *io_metaslab_class; /* dva throttle class */
483489

490+
enum zio_qstate io_queue_state; /* vdev queue state */
491+
union {
492+
list_node_t l;
493+
avl_node_t a;
494+
} io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
495+
avl_node_t io_offset_node; /* vdev offset queues */
484496
uint64_t io_offset;
485497
hrtime_t io_timestamp; /* submitted at */
486498
hrtime_t io_queued_timestamp;
487499
hrtime_t io_target_timestamp;
488500
hrtime_t io_delta; /* vdev queue service delta */
489501
hrtime_t io_delay; /* Device access time (disk or */
490502
/* file). */
491-
avl_node_t io_queue_node;
492-
avl_node_t io_offset_node;
493-
avl_node_t io_alloc_node;
494503
zio_alloc_list_t io_alloc_list;
495504

496505
/* Internal pipeline state */

man/man4/zfs.4

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
20162016
Flush dirty data to disk at least every this many seconds (maximum TXG
20172017
duration).
20182018
.
2019-
.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
2020-
Allow TRIM I/O operations to be aggregated.
2021-
This is normally not helpful because the extents to be trimmed
2022-
will have been already been aggregated by the metaslab.
2023-
This option is provided for debugging and performance analysis.
2024-
.
20252019
.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
20262020
Max vdev I/O aggregation size.
20272021
.

module/zfs/spa_misc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
730730
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
731731
NULL);
732732
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
733-
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
733+
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
734734
}
735735
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
736736
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));

module/zfs/txg.c

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
895895
boolean_t
896896
txg_all_lists_empty(txg_list_t *tl)
897897
{
898-
mutex_enter(&tl->tl_lock);
899-
for (int i = 0; i < TXG_SIZE; i++) {
900-
if (!txg_list_empty_impl(tl, i)) {
901-
mutex_exit(&tl->tl_lock);
902-
return (B_FALSE);
903-
}
904-
}
905-
mutex_exit(&tl->tl_lock);
906-
return (B_TRUE);
898+
boolean_t res = B_TRUE;
899+
for (int i = 0; i < TXG_SIZE; i++)
900+
res &= (tl->tl_head[i] == NULL);
901+
return (res);
907902
}
908903

909904
/*

module/zfs/vdev.c

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4613,11 +4613,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
46134613

46144614
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
46154615

4616-
for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
4617-
vsx->vsx_active_queue[t] =
4618-
vd->vdev_queue.vq_class[t].vqc_active;
4619-
vsx->vsx_pend_queue[t] = avl_numnodes(
4620-
&vd->vdev_queue.vq_class[t].vqc_queued_tree);
4616+
for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4617+
vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
4618+
vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
46214619
}
46224620
}
46234621
}
@@ -5475,20 +5473,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
54755473
vdev_queue_t *vq = &vd->vdev_queue;
54765474

54775475
mutex_enter(&vq->vq_lock);
5478-
if (avl_numnodes(&vq->vq_active_tree) > 0) {
5476+
if (vq->vq_active > 0) {
54795477
spa_t *spa = vd->vdev_spa;
54805478
zio_t *fio;
54815479
uint64_t delta;
54825480

5483-
zfs_dbgmsg("slow vdev: %s has %lu active IOs",
5484-
vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
5481+
zfs_dbgmsg("slow vdev: %s has %u active IOs",
5482+
vd->vdev_path, vq->vq_active);
54855483

54865484
/*
54875485
* Look at the head of all the pending queues,
54885486
* if any I/O has been outstanding for longer than
54895487
* the spa_deadman_synctime invoke the deadman logic.
54905488
*/
5491-
fio = avl_first(&vq->vq_active_tree);
5489+
fio = list_head(&vq->vq_active_list);
54925490
delta = gethrtime() - fio->io_timestamp;
54935491
if (delta > spa_deadman_synctime(spa))
54945492
zio_deadman(fio, tag);

0 commit comments

Comments
 (0)