Skip to content

Commit 2d126a9

Browse files
committed
Reduce latency effects of non-interactive I/O.
Investigating the influence of scrub (especially sequential) on random read latency I've noticed that on some HDDs a single 4KB read may take up to 4 seconds! Deeper investigation showed that many HDDs heavily prioritize sequential reads even when those are submitted with a queue depth of 1. This patch addresses the latency from two sides: - by using _min_active queue depths for non-interactive requests while the interactive request(s) are active and for a few requests after; - by throttling it further if no interactive requests have completed while a configured number of non-interactive ones have. While there, I've also modified vdev_queue_class_to_issue() to give more chances to schedule at least _min_active requests to the lowest priorities. It should reduce starvation if several non-interactive processes are running at the same time as some interactive ones, and I think it should make it possible to set zfs_vdev_max_active as low as 1. I've benchmarked this change with 4KB random reads from a ZVOL with 16KB block size on a newly written non-fragmented pool. On a fragmented pool I also saw improvements, but not so dramatic. Below are log2 histograms of the random read latency in milliseconds for different devices: 4 2x mirror vdevs of SATA HDD WDC WD20EFRX-68EUZN0 before: 0, 0, 2, 1, 12, 21, 19, 18, 10, 15, 17, 21 after: 0, 0, 0, 24, 101, 195, 419, 250, 47, 4, 0, 0 , that means a maximum latency reduction from 2s to 500ms. 4 2x mirror vdevs of SATA HDD WDC WD80EFZX-68UW8N0 before: 0, 0, 2, 31, 38, 28, 18, 12, 17, 20, 24, 10, 3 after: 0, 0, 55, 247, 455, 470, 412, 181, 36, 0, 0, 0, 0 , i.e. from 4s to 250ms. 1 SAS HDD SEAGATE ST14000NM0048 before: 0, 0, 29, 70, 107, 45, 27, 1, 0, 0, 1, 4, 19 after: 1, 29, 681, 1261, 676, 1633, 67, 1, 0, 0, 0, 0, 0 , i.e. from 4s to 125ms. 1 SAS SSD SEAGATE XS3840TE70014 before (microseconds): 0, 0, 0, 0, 0, 0, 0, 0, 70, 18343, 82548, 618 after: 0, 0, 0, 0, 0, 0, 0, 0, 283, 92351, 34844, 90 I've also measured scrub time during the test and on idle pools. 
On an idle fragmented pool I've measured scrub getting a few percent faster due to the use of QD3 instead of QD2 before. On an idle non-fragmented pool I've measured no difference. On a busy non-fragmented pool I've measured a scrub time increase of about 1.5-1.7x, while the IOPS increase reached 5-9x. Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc.
1 parent 52e585a commit 2d126a9

File tree

3 files changed

+134
-12
lines changed

3 files changed

+134
-12
lines changed

include/sys/vdev_impl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ struct vdev_queue {
148148
avl_tree_t vq_write_offset_tree;
149149
avl_tree_t vq_trim_offset_tree;
150150
uint64_t vq_last_offset;
151+
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
152+
uint32_t vq_ia_active; /* Active interactive I/Os. */
153+
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
151154
hrtime_t vq_io_complete_ts; /* time last i/o completed */
152155
hrtime_t vq_io_delta_ts;
153156
zio_t vq_io_search; /* used as local for stack reduction */

man/man5/zfs-module-parameters.5

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2165,6 +2165,40 @@ See the section "ZFS I/O SCHEDULER".
21652165
Default value: \fB1\fR.
21662166
.RE
21672167

2168+
.sp
2169+
.ne 2
2170+
.na
2171+
\fBzfs_vdev_nia_delay\fR (int)
2172+
.ad
2173+
.RS 12n
2174+
To reduce the effects of non-interactive I/O on interactive I/O latency
2175+
they are limited to *_min_active while there are outstanding interactive
2176+
I/Os, or until an additional zfs_vdev_nia_delay I/Os complete after the
2177+
last interactive I/O.
2178+
See the section "ZFS I/O SCHEDULER".
2179+
.sp
2180+
Default value: \fB5\fR.
2181+
.RE
2182+
2183+
.sp
2184+
.ne 2
2185+
.na
2186+
\fBzfs_vdev_nia_credit\fR (int)
2187+
.ad
2188+
.RS 12n
2189+
Some HDDs tend to prioritize sequential I/O so high, that concurrent
2190+
random I/O latency reaches several seconds. On some HDDs it happens
2191+
even if sequential I/Os are submitted one at a time, and so setting
2192+
*_max_active to 1 does not help. To prevent non-interactive I/Os, like
2193+
scrub, from monopolizing the device no more than zfs_vdev_nia_credit
2194+
I/Os can be sent while there are outstanding incomplete interactive
2195+
I/Os. This enforced wait ensures the HDD services the interactive I/O
2196+
within a reasonable amount of time.
2197+
See the section "ZFS I/O SCHEDULER".
2198+
.sp
2199+
Default value: \fB5\fR.
2200+
.RE
2201+
21682202
.sp
21692203
.ne 2
21702204
.na

module/zfs/vdev_queue.c

Lines changed: 97 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ uint32_t zfs_vdev_async_read_max_active = 3;
151151
uint32_t zfs_vdev_async_write_min_active = 2;
152152
uint32_t zfs_vdev_async_write_max_active = 10;
153153
uint32_t zfs_vdev_scrub_min_active = 1;
154-
uint32_t zfs_vdev_scrub_max_active = 2;
154+
uint32_t zfs_vdev_scrub_max_active = 3;
155155
uint32_t zfs_vdev_removal_min_active = 1;
156156
uint32_t zfs_vdev_removal_max_active = 2;
157157
uint32_t zfs_vdev_initializing_min_active = 1;
@@ -171,6 +171,26 @@ uint32_t zfs_vdev_rebuild_max_active = 3;
171171
int zfs_vdev_async_write_active_min_dirty_percent = 30;
172172
int zfs_vdev_async_write_active_max_dirty_percent = 60;
173173

174+
/*
175+
* To reduce the effects of non-interactive I/O on interactive I/O latency
176+
* they are limited to *_min_active while there are outstanding interactive
177+
* I/Os, or until an additional zfs_vdev_nia_delay I/Os complete after the
178+
* last interactive I/O.
179+
*/
180+
u_int zfs_vdev_nia_delay = 5;
181+
182+
/*
183+
* Some HDDs tend to prioritize sequential I/O so high that concurrent
184+
* random I/O latency reaches several seconds. On some HDDs it happens
185+
* even if sequential I/Os are submitted one at a time, and so setting
186+
* *_max_active to 1 does not help. To prevent non-interactive I/Os, like
187+
* scrub, from monopolizing the device no more than zfs_vdev_nia_credit
188+
* I/Os can be sent while there are outstanding incomplete interactive
189+
* I/Os. This enforced wait ensures the HDD services the interactive I/O
190+
* within a reasonable amount of time.
191+
*/
192+
u_int zfs_vdev_nia_credit = 5;
193+
174194
/*
175195
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
176196
* For read I/Os, we also aggregate across small adjacency gaps; for writes
@@ -261,7 +281,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
261281
}
262282

263283
static int
264-
vdev_queue_class_min_active(zio_priority_t p)
284+
vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
265285
{
266286
switch (p) {
267287
case ZIO_PRIORITY_SYNC_READ:
@@ -273,15 +293,19 @@ vdev_queue_class_min_active(zio_priority_t p)
273293
case ZIO_PRIORITY_ASYNC_WRITE:
274294
return (zfs_vdev_async_write_min_active);
275295
case ZIO_PRIORITY_SCRUB:
276-
return (zfs_vdev_scrub_min_active);
296+
return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
297+
MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
277298
case ZIO_PRIORITY_REMOVAL:
278-
return (zfs_vdev_removal_min_active);
299+
return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
300+
MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
279301
case ZIO_PRIORITY_INITIALIZING:
280-
return (zfs_vdev_initializing_min_active);
302+
return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active:
303+
MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
281304
case ZIO_PRIORITY_TRIM:
282305
return (zfs_vdev_trim_min_active);
283306
case ZIO_PRIORITY_REBUILD:
284-
return (zfs_vdev_rebuild_min_active);
307+
return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
308+
MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
285309
default:
286310
panic("invalid priority %u", p);
287311
return (0);
@@ -337,7 +361,7 @@ vdev_queue_max_async_writes(spa_t *spa)
337361
}
338362

339363
static int
340-
vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
364+
vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
341365
{
342366
switch (p) {
343367
case ZIO_PRIORITY_SYNC_READ:
@@ -349,14 +373,34 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
349373
case ZIO_PRIORITY_ASYNC_WRITE:
350374
return (vdev_queue_max_async_writes(spa));
351375
case ZIO_PRIORITY_SCRUB:
376+
if (vq->vq_ia_active > 0) {
377+
return (MIN(vq->vq_nia_credit,
378+
zfs_vdev_scrub_min_active));
379+
} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
380+
return (zfs_vdev_scrub_min_active);
352381
return (zfs_vdev_scrub_max_active);
353382
case ZIO_PRIORITY_REMOVAL:
383+
if (vq->vq_ia_active > 0) {
384+
return (MIN(vq->vq_nia_credit,
385+
zfs_vdev_removal_min_active));
386+
} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
387+
return (zfs_vdev_removal_min_active);
354388
return (zfs_vdev_removal_max_active);
355389
case ZIO_PRIORITY_INITIALIZING:
390+
if (vq->vq_ia_active > 0) {
391+
return (MIN(vq->vq_nia_credit,
392+
zfs_vdev_initializing_min_active));
393+
} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
394+
return (zfs_vdev_initializing_min_active);
356395
return (zfs_vdev_initializing_max_active);
357396
case ZIO_PRIORITY_TRIM:
358397
return (zfs_vdev_trim_max_active);
359398
case ZIO_PRIORITY_REBUILD:
399+
if (vq->vq_ia_active > 0) {
400+
return (MIN(vq->vq_nia_credit,
401+
zfs_vdev_rebuild_min_active));
402+
} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
403+
return (zfs_vdev_rebuild_min_active);
360404
return (zfs_vdev_rebuild_max_active);
361405
default:
362406
panic("invalid priority %u", p);
@@ -372,17 +416,24 @@ static zio_priority_t
372416
vdev_queue_class_to_issue(vdev_queue_t *vq)
373417
{
374418
spa_t *spa = vq->vq_vdev->vdev_spa;
375-
zio_priority_t p;
419+
zio_priority_t p, n;
376420

377421
if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
378422
return (ZIO_PRIORITY_NUM_QUEUEABLE);
379423

380-
/* find a queue that has not reached its minimum # outstanding i/os */
381-
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
424+
/*
425+
* Find a queue that has not reached its minimum # outstanding i/os.
426+
* Do round-robin to reduce starvation due to zfs_vdev_max_active
427+
* and vq_nia_credit limits.
428+
*/
429+
for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
430+
p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
382431
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
383432
vq->vq_class[p].vqc_active <
384-
vdev_queue_class_min_active(p))
433+
vdev_queue_class_min_active(vq, p)) {
434+
vq->vq_last_prio = p;
385435
return (p);
436+
}
386437
}
387438

388439
/*
@@ -392,8 +443,10 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
392443
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
393444
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
394445
vq->vq_class[p].vqc_active <
395-
vdev_queue_class_max_active(spa, p))
446+
vdev_queue_class_max_active(spa, vq, p)) {
447+
vq->vq_last_prio = p;
396448
return (p);
449+
}
397450
}
398451

399452
/* No eligible queued i/os */
@@ -493,6 +546,20 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
493546
}
494547
}
495548

549+
static boolean_t
550+
vdev_queue_is_interactive(zio_priority_t p)
551+
{
552+
switch (p) {
553+
case ZIO_PRIORITY_SCRUB:
554+
case ZIO_PRIORITY_REMOVAL:
555+
case ZIO_PRIORITY_INITIALIZING:
556+
case ZIO_PRIORITY_REBUILD:
557+
return (B_FALSE);
558+
default:
559+
return (B_TRUE);
560+
}
561+
}
562+
496563
static void
497564
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
498565
{
@@ -502,6 +569,11 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
502569
ASSERT(MUTEX_HELD(&vq->vq_lock));
503570
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
504571
vq->vq_class[zio->io_priority].vqc_active++;
572+
if (vdev_queue_is_interactive(zio->io_priority)) {
573+
if (++vq->vq_ia_active == 1)
574+
vq->vq_nia_credit = 1;
575+
} else if (vq->vq_ia_active > 0)
576+
vq->vq_nia_credit--;
505577
avl_add(&vq->vq_active_tree, zio);
506578

507579
if (shk->kstat != NULL) {
@@ -520,6 +592,13 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
520592
ASSERT(MUTEX_HELD(&vq->vq_lock));
521593
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
522594
vq->vq_class[zio->io_priority].vqc_active--;
595+
if (vdev_queue_is_interactive(zio->io_priority)) {
596+
if (--vq->vq_ia_active == 0)
597+
vq->vq_nia_credit = 0;
598+
else
599+
vq->vq_nia_credit = zfs_vdev_nia_credit;
600+
} else if (vq->vq_ia_active == 0)
601+
vq->vq_nia_credit++;
523602
avl_remove(&vq->vq_active_tree, zio);
524603

525604
if (shk->kstat != NULL) {
@@ -1065,6 +1144,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
10651144
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
10661145
"Min active rebuild I/Os per vdev");
10671146

1147+
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
1148+
"Number of non-interactive I/Os to allow in sequence");
1149+
1150+
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
1151+
"Number of non-interactive I/Os before _max_active");
1152+
10681153
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
10691154
"Queue depth percentage for each top-level vdev");
10701155
/* END CSTYLED */

0 commit comments

Comments
 (0)