Skip to content

Commit 284d6e8

Browse files
committed
ZIL: Detect single-threaded workloads
... by checking that the previous block is fully written and flushed. This allows commit delays to be skipped, since we can give up on aggregation in that case. This removes the zil_min_commit_timeout parameter, since for single-threaded workloads it is not needed at all, while on very fast devices even some multi-threaded workloads may be detected as single-threaded and still bypass the wait. To give multi-threaded workloads more aggregation chances, increase zfs_commit_timeout_pct from 5 to 10%, as they should suffer less from the additional latency. Single-threaded workload detection also opens the door to better prediction of the next block size. Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc.
1 parent 008baa0 commit 284d6e8

File tree

3 files changed

+44
-60
lines changed

3 files changed

+44
-60
lines changed

include/sys/zil_impl.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ typedef struct zil_vdev_node {
181181
avl_node_t zv_node; /* AVL tree linkage */
182182
} zil_vdev_node_t;
183183

184+
#define ZIL_BURSTS 8
184185
#define ZIL_PREV_BLKS 16
185186

186187
/*
@@ -222,8 +223,9 @@ struct zilog {
222223
clock_t zl_replay_time; /* lbolt of when replay started */
223224
uint64_t zl_replay_blks; /* number of log blocks replayed */
224225
zil_header_t zl_old_header; /* debugging aid */
225-
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
226+
uint_t zl_parallel; /* workload is multi-threaded */
226227
uint_t zl_prev_rotor; /* rotor for zl_prev[] */
228+
uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
227229
txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */
228230
uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */
229231

man/man4/zfs.4

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,7 @@ Note that this should not be set below the ZED thresholds
798798
(currently 10 checksums over 10 seconds)
799799
or else the daemon may not trigger any action.
800800
.
801-
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 5 Ns % Pq uint
801+
.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
802802
This controls the amount of time that a ZIL block (lwb) will remain "open"
803803
when it isn't "full", and it has a thread waiting for it to be committed to
804804
stable storage.
@@ -2155,13 +2155,6 @@ This sets the maximum number of write bytes logged via WR_COPIED.
21552155
It tunes a tradeoff between additional memory copy and possibly worse log
21562156
space efficiency vs additional range lock/unlock.
21572157
.
2158-
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
2159-
This sets the minimum delay in nanoseconds ZIL care to delay block commit,
2160-
waiting for more records.
2161-
If ZIL writes are too fast, kernel may not be able sleep for so short interval,
2162-
increasing log latency above allowed by
2163-
.Sy zfs_commit_timeout_pct .
2164-
.
21652158
.It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
21662159
Disable the cache flush commands that are normally sent to disk by
21672160
the ZIL after an LWB write has completed.

module/zfs/zil.c

Lines changed: 40 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -91,15 +91,7 @@
9191
* committed to stable storage. Please refer to the zil_commit_waiter()
9292
* function (and the comments within it) for more details.
9393
*/
94-
static uint_t zfs_commit_timeout_pct = 5;
95-
96-
/*
97-
* Minimal time we care to delay commit waiting for more ZIL records.
98-
* At least FreeBSD kernel can't sleep for less than 2us at its best.
99-
* So requests to sleep for less then 5us is a waste of CPU time with
100-
* a risk of significant log latency increase due to oversleep.
101-
*/
102-
static uint64_t zil_min_commit_timeout = 5000;
94+
static uint_t zfs_commit_timeout_pct = 10;
10395

10496
/*
10597
* See zil.h for more information about these fields.
@@ -2696,6 +2688,19 @@ zil_commit_writer_stall(zilog_t *zilog)
26962688
ASSERT(list_is_empty(&zilog->zl_lwb_list));
26972689
}
26982690

2691+
static void
2692+
zil_burst_done(zilog_t *zilog)
2693+
{
2694+
if (!list_is_empty(&zilog->zl_itx_commit_list) ||
2695+
zilog->zl_cur_used == 0)
2696+
return;
2697+
2698+
if (zilog->zl_parallel)
2699+
zilog->zl_parallel--;
2700+
2701+
zilog->zl_cur_used = 0;
2702+
}
2703+
26992704
/*
27002705
* This function will traverse the commit list, creating new lwbs as
27012706
* needed, and committing the itxs from the commit list to these newly
@@ -2710,7 +2715,6 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
27102715
list_t nolwb_waiters;
27112716
lwb_t *lwb, *plwb;
27122717
itx_t *itx;
2713-
boolean_t first = B_TRUE;
27142718

27152719
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
27162720

@@ -2736,9 +2740,22 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
27362740
zil_commit_activate_saxattr_feature(zilog);
27372741
ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
27382742
lwb->lwb_state == LWB_STATE_OPENED);
2739-
first = (lwb->lwb_state == LWB_STATE_NEW) &&
2740-
((plwb = list_prev(&zilog->zl_lwb_list, lwb)) == NULL ||
2741-
plwb->lwb_state == LWB_STATE_FLUSH_DONE);
2743+
2744+
/*
2745+
	 * If the lwb is still opened, it means the workload is truly
2746+
	 * multi-threaded and we have won the chance of write aggregation.
2747+
	 * If it is not opened yet, but the previous lwb is still not
2748+
	 * flushed, it still means the workload is multi-threaded, but
2749+
	 * there was too much time between the commits to aggregate, so
2750+
	 * we will try aggregation next time, but without much hope.
2751+
*/
2752+
if (lwb->lwb_state == LWB_STATE_OPENED) {
2753+
zilog->zl_parallel = ZIL_BURSTS;
2754+
} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
2755+
!= NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
2756+
zilog->zl_parallel = MAX(zilog->zl_parallel,
2757+
ZIL_BURSTS / 2);
2758+
}
27422759
}
27432760

27442761
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
@@ -2813,7 +2830,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
28132830
* Our lwb is done, leave the rest of
28142831
* itx list to somebody else who care.
28152832
*/
2816-
first = B_FALSE;
2833+
zilog->zl_parallel = ZIL_BURSTS;
28172834
break;
28182835
}
28192836
} else {
@@ -2905,28 +2922,15 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
29052922
* try and pack as many itxs into as few lwbs as
29062923
* possible, without significantly impacting the latency
29072924
* of each individual itx.
2908-
*
2909-
* If we had no already running or open LWBs, it can be
2910-
* the workload is single-threaded. And if the ZIL write
2911-
* latency is very small or if the LWB is almost full, it
2912-
* may be cheaper to bypass the delay.
29132925
*/
2914-
if (lwb->lwb_state == LWB_STATE_OPENED && first) {
2915-
hrtime_t sleep = zilog->zl_last_lwb_latency *
2916-
zfs_commit_timeout_pct / 100;
2917-
if (sleep < zil_min_commit_timeout ||
2918-
lwb->lwb_nmax - lwb->lwb_nused <
2919-
lwb->lwb_nmax / 8) {
2920-
list_insert_tail(ilwbs, lwb);
2921-
lwb = zil_lwb_write_close(zilog, lwb,
2922-
LWB_STATE_NEW);
2923-
zilog->zl_cur_used = 0;
2924-
if (lwb == NULL) {
2925-
while ((lwb = list_remove_head(ilwbs))
2926-
!= NULL)
2927-
zil_lwb_write_issue(zilog, lwb);
2928-
zil_commit_writer_stall(zilog);
2929-
}
2926+
if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
2927+
list_insert_tail(ilwbs, lwb);
2928+
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
2929+
zil_burst_done(zilog);
2930+
if (lwb == NULL) {
2931+
while ((lwb = list_remove_head(ilwbs)) != NULL)
2932+
zil_lwb_write_issue(zilog, lwb);
2933+
zil_commit_writer_stall(zilog);
29302934
}
29312935
}
29322936
}
@@ -3084,19 +3088,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
30843088

30853089
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
30863090

3087-
/*
3088-
* Since the lwb's zio hadn't been issued by the time this thread
3089-
* reached its timeout, we reset the zilog's "zl_cur_used" field
3090-
* to influence the zil block size selection algorithm.
3091-
*
3092-
* By having to issue the lwb's zio here, it means the size of the
3093-
* lwb was too large, given the incoming throughput of itxs. By
3094-
* setting "zl_cur_used" to zero, we communicate this fact to the
3095-
* block size selection algorithm, so it can take this information
3096-
* into account, and potentially select a smaller size for the
3097-
* next lwb block that is allocated.
3098-
*/
3099-
zilog->zl_cur_used = 0;
3091+
zil_burst_done(zilog);
31003092

31013093
if (nlwb == NULL) {
31023094
/*
@@ -4214,9 +4206,6 @@ EXPORT_SYMBOL(zil_kstat_values_update);
42144206
ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
42154207
"ZIL block open timeout percentage");
42164208

4217-
ZFS_MODULE_PARAM(zfs_zil, zil_, min_commit_timeout, U64, ZMOD_RW,
4218-
"Minimum delay we care for ZIL block commit");
4219-
42204209
ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
42214210
"Disable intent logging replay");
42224211

0 commit comments

Comments
 (0)