Skip to content

Commit df8c9f3

Browse files
amotinbehlendorf
authored andcommitted
ZIL: Second attempt to reduce scope of zl_issuer_lock.
The previous patch #14841 appeared to have significant flaw, causing deadlocks if zl_get_data callback got blocked waiting for TXG sync. I already handled some of such cases in the original patch, but issue #14982 shown cases that were impossible to solve in that design. This patch fixes the problem by postponing log blocks allocation till the very end, just before the zios issue, leaving nothing blocking after that point to cause deadlocks. Before that point though any sleeps are now allowed, not causing sync thread blockage. This require slightly more complicated lwb state machine to allocate blocks and issue zios in proper order. But with removal of special early issue workarounds the new code is much cleaner now, and should even be more efficient. Since this patch uses null zios between write, I've found that null zios do not wait for logical children ready status in zio_ready(), that makes parent write to proceed prematurely, producing incorrect log blocks. Added ZIO_CHILD_LOGICAL_BIT to zio_wait_for_children() fixes it. Reviewed-by: Rob Norris <[email protected]> Reviewed-by: Mark Maybee <[email protected]> Reviewed-by: George Wilson <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes #15122
1 parent bb31ded commit df8c9f3

File tree

6 files changed

+355
-381
lines changed

6 files changed

+355
-381
lines changed

cmd/ztest.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
24122412
int error;
24132413

24142414
ASSERT3P(lwb, !=, NULL);
2415-
ASSERT3P(zio, !=, NULL);
24162415
ASSERT3U(size, !=, 0);
24172416

24182417
ztest_object_lock(zd, object, RL_READER);
@@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
24462445
DMU_READ_NO_PREFETCH);
24472446
ASSERT0(error);
24482447
} else {
2448+
ASSERT3P(zio, !=, NULL);
24492449
size = doi.doi_data_block_size;
24502450
if (ISP2(size)) {
24512451
offset = P2ALIGN(offset, size);

include/sys/zil_impl.h

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,22 @@ extern "C" {
3838
/*
3939
* Possible states for a given lwb structure.
4040
*
41-
* An lwb will start out in the "closed" state, and then transition to
42-
* the "opened" state via a call to zil_lwb_write_open(). When
43-
* transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
44-
* must be held.
41+
* An lwb will start out in the "new" state, and transition to the "opened"
42+
* state via a call to zil_lwb_write_open() on first itx assignment. When
43+
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
44+
* held.
4545
*
46-
* After the lwb is "opened", it can transition into the "issued" state
47-
* via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
48-
* be held when making this transition.
46+
* After the lwb is "opened", it can be assigned number of itxs and transition
47+
* into the "closed" state via zil_lwb_write_close() when full or on timeout.
48+
* When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock"
49+
* must be held. New lwb allocation also takes "zl_lock" to protect the list.
50+
*
51+
* After the lwb is "closed", it can transition into the "ready" state via
52+
* zil_lwb_write_issue(). "zl_lock" must be held when making this transition.
53+
* Since it is done by the same thread, "zl_issuer_lock" is not needed.
54+
*
55+
* When lwb in "ready" state receives its block pointer, it can transition to
56+
* "issued". "zl_lock" must be held when making this transition.
4957
*
5058
* After the lwb's write zio completes, it transitions into the "write
5159
* done" state via zil_lwb_write_done(); and then into the "flush done"
@@ -62,17 +70,20 @@ extern "C" {
6270
*
6371
* Additionally, correctness when reading an lwb's state is often
6472
* achieved by exploiting the fact that these state transitions occur in
65-
* this specific order; i.e. "closed" to "opened" to "issued" to "done".
73+
* this specific order; i.e. "new" to "opened" to "closed" to "ready" to
74+
* "issued" to "write_done" and finally "flush_done".
6675
*
67-
* Thus, if an lwb is in the "closed" or "opened" state, holding the
76+
* Thus, if an lwb is in the "new" or "opened" state, holding the
6877
* "zl_issuer_lock" will prevent a concurrent thread from transitioning
69-
* that lwb to the "issued" state. Likewise, if an lwb is already in the
70-
* "issued" state, holding the "zl_lock" will prevent a concurrent
71-
* thread from transitioning that lwb to the "write done" state.
78+
* that lwb to the "closed" state. Likewise, if an lwb is already in the
79+
* "ready" state, holding the "zl_lock" will prevent a concurrent thread
80+
* from transitioning that lwb to the "issued" state.
7281
*/
7382
typedef enum {
74-
LWB_STATE_CLOSED,
83+
LWB_STATE_NEW,
7584
LWB_STATE_OPENED,
85+
LWB_STATE_CLOSED,
86+
LWB_STATE_READY,
7687
LWB_STATE_ISSUED,
7788
LWB_STATE_WRITE_DONE,
7889
LWB_STATE_FLUSH_DONE,
@@ -91,17 +102,21 @@ typedef enum {
91102
typedef struct lwb {
92103
zilog_t *lwb_zilog; /* back pointer to log struct */
93104
blkptr_t lwb_blk; /* on disk address of this log blk */
105+
boolean_t lwb_slim; /* log block has slim format */
94106
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
95-
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
107+
int lwb_error; /* log block allocation error */
108+
int lwb_nmax; /* max bytes in the buffer */
96109
int lwb_nused; /* # used bytes in buffer */
97110
int lwb_nfilled; /* # filled bytes in buffer */
98111
int lwb_sz; /* size of block and buffer */
99112
lwb_state_t lwb_state; /* the state of this lwb */
100113
char *lwb_buf; /* log write buffer */
114+
zio_t *lwb_child_zio; /* parent zio for children */
101115
zio_t *lwb_write_zio; /* zio for the lwb buffer */
102116
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
103117
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
104118
uint64_t lwb_issued_txg; /* the txg when the write is issued */
119+
uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */
105120
uint64_t lwb_max_txg; /* highest txg in this lwb */
106121
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
107122
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */

module/zfs/zfs_vnops.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
839839
uint64_t zp_gen;
840840

841841
ASSERT3P(lwb, !=, NULL);
842-
ASSERT3P(zio, !=, NULL);
843842
ASSERT3U(size, !=, 0);
844843

845844
/*
@@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
889888
}
890889
ASSERT(error == 0 || error == ENOENT);
891890
} else { /* indirect write */
891+
ASSERT3P(zio, !=, NULL);
892892
/*
893893
* Have to lock the whole block to ensure when it's
894894
* written out and its checksum is being calculated

0 commit comments

Comments
 (0)