Commit a5bbbf9

zfs_putpage: handle page writeback errors
Page writeback is considered completed when the associated itx callback completes. A syncing writeback will receive the error in its callback directly, but an in-flight async writeback that was promoted to sync by the ZIL may also receive an error.

Writeback errors, even syncing writeback errors, are not especially serious on their own, because the error will ultimately be returned to the zil_commit() caller: either to zfs_fsync() for an explicit sync op (e.g. msync()), or to zfs_putpage() itself for a syncing (WB_SYNC_ALL) writeback (kernel housekeeping or sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER)).

The only thing we need to do when a page writeback fails is to re-mark the page dirty, since we don't know whether it made it to disk. This ensures that it gets written out again in the future, either by some scheduled async writeback or by another explicit syncing call.

On the other side, we need to make sure that if a syncing op arrives, any changes on dirty pages are written back to the DMU and/or the ZIL first. We do this by starting an _async_ (WB_SYNC_NONE) writeback on the file mapping at the start of the sync op (fsync(), msync(), etc). An async op will get an async itx created and logged, ready for the followup zfs_fsync()->zil_commit() to find, while avoiding a zil_commit() call for every page in the range.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <[email protected]>
1 parent 5361b28 commit a5bbbf9
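
For context, and not part of the change itself: a minimal userspace sketch of the explicit sync ops the commit message names, msync() and fsync(), which reach ZFS through zfs_fsync() and so zil_commit(). The path /tank/file is hypothetical; the point is only where a failed page writeback becomes visible to an application.

/*
 * Caller-side sketch (assumes /tank/file is on a ZFS dataset; error
 * handling is reduced to perror() for brevity).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("/tank/file", O_RDWR | O_CREAT, 0644);
        if (fd < 0 || ftruncate(fd, 4096) != 0) {
                perror("open/ftruncate");
                return (1);
        }

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }

        /* Dirty a page through the mapping. */
        memcpy(p, "hello", 5);

        /*
         * Explicit sync ops: a writeback error is returned here. On
         * failure the page stays dirty, so a later async writeback or
         * another syncing call will retry it.
         */
        if (msync(p, 4096, MS_SYNC) != 0)
                perror("msync");
        if (fsync(fd) != 0)
                perror("fsync");

        munmap(p, 4096);
        close(fd);
        return (0);
}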

2 files changed: +88, -32 lines

module/os/linux/zfs/zfs_vnops_os.c (57 additions, 28 deletions)

@@ -3684,24 +3684,49 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
         return (error);
 }

-static void
-zfs_putpage_sync_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err, boolean_t for_sync)
 {
-        struct page *pp = arg;
+        if (err != 0) {
+                /*
+                 * Writeback failed. Re-dirty the page. It was undirtied before
+                 * the IO was issued (in zfs_putpage() or write_cache_pages()).
+                 * The kernel only considers writeback for dirty pages; if we
+                 * don't do this, it is eligible for eviction without being
+                 * written out, which we definitely don't want.
+                 */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+                filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+                __set_page_dirty_nobuffers(pp);
+#endif
+        }

         ClearPageError(pp);
+
         end_page_writeback(pp);
+
+        if (!for_sync) {
+                znode_t *zp = ITOZ(pp->mapping->host);
+                atomic_dec_32(&zp->z_async_writes_cnt);
+        }
 }

+/*
+ * These callbacks are passed to zfs_log_write() in zfs_putpage(), and are
+ * called when the ZIL itx has been written to the log, or if the ZIL crashes
+ * or the pool suspends. Any failure is passed as `err`.
+ */
 static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_sync_commit_cb(void *arg, int err)
 {
-        struct page *pp = arg;
-        znode_t *zp = ITOZ(pp->mapping->host);
-
-        ClearPageError(pp);
-        end_page_writeback(pp);
-        atomic_dec_32(&zp->z_async_writes_cnt);
+        zfs_page_writeback_done(arg, err, B_TRUE);
+}
+static void
+zfs_putpage_async_commit_cb(void *arg, int err)
+{
+        zfs_page_writeback_done(arg, err, B_FALSE);
 }

 /*
@@ -3877,18 +3902,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
         err = dmu_tx_assign(tx, DMU_TX_WAIT);
         if (err != 0) {
                 dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
-                filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
-                __set_page_dirty_nobuffers(pp);
-#endif
-                ClearPageError(pp);
-                end_page_writeback(pp);
-                if (!for_sync)
-                        atomic_dec_32(&zp->z_async_writes_cnt);
+                zfs_page_writeback_done(pp, err, for_sync);
                 zfs_rangelock_exit(lr);
                 zfs_exit(zfsvfs, FTAG);
-                return (err);
+
+                /*
+                 * Don't return error for an async writeback; we've re-dirtied
+                 * the page so it will be tried again some other time.
+                 */
+                return (wbc->sync_mode != WB_SYNC_NONE ? err : 0);
         }

         va = kmap(pp);
@@ -3911,14 +3933,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,

         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

-        boolean_t commit = B_FALSE;
+        enum { NONE, ASYNC, SYNC } commit = NONE;
         if (wbc->sync_mode != WB_SYNC_NONE) {
                 /*
                  * Note that this is rarely called under writepages(), because
                  * writepages() normally handles the entire commit for
                  * performance reasons.
                  */
-                commit = B_TRUE;
+                commit = SYNC;
         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
                 /*
                  * If the caller does not intend to wait synchronously
@@ -3928,23 +3950,30 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
                  * our writeback to complete. Refer to the comment in
                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
                  */
-                commit = B_TRUE;
+                commit = ASYNC;
         }

-        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
-            B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
+        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen,
+            commit != NONE, B_FALSE,
+            for_sync ? zfs_putpage_sync_commit_cb :
             zfs_putpage_async_commit_cb, pp);

         dmu_tx_commit(tx);
-
         zfs_rangelock_exit(lr);

-        if (commit) {
+        if (commit != NONE) {
+                /*
+                 * If this is a sync write, or a sync write is in progress,
+                 * force this out now. However, if it was an async write
+                 * while a sync write was in progress, ignore the error here,
+                 * since no one actually asked for it.
+                 */
                 err = zil_commit(zfsvfs->z_log, zp->z_id);
-                if (err != 0) {
+                if (err != 0 && commit == SYNC) {
                         zfs_exit(zfsvfs, FTAG);
                         return (err);
                 }
+                err = 0;
         }

         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
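
With this change, zfs_putpage() only returns an error for a syncing (WB_SYNC_ALL) writeback; an async writeback just leaves the page dirty for a later attempt. The commit message names sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER) as one caller of that syncing path; here is a hypothetical caller-side sketch (again assuming a file on a ZFS dataset, not part of the change).

#define _GNU_SOURCE             /* sync_file_range() is Linux-specific */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("/tank/file", O_WRONLY | O_CREAT, 0644);
        if (fd < 0) {
                perror("open");
                return (1);
        }

        if (write(fd, "data", 4) != 4)
                perror("write");

        /*
         * Write out the dirty range and wait for it. If the page
         * writeback fails, ZFS re-dirties the page, so the data is not
         * lost; a later writeback or fsync() will try it again.
         */
        if (sync_file_range(fd, 0, 4,
            SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
            SYNC_FILE_RANGE_WAIT_AFTER) != 0)
                perror("sync_file_range");

        close(fd);
        return (0);
}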

module/os/linux/zfs/zpl_file.c (31 additions, 4 deletions)

@@ -109,6 +109,10 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
         return (error);
 }

+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data);
+
 static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
@@ -151,7 +155,33 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
                 zpl_exit(zfsvfs, FTAG);
         }

-        error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+        /*
+         * Force dirty pages in the range out to the DMU and the log. This may
+         * end up calling zil_commit(), which is fine; there will just be very
+         * little for zfs_fsync() to do below. If the page writeback fails, the
+         * zfs_putpage() callbacks will keep the page dirty.
+         *
+         * No matter what happens here, we always call zfs_fsync() and so
+         * zil_commit(). The only way the writeback can fail is if the ZIL
+         * itself has already crashed because the pool suspended, and so it
+         * will return error below. Thus, we don't need to ever track writeback
+         * errors on the mapping (or in page flags in older kernels), and so
+         * this call is guaranteed to return 0.
+         *
+         * We call write_cache_pages() directly to ensure that zpl_putpage() is
+         * called with the flags we need. We need WB_SYNC_NONE so that we don't
+         * count these as syncing writes and so fall back to zil_commit()
+         * (since we're doing this as a kind of pre-sync); but we do need
+         * for_sync so we can avoid bumping z_async_writes_cnt as we go.
+         */
+        int for_sync = 1;
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_NONE,
+                .nr_to_write = LONG_MAX,
+                .range_start = start,
+                .range_end = end,
+        };
+        VERIFY0(zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync));

         /*
          * The sync write is not complete yet but we decrement
@@ -164,9 +194,6 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
          */
         atomic_dec_32(&zp->z_sync_writes_cnt);

-        if (error)
-                return (error);
-
         crhold(cr);
         cookie = spl_fstrans_mark();
         error = -zfs_fsync(zp, datasync, cr);
