Commit a5bbbf9

zfs_putpage: handle page writeback errors
Page writeback is considered completed when the associated itx callback completes. A syncing writeback will receive the error in its callback directly, but an in-flight async writeback that was promoted to sync by the ZIL may also receive an error.

Writeback errors, even syncing writeback errors, are not especially serious on their own, because the error will ultimately be returned to the zil_commit() caller: either to zfs_fsync() for an explicit sync op (e.g. msync()), or to zfs_putpage() itself for a syncing (WB_SYNC_ALL) writeback (kernel housekeeping or sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER)).

The only thing we need to do when a page writeback fails is to re-mark the page dirty, since we don't know whether it made it to disk. This ensures that it gets written out again in the future, either by some scheduled async writeback or by another explicit syncing call.

On the other side, we need to make sure that if a syncing op arrives, any changes on dirty pages are written back to the DMU and/or the ZIL first. We do this by starting an _async_ (WB_SYNC_NONE) writeback on the file mapping at the start of the sync op (fsync(), msync(), etc). An async op will get an async itx created and logged, ready for the followup zfs_fsync()->zil_commit() to find, while avoiding a zil_commit() call for every page in the range.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <[email protected]>
1 parent 5361b28 commit a5bbbf9
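
For context, and not part of the change itself: a minimal userspace sketch of the explicit sync ops the commit message names, msync() and fsync(), which reach ZFS through zfs_fsync() and so zil_commit(). The path /tank/file is hypothetical; the point is only where a failed page writeback becomes visible to an application.

/*
 * Caller-side sketch (assumes /tank/file is on a ZFS dataset; error
 * handling is reduced to perror() for brevity).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("/tank/file", O_RDWR | O_CREAT, 0644);
        if (fd < 0 || ftruncate(fd, 4096) != 0) {
                perror("open/ftruncate");
                return (1);
        }

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }

        /* Dirty a page through the mapping. */
        memcpy(p, "hello", 5);

        /*
         * Explicit sync ops: a writeback error is returned here. On
         * failure the page stays dirty, so a later async writeback or
         * another syncing call will retry it.
         */
        if (msync(p, 4096, MS_SYNC) != 0)
                perror("msync");
        if (fsync(fd) != 0)
                perror("fsync");

        munmap(p, 4096);
        close(fd);
        return (0);
}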

2 files changed: +88, -32 lines

module/os/linux/zfs/zfs_vnops_os.c (57 additions, 28 deletions)

@@ -3684,24 +3684,49 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
         return (error);
 }

-static void
-zfs_putpage_sync_commit_cb(void *arg)
+/* Finish page writeback. */
+static inline void
+zfs_page_writeback_done(struct page *pp, int err, boolean_t for_sync)
 {
-        struct page *pp = arg;
+        if (err != 0) {
+                /*
+                 * Writeback failed. Re-dirty the page. It was undirtied before
+                 * the IO was issued (in zfs_putpage() or write_cache_pages()).
+                 * The kernel only considers writeback for dirty pages; if we
+                 * don't do this, it is eligible for eviction without being
+                 * written out, which we definitely don't want.
+                 */
+#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
+                filemap_dirty_folio(page_mapping(pp), page_folio(pp));
+#else
+                __set_page_dirty_nobuffers(pp);
+#endif
+        }

         ClearPageError(pp);
+
         end_page_writeback(pp);
+
+        if (!for_sync) {
+                znode_t *zp = ITOZ(pp->mapping->host);
+                atomic_dec_32(&zp->z_async_writes_cnt);
+        }
 }

+/*
+ * These callbacks are passed to zfs_log_write() in zfs_putpage(), and are
+ * called when the ZIL itx has been written to the log, or if the ZIL crashes
+ * or the pool suspends. Any failure is passed as `err`.
+ */
 static void
-zfs_putpage_async_commit_cb(void *arg)
+zfs_putpage_sync_commit_cb(void *arg, int err)
 {
-        struct page *pp = arg;
-        znode_t *zp = ITOZ(pp->mapping->host);
-
-        ClearPageError(pp);
-        end_page_writeback(pp);
-        atomic_dec_32(&zp->z_async_writes_cnt);
+        zfs_page_writeback_done(arg, err, B_TRUE);
+}
+static void
+zfs_putpage_async_commit_cb(void *arg, int err)
+{
+        zfs_page_writeback_done(arg, err, B_FALSE);
 }

 /*
@@ -3877,18 +3902,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
         err = dmu_tx_assign(tx, DMU_TX_WAIT);
         if (err != 0) {
                 dmu_tx_abort(tx);
-#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
-                filemap_dirty_folio(page_mapping(pp), page_folio(pp));
-#else
-                __set_page_dirty_nobuffers(pp);
-#endif
-                ClearPageError(pp);
-                end_page_writeback(pp);
-                if (!for_sync)
-                        atomic_dec_32(&zp->z_async_writes_cnt);
+                zfs_page_writeback_done(pp, err, for_sync);
                 zfs_rangelock_exit(lr);
                 zfs_exit(zfsvfs, FTAG);
-                return (err);
+
+                /*
+                 * Don't return error for an async writeback; we've re-dirtied
+                 * the page so it will be tried again some other time.
+                 */
+                return (wbc->sync_mode != WB_SYNC_NONE ? err : 0);
         }

         va = kmap(pp);
@@ -3911,14 +3933,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,

         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

-        boolean_t commit = B_FALSE;
+        enum { NONE, ASYNC, SYNC } commit = NONE;
         if (wbc->sync_mode != WB_SYNC_NONE) {
                 /*
                  * Note that this is rarely called under writepages(), because
                  * writepages() normally handles the entire commit for
                  * performance reasons.
                  */
-                commit = B_TRUE;
+                commit = SYNC;
         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
                 /*
                  * If the caller does not intend to wait synchronously
@@ -3928,23 +3950,30 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
                  * our writeback to complete. Refer to the comment in
                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
                  */
-                commit = B_TRUE;
+                commit = ASYNC;
         }

-        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
-            B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
+        zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen,
+            commit != NONE, B_FALSE,
+            for_sync ? zfs_putpage_sync_commit_cb :
             zfs_putpage_async_commit_cb, pp);

         dmu_tx_commit(tx);
-
         zfs_rangelock_exit(lr);

-        if (commit) {
+        if (commit != NONE) {
+                /*
+                 * If this is a sync write, or a sync write is in progress,
+                 * force this out now. However, if it was an async write
+                 * while a sync write was in progress, ignore the error here,
+                 * since no one actually asked for it.
+                 */
                 err = zil_commit(zfsvfs->z_log, zp->z_id);
-                if (err != 0) {
+                if (err != 0 && commit == SYNC) {
                         zfs_exit(zfsvfs, FTAG);
                         return (err);
                 }
+                err = 0;
         }

         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
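
With this change, zfs_putpage() only returns an error for a syncing (WB_SYNC_ALL) writeback; an async writeback just leaves the page dirty for a later attempt. The commit message names sync_file_range(SYNC_FILE_RANGE_WAIT_AFTER) as one caller of that syncing path; here is a hypothetical caller-side sketch (again assuming a file on a ZFS dataset, not part of the change).

#define _GNU_SOURCE             /* sync_file_range() is Linux-specific */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("/tank/file", O_WRONLY | O_CREAT, 0644);
        if (fd < 0) {
                perror("open");
                return (1);
        }

        if (write(fd, "data", 4) != 4)
                perror("write");

        /*
         * Write out the dirty range and wait for it. If the page
         * writeback fails, ZFS re-dirties the page, so the data is not
         * lost; a later writeback or fsync() will try it again.
         */
        if (sync_file_range(fd, 0, 4,
            SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
            SYNC_FILE_RANGE_WAIT_AFTER) != 0)
                perror("sync_file_range");

        close(fd);
        return (0);
}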

module/os/linux/zfs/zpl_file.c (31 additions, 4 deletions)

@@ -109,6 +109,10 @@ zpl_iterate(struct file *filp, struct dir_context *ctx)
         return (error);
 }

+static inline int
+zpl_write_cache_pages(struct address_space *mapping,
+    struct writeback_control *wbc, void *data);
+
 static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
@@ -151,7 +155,33 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
                 zpl_exit(zfsvfs, FTAG);
         }

-        error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+        /*
+         * Force dirty pages in the range out to the DMU and the log. This may
+         * end up calling zil_commit(), which is fine; there will just be very
+         * little for zfs_fsync() to do below. If the page writeback fails, the
+         * zfs_putpage() callbacks will keep the page dirty.
+         *
+         * No matter what happens here, we always call zfs_fsync() and so
+         * zil_commit(). The only way the writeback can fail is if the ZIL
+         * itself has already crashed because the pool suspended, and so it
+         * will return error below. Thus, we don't need to ever track writeback
+         * errors on the mapping (or in page flags in older kernels), and so
+         * this call is guaranteed to return 0.
+         *
+         * We call write_cache_pages() directly to ensure that zpl_putpage() is
+         * called with the flags we need. We need WB_SYNC_NONE so that we don't
+         * count these as syncing writes and so fall back to zil_commit()
+         * (since we're doing this as a kind of pre-sync); but we do need
+         * for_sync so we can avoid bumping z_async_writes_cnt as we go.
+         */
+        int for_sync = 1;
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_NONE,
+                .nr_to_write = LONG_MAX,
+                .range_start = start,
+                .range_end = end,
+        };
+        VERIFY0(zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync));

         /*
          * The sync write is not complete yet but we decrement
@@ -164,9 +194,6 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
          */
         atomic_dec_32(&zp->z_sync_writes_cnt);

-        if (error)
-                return (error);
-
         crhold(cr);
         cookie = spl_fstrans_mark();
         error = -zfs_fsync(zp, datasync, cr);
