Skip to content

Commit f586c6e

Browse files
committed
Split dmu_zfetch() speculation and execution parts.
To make better predictions on parallel workloads, dmu_zfetch() should be called as early as possible to reduce possible request reordering. In particular, it should be called before dmu_buf_hold_array_by_dnode() calls dbuf_hold(), which may sleep waiting for indirect blocks, waking up multiple threads at the same time on completion, which can significantly reorder the requests, making the stream look random. But we should not issue prefetch requests before the on-demand ones, since they may get to the disks first despite the I/O scheduler, increasing on-demand request latency. This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare() and dmu_zfetch_run(). The first can be executed as early as needed. It only updates statistics and makes predictions without issuing any I/Os. The I/O issuance is handled by dmu_zfetch_run(), which can be called later when all on-demand I/Os are already issued. It even tracks the activity of other concurrent threads, issuing the prefetch only when _all_ on-demand requests are issued. For many years it was a big problem for storage servers handling deeper request queues from their clients, having to either serialize consecutive reads to make the ZFS prefetcher usable, or execute the incoming requests as-is and get almost no prefetch from ZFS, relying only on deep enough prefetch by the clients. The benefits of those approaches varied, but neither was perfect. With this patch, deeper-queue sequential read benchmarks with CrystalDiskMark from Windows via iSCSI to a FreeBSD target show me much better throughput with an almost 100% prefetcher hit rate, compared to almost zero before. While there, I also removed the per-stream zs_lock as useless, being completely covered by the parent zf_lock. Also, I reused the zs_blocks refcount to track the zf_stream linkage of the stream, since I believe the previous zs_fetch == NULL check in dmu_zfetch_stream_done() was racy. Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc.
1 parent 778fa36 commit f586c6e

File tree

3 files changed

+122
-84
lines changed

3 files changed

+122
-84
lines changed

include/sys/dmu_zfetch.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,19 +49,21 @@ typedef struct zfetch {
4949

5050
typedef struct zstream {
5151
uint64_t zs_blkid; /* expect next access at this blkid */
52-
uint64_t zs_pf_blkid; /* next block to prefetch */
52+
uint64_t zs_pf_blkid1; /* first block to prefetch */
53+
uint64_t zs_pf_blkid; /* block to prefetch up to */
5354

5455
/*
5556
* We will next prefetch the L1 indirect block of this level-0
5657
* block id.
5758
*/
58-
uint64_t zs_ipf_blkid;
59+
uint64_t zs_ipf_blkid1; /* first block to prefetch */
60+
uint64_t zs_ipf_blkid; /* block to prefetch up to */
5961

60-
kmutex_t zs_lock; /* protects stream */
62+
list_node_t zs_node; /* link for zf_stream */
6163
hrtime_t zs_atime; /* time last prefetch issued */
6264
hrtime_t zs_start_time; /* start of last prefetch */
63-
list_node_t zs_node; /* link for zf_stream */
6465
zfetch_t *zs_fetch; /* parent fetch */
66+
zfs_refcount_t zs_callers; /* number of pending callers */
6567
zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */
6668
} zstream_t;
6769

@@ -70,6 +72,9 @@ void zfetch_fini(void);
7072

7173
void dmu_zfetch_init(zfetch_t *, struct dnode *);
7274
void dmu_zfetch_fini(zfetch_t *);
75+
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
76+
boolean_t);
77+
void dmu_zfetch_run(zstream_t *, boolean_t);
7378
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
7479
boolean_t);
7580

module/zfs/dmu.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
497497
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
498498
{
499499
dmu_buf_t **dbp;
500+
zstream_t *zs = NULL;
500501
uint64_t blkid, nblks, i;
501502
uint32_t dbuf_flags;
502503
int err;
@@ -536,9 +537,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
536537
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
537538
ZIO_FLAG_CANFAIL);
538539
blkid = dbuf_whichblock(dn, 0, offset);
540+
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
541+
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
542+
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
543+
read && DNODE_IS_CACHEABLE(dn), B_TRUE);
544+
}
539545
for (i = 0; i < nblks; i++) {
540546
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
541547
if (db == NULL) {
548+
if (zs)
549+
dmu_zfetch_run(zs, B_TRUE);
542550
rw_exit(&dn->dn_struct_rwlock);
543551
dmu_buf_rele_array(dbp, nblks, tag);
544552
if (read)
@@ -555,11 +563,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
555563
if (!read)
556564
zfs_racct_write(length, nblks);
557565

558-
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
559-
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
560-
dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
561-
read && DNODE_IS_CACHEABLE(dn), B_TRUE);
562-
}
566+
if (zs)
567+
dmu_zfetch_run(zs, B_TRUE);
563568
rw_exit(&dn->dn_struct_rwlock);
564569

565570
if (read) {

module/zfs/dmu_zfetch.c

Lines changed: 103 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -129,26 +129,18 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
129129
static void
130130
dmu_zfetch_stream_fini(zstream_t *zs)
131131
{
132-
mutex_destroy(&zs->zs_lock);
133132
kmem_free(zs, sizeof (*zs));
134133
}
135134

136135
static void
137136
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
138-
{
139-
ASSERT(MUTEX_HELD(&zf->zf_lock));
140-
list_remove(&zf->zf_stream, zs);
141-
dmu_zfetch_stream_fini(zs);
142-
zf->zf_numstreams--;
143-
}
144-
145-
static void
146-
dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
147137
{
148138
ASSERT(MUTEX_HELD(&zf->zf_lock));
149139
list_remove(&zf->zf_stream, zs);
150140
zs->zs_fetch = NULL;
151141
zf->zf_numstreams--;
142+
if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
143+
dmu_zfetch_stream_fini(zs);
152144
}
153145

154146
/*
@@ -161,12 +153,8 @@ dmu_zfetch_fini(zfetch_t *zf)
161153
zstream_t *zs;
162154

163155
mutex_enter(&zf->zf_lock);
164-
while ((zs = list_head(&zf->zf_stream)) != NULL) {
165-
if (zfs_refcount_count(&zs->zs_blocks) != 0)
166-
dmu_zfetch_stream_orphan(zf, zs);
167-
else
168-
dmu_zfetch_stream_remove(zf, zs);
169-
}
156+
while ((zs = list_head(&zf->zf_stream)) != NULL)
157+
dmu_zfetch_stream_remove(zf, zs);
170158
mutex_exit(&zf->zf_lock);
171159
list_destroy(&zf->zf_stream);
172160
mutex_destroy(&zf->zf_lock);
@@ -195,9 +183,9 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
195183
zs != NULL; zs = zs_next) {
196184
zs_next = list_next(&zf->zf_stream, zs);
197185
/*
198-
* Skip gethrtime() call if there are still references
186+
* Skip if still active. 1 -- zf_stream reference.
199187
*/
200-
if (zfs_refcount_count(&zs->zs_blocks) != 0)
188+
if (zfs_refcount_count(&zs->zs_blocks) != 1)
201189
continue;
202190
if (((now - zs->zs_atime) / NANOSEC) >
203191
zfetch_min_sec_reap)
@@ -222,12 +210,16 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
222210

223211
zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
224212
zs->zs_blkid = blkid;
213+
zs->zs_pf_blkid1 = blkid;
225214
zs->zs_pf_blkid = blkid;
215+
zs->zs_ipf_blkid1 = blkid;
226216
zs->zs_ipf_blkid = blkid;
227217
zs->zs_atime = now;
228218
zs->zs_fetch = zf;
219+
zfs_refcount_create(&zs->zs_callers);
229220
zfs_refcount_create(&zs->zs_blocks);
230-
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
221+
/* One reference for zf_stream. */
222+
zfs_refcount_add(&zs->zs_blocks, NULL);
231223
zf->zf_numstreams++;
232224
list_insert_head(&zf->zf_stream, zs);
233225
}
@@ -247,13 +239,7 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
247239
ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
248240
}
249241

250-
if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
251-
return;
252-
253-
/*
254-
* The parent fetch structure has gone away
255-
*/
256-
if (zs->zs_fetch == NULL)
242+
if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
257243
dmu_zfetch_stream_fini(zs);
258244
}
259245

@@ -265,20 +251,20 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
265251
* FALSE -- prefetch only indirect blocks for predicted data blocks;
266252
* TRUE -- prefetch predicted data blocks plus following indirect blocks.
267253
*/
268-
void
269-
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
270-
boolean_t have_lock)
254+
zstream_t *
255+
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
256+
boolean_t fetch_data, boolean_t have_lock)
271257
{
272258
zstream_t *zs;
273-
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
259+
int64_t pf_start, ipf_start;
274260
int64_t pf_ahead_blks, max_blks;
275-
int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
261+
int max_dist_blks, pf_nblks, ipf_nblks;
276262
uint64_t end_of_access_blkid;
277263
end_of_access_blkid = blkid + nblks;
278264
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
279265

280266
if (zfs_prefetch_disable)
281-
return;
267+
return (NULL);
282268
/*
283269
* If we haven't yet loaded the indirect vdevs' mappings, we
284270
* can only read from blocks that we carefully ensure are on
@@ -287,14 +273,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
287273
* blocks (e.g. of the MOS's dnode object).
288274
*/
289275
if (!spa_indirect_vdevs_loaded(spa))
290-
return;
276+
return (NULL);
291277

292278
/*
293279
* As a fast path for small (single-block) files, ignore access
294280
* to the first block.
295281
*/
296282
if (!have_lock && blkid == 0)
297-
return;
283+
return (NULL);
298284

299285
if (!have_lock)
300286
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
@@ -306,7 +292,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
306292
if (zf->zf_dnode->dn_maxblkid < 2) {
307293
if (!have_lock)
308294
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
309-
return;
295+
return (NULL);
310296
}
311297
mutex_enter(&zf->zf_lock);
312298

@@ -317,30 +303,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
317303
*/
318304
for (zs = list_head(&zf->zf_stream); zs != NULL;
319305
zs = list_next(&zf->zf_stream, zs)) {
320-
if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
321-
mutex_enter(&zs->zs_lock);
322-
/*
323-
* zs_blkid could have changed before we
324-
* acquired zs_lock; re-check them here.
325-
*/
326-
if (blkid == zs->zs_blkid) {
327-
break;
328-
} else if (blkid + 1 == zs->zs_blkid) {
329-
blkid++;
330-
nblks--;
331-
if (nblks == 0) {
332-
/* Already prefetched this before. */
333-
mutex_exit(&zs->zs_lock);
334-
mutex_exit(&zf->zf_lock);
335-
if (!have_lock) {
336-
rw_exit(&zf->zf_dnode->
337-
dn_struct_rwlock);
338-
}
339-
return;
306+
if (blkid == zs->zs_blkid) {
307+
break;
308+
} else if (blkid + 1 == zs->zs_blkid) {
309+
blkid++;
310+
nblks--;
311+
if (nblks == 0) {
312+
/* Already prefetched this before. */
313+
mutex_exit(&zf->zf_lock);
314+
if (!have_lock) {
315+
rw_exit(&zf->zf_dnode->
316+
dn_struct_rwlock);
340317
}
341-
break;
318+
return (NULL);
342319
}
343-
mutex_exit(&zs->zs_lock);
320+
break;
344321
}
345322
}
346323

@@ -355,7 +332,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
355332
mutex_exit(&zf->zf_lock);
356333
if (!have_lock)
357334
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
358-
return;
335+
return (NULL);
359336
}
360337

361338
/*
@@ -369,6 +346,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
369346
* start just after the block we just accessed.
370347
*/
371348
pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
349+
if (zs->zs_pf_blkid1 < end_of_access_blkid)
350+
zs->zs_pf_blkid1 = end_of_access_blkid;
372351

373352
/*
374353
* Double our amount of prefetched data, but don't let the
@@ -398,6 +377,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
398377
* that point to them).
399378
*/
400379
ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
380+
if (zs->zs_ipf_blkid1 < zs->zs_pf_blkid)
381+
zs->zs_ipf_blkid1 = zs->zs_pf_blkid;
401382
max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
402383
/*
403384
* We want to double our distance ahead of the data prefetch
@@ -411,45 +392,92 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
411392
ipf_nblks = MIN(pf_ahead_blks, max_blks);
412393
zs->zs_ipf_blkid = ipf_start + ipf_nblks;
413394

414-
epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
415-
ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
416-
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
417-
418395
zs->zs_atime = gethrtime();
419-
/* no prior reads in progress */
420-
if (zfs_refcount_count(&zs->zs_blocks) == 0)
396+
/* Protect the stream from reclamation. 2 -- zf_stream + us. */
397+
if (zfs_refcount_add(&zs->zs_blocks, NULL) == 2)
421398
zs->zs_start_time = zs->zs_atime;
399+
/* Count concurrent callers. */
400+
zfs_refcount_add(&zs->zs_callers, NULL);
422401
zs->zs_blkid = end_of_access_blkid;
423-
zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
424-
NULL);
425-
mutex_exit(&zs->zs_lock);
426402
mutex_exit(&zf->zf_lock);
427-
issued = 0;
403+
404+
if (!have_lock)
405+
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
406+
407+
ZFETCHSTAT_BUMP(zfetchstat_hits);
408+
return (zs);
409+
}
410+
411+
void
412+
dmu_zfetch_run(zstream_t *zs, boolean_t have_lock)
413+
{
414+
zfetch_t *zf = zs->zs_fetch;
415+
int64_t pf_start, pf_end, ipf_start, ipf_end;
416+
int epbs, issued;
428417

429418
/*
430-
* dbuf_prefetch() is asynchronous (even when it needs to read
431-
* indirect blocks), but we still prefer to drop our locks before
432-
* calling it to reduce the time we hold them.
419+
* Postpone the prefetch if there are more concurrent callers.
420+
* It happens when multiple requests are waiting for the same
421+
* indirect block. The last one will run the prefetch for all.
433422
*/
423+
if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
424+
/* Drop reference taken in dmu_zfetch_prepare(). */
425+
VERIFY3S(zfs_refcount_remove(&zs->zs_blocks, NULL), >, 0);
426+
return;
427+
}
428+
429+
mutex_enter(&zf->zf_lock);
430+
pf_start = zs->zs_pf_blkid1;
431+
pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
432+
ipf_start = zs->zs_ipf_blkid1;
433+
ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
434+
mutex_exit(&zf->zf_lock);
435+
436+
if (!have_lock)
437+
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
434438

435-
for (int i = 0; i < pf_nblks; i++) {
436-
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
439+
epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
440+
ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
441+
ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
442+
issued = pf_end - pf_start + ipf_end - ipf_start;
443+
if (issued > 1) {
444+
/* More references on top of taken in dmu_zfetch_prepare(). */
445+
zfs_refcount_add_many(&zs->zs_blocks, issued - 1, NULL);
446+
} else if (issued == 0) {
447+
/* Some other thread has done our work, so drop the ref. */
448+
VERIFY3S(zfs_refcount_remove(&zs->zs_blocks, NULL), >, 0);
449+
}
450+
451+
issued = 0;
452+
for (int64_t blk = pf_start; blk < pf_end; blk++) {
453+
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
437454
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
438455
dmu_zfetch_stream_done, zs);
439456
}
440-
for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
457+
for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
441458
issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
442459
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
443460
dmu_zfetch_stream_done, zs);
444461
}
462+
445463
if (!have_lock)
446464
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
447-
ZFETCHSTAT_BUMP(zfetchstat_hits);
448465

449466
if (issued)
450467
ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
451468
}
452469

470+
void
471+
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
472+
boolean_t have_lock)
473+
{
474+
zstream_t *zs;
475+
476+
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
477+
if (zs)
478+
dmu_zfetch_run(zs, have_lock);
479+
}
480+
453481
/* BEGIN CSTYLED */
454482
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
455483
"Disable all ZFS prefetching");

0 commit comments

Comments
 (0)