@@ -129,26 +129,18 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 static void
 dmu_zfetch_stream_fini(zstream_t *zs)
 {
-	mutex_destroy(&zs->zs_lock);
 	kmem_free(zs, sizeof (*zs));
 }
 
 static void
 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
-	ASSERT(MUTEX_HELD(&zf->zf_lock));
-	list_remove(&zf->zf_stream, zs);
-	dmu_zfetch_stream_fini(zs);
-	zf->zf_numstreams--;
-}
-
-static void
-dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
 {
 	ASSERT(MUTEX_HELD(&zf->zf_lock));
 	list_remove(&zf->zf_stream, zs);
 	zs->zs_fetch = NULL;
 	zf->zf_numstreams--;
+	if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
+		dmu_zfetch_stream_fini(zs);
 }
 
 /*
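The merged remove path works because stream lifetime is now governed by zs_blocks alone: the zf_stream list holds one reference, each in-flight prefetch holds another, and whoever drops the last one frees the stream. A minimal sketch of that rule, assuming (as the hunk above does) that zfs_refcount_remove() returns the count remaining after the drop; zstream_rele() is a hypothetical helper, not part of the patch:

/*
 * Hypothetical helper illustrating the ownership rule used above:
 * whichever party drops the last zs_blocks reference frees the
 * stream, whether that is dmu_zfetch_stream_remove() or the last
 * completing prefetch callback.
 */
static void
zstream_rele(zstream_t *zs)
{
	if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}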
@@ -161,12 +153,8 @@ dmu_zfetch_fini(zfetch_t *zf)
 	zstream_t *zs;
 
 	mutex_enter(&zf->zf_lock);
-	while ((zs = list_head(&zf->zf_stream)) != NULL) {
-		if (zfs_refcount_count(&zs->zs_blocks) != 0)
-			dmu_zfetch_stream_orphan(zf, zs);
-		else
-			dmu_zfetch_stream_remove(zf, zs);
-	}
+	while ((zs = list_head(&zf->zf_stream)) != NULL)
+		dmu_zfetch_stream_remove(zf, zs);
 	mutex_exit(&zf->zf_lock);
 	list_destroy(&zf->zf_stream);
 	mutex_destroy(&zf->zf_lock);
@@ -195,9 +183,9 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	    zs != NULL; zs = zs_next) {
 		zs_next = list_next(&zf->zf_stream, zs);
 		/*
-		 * Skip gethrtime() call if there are still references
+		 * Skip if still active.  1 -- zf_stream reference.
 		 */
-		if (zfs_refcount_count(&zs->zs_blocks) != 0)
+		if (zfs_refcount_count(&zs->zs_blocks) != 1)
 			continue;
 		if (((now - zs->zs_atime) / NANOSEC) >
 		    zfetch_min_sec_reap)
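Since every stream now carries one permanent reference for its zf_stream list membership, a zs_blocks count of exactly 1 means no prefetch I/O is outstanding and no caller is using the stream, so only such idle streams are reaping candidates. A sketch of that predicate, assuming zfs_refcount_count() returns the current count (zstream_is_idle() is a hypothetical name):

/*
 * Hypothetical predicate: "idle" means only the zf_stream list's
 * own reference remains, so the stream may safely be reaped.
 */
static boolean_t
zstream_is_idle(zstream_t *zs)
{
	return (zfs_refcount_count(&zs->zs_blocks) == 1);
}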
@@ -222,12 +210,16 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 
 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
 	zs->zs_blkid = blkid;
+	zs->zs_pf_blkid1 = blkid;
 	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid1 = blkid;
 	zs->zs_ipf_blkid = blkid;
 	zs->zs_atime = now;
 	zs->zs_fetch = zf;
+	zfs_refcount_create(&zs->zs_callers);
 	zfs_refcount_create(&zs->zs_blocks);
-	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+	/* One reference for zf_stream. */
+	zfs_refcount_add(&zs->zs_blocks, NULL);
 	zf->zf_numstreams++;
 	list_insert_head(&zf->zf_stream, zs);
 }
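The new fields form two cursor pairs: zs_pf_blkid/zs_ipf_blkid record how far prediction has advanced, while zs_pf_blkid1/zs_ipf_blkid1 trail them and record how far I/O has actually been issued; both pairs start at the first accessed block. A sketch of the invariant this initialization establishes (the checking helper is illustrative, not part of the patch):

/*
 * Hypothetical debug check (not in the patch): under zf_lock the
 * "issued" cursors zs_pf_blkid1/zs_ipf_blkid1 never run ahead of
 * the "predicted" cursors zs_pf_blkid/zs_ipf_blkid.
 * dmu_zfetch_prepare() advances the predicted pair;
 * dmu_zfetch_run() later catches the issued pair up to them.
 */
static void
zstream_check_cursors(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(MUTEX_HELD(&zf->zf_lock));
	ASSERT3U(zs->zs_pf_blkid1, <=, zs->zs_pf_blkid);
	ASSERT3U(zs->zs_ipf_blkid1, <=, zs->zs_ipf_blkid);
}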
@@ -247,13 +239,7 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
 		ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
 	}
 
-	if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
-		return;
-
-	/*
-	 * The parent fetch structure has gone away
-	 */
-	if (zs->zs_fetch == NULL)
+	if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
 		dmu_zfetch_stream_fini(zs);
 }
 
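Each dbuf_prefetch_impl() call below passes dmu_zfetch_stream_done() as its completion callback, and the callback fires exactly once per call whether or not an I/O was really issued (io_issued only affects the timing statistics). That one-to-one pairing is what balances the zs_blocks references taken in dmu_zfetch_run(). A sketch of one issue/complete pair under that assumption (issue_one() is a hypothetical wrapper):

/*
 * Hypothetical wrapper showing the reference pairing assumed here:
 * the stream already holds one zs_blocks reference for this block,
 * and dmu_zfetch_stream_done() will drop it exactly once, whether
 * or not dbuf_prefetch_impl() actually issued an I/O.
 */
static void
issue_one(zfetch_t *zf, int64_t blk, zstream_t *zs)
{
	(void) dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
	    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
	    dmu_zfetch_stream_done, zs);
}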
@@ -265,20 +251,20 @@ dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
  * FALSE -- prefetch only indirect blocks for predicted data blocks;
  * TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
-void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
-    boolean_t have_lock)
+zstream_t *
+dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
+    boolean_t fetch_data, boolean_t have_lock)
 {
 	zstream_t *zs;
-	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_start, ipf_start;
 	int64_t pf_ahead_blks, max_blks;
-	int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
+	int max_dist_blks, pf_nblks, ipf_nblks;
 	uint64_t end_of_access_blkid;
 	end_of_access_blkid = blkid + nblks;
 	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
 
 	if (zfs_prefetch_disable)
-		return;
+		return (NULL);
 	/*
 	 * If we haven't yet loaded the indirect vdevs' mappings, we
 	 * can only read from blocks that we carefully ensure are on
@@ -287,14 +273,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * blocks (e.g. of the MOS's dnode object).
 	 */
 	if (!spa_indirect_vdevs_loaded(spa))
-		return;
+		return (NULL);
 
 	/*
 	 * As a fast path for small (single-block) files, ignore access
 	 * to the first block.
 	 */
 	if (!have_lock && blkid == 0)
-		return;
+		return (NULL);
 
 	if (!have_lock)
 		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
@@ -306,7 +292,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	if (zf->zf_dnode->dn_maxblkid < 2) {
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return;
+		return (NULL);
 	}
 	mutex_enter(&zf->zf_lock);
 
@@ -317,30 +303,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 */
 	for (zs = list_head(&zf->zf_stream); zs != NULL;
 	    zs = list_next(&zf->zf_stream, zs)) {
-		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
-			mutex_enter(&zs->zs_lock);
-			/*
-			 * zs_blkid could have changed before we
-			 * acquired zs_lock; re-check them here.
-			 */
-			if (blkid == zs->zs_blkid) {
-				break;
-			} else if (blkid + 1 == zs->zs_blkid) {
-				blkid++;
-				nblks--;
-				if (nblks == 0) {
-					/* Already prefetched this before. */
-					mutex_exit(&zs->zs_lock);
-					mutex_exit(&zf->zf_lock);
-					if (!have_lock) {
-						rw_exit(&zf->zf_dnode->
-						    dn_struct_rwlock);
-					}
-					return;
+		if (blkid == zs->zs_blkid) {
+			break;
+		} else if (blkid + 1 == zs->zs_blkid) {
+			blkid++;
+			nblks--;
+			if (nblks == 0) {
+				/* Already prefetched this before. */
+				mutex_exit(&zf->zf_lock);
+				if (!have_lock) {
+					rw_exit(&zf->zf_dnode->
+					    dn_struct_rwlock);
 				}
-				break;
+				return (NULL);
 			}
-			mutex_exit(&zs->zs_lock);
+			break;
 		}
 	}
 
@@ -355,7 +332,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 		mutex_exit(&zf->zf_lock);
 		if (!have_lock)
 			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return;
+		return (NULL);
 	}
 
 	/*
@@ -369,6 +346,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * start just after the block we just accessed.
 	 */
 	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+	if (zs->zs_pf_blkid1 < end_of_access_blkid)
+		zs->zs_pf_blkid1 = end_of_access_blkid;
 
 	/*
 	 * Double our amount of prefetched data, but don't let the
@@ -398,6 +377,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	 * that point to them).
 	 */
 	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+	if (zs->zs_ipf_blkid1 < zs->zs_pf_blkid)
+		zs->zs_ipf_blkid1 = zs->zs_pf_blkid;
 	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
 	/*
 	 * We want to double our distance ahead of the data prefetch
@@ -411,45 +392,92 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
 	ipf_nblks = MIN(pf_ahead_blks, max_blks);
 	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
 
-	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
-	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
-	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
-
 	zs->zs_atime = gethrtime();
-	/* no prior reads in progress */
-	if (zfs_refcount_count(&zs->zs_blocks) == 0)
+	/* Protect the stream from reclamation.  2 -- zf_stream + us. */
+	if (zfs_refcount_add(&zs->zs_blocks, NULL) == 2)
 		zs->zs_start_time = zs->zs_atime;
+	/* Count concurrent callers. */
+	zfs_refcount_add(&zs->zs_callers, NULL);
 	zs->zs_blkid = end_of_access_blkid;
-	zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
-	    NULL);
-	mutex_exit(&zs->zs_lock);
 	mutex_exit(&zf->zf_lock);
-	issued = 0;
+
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+	return (zs);
+}
+
+void
+dmu_zfetch_run(zstream_t *zs, boolean_t have_lock)
+{
+	zfetch_t *zf = zs->zs_fetch;
+	int64_t pf_start, pf_end, ipf_start, ipf_end;
+	int epbs, issued;
 
 	/*
-	 * dbuf_prefetch() is asynchronous (even when it needs to read
-	 * indirect blocks), but we still prefer to drop our locks before
-	 * calling it to reduce the time we hold them.
+	 * Postpone the prefetch if there are more concurrent callers.
+	 * It happens when multiple requests are waiting for the same
+	 * indirect block.  The last one will run the prefetch for all.
 	 */
+	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
+		/* Drop reference taken in dmu_zfetch_prepare(). */
+		VERIFY3S(zfs_refcount_remove(&zs->zs_blocks, NULL), >, 0);
+		return;
+	}
+
+	mutex_enter(&zf->zf_lock);
+	pf_start = zs->zs_pf_blkid1;
+	pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
+	ipf_start = zs->zs_ipf_blkid1;
+	ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
+	mutex_exit(&zf->zf_lock);
+
+	if (!have_lock)
+		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
 
-	for (int i = 0; i < pf_nblks; i++) {
-		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
+	issued = pf_end - pf_start + ipf_end - ipf_start;
+	if (issued > 1) {
+		/* More references on top of the one taken in prepare(). */
+		zfs_refcount_add_many(&zs->zs_blocks, issued - 1, NULL);
+	} else if (issued == 0) {
+		/* Some other thread has done our work, so drop the ref. */
+		VERIFY3S(zfs_refcount_remove(&zs->zs_blocks, NULL), >, 0);
+	}
+
+	issued = 0;
+	for (int64_t blk = pf_start; blk < pf_end; blk++) {
+		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
 		    dmu_zfetch_stream_done, zs);
 	}
-	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
 		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
 		    dmu_zfetch_stream_done, zs);
 	}
+
 	if (!have_lock)
 		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
 
 	if (issued)
 		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
 }
 
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+    boolean_t have_lock)
+{
+	zstream_t *zs;
+
+	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
+	if (zs)
+		dmu_zfetch_run(zs, have_lock);
+}
+
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
 	"Disable all ZFS prefetching");