Skip to content

Commit ed2f7ba

Browse files
authored
Implement uncached prefetch
Previously the primarycache property was handled only in the dbuf layer. Since the speculative prefetcher is implemented in the ARC, it had to be disabled for uncacheable buffers. This change gives the ARC knowledge about uncacheable buffers via arc_read() and arc_write(). So when remove_reference() drops the last reference on the ARC header, it can either immediately destroy it, or if it is marked as prefetch, put it into a new arc_uncached state. That state is scanned every second, evicting stale buffers that were not demand read. This change also tracks dbufs that were read from the beginning, but not to the end. It is assumed that such buffers may receive further reads, and so they are stored in the dbuf cache. If a subsequent read reaches the end of the buffer, the buffer is immediately evicted. Otherwise it follows regular dbuf cache eviction. Since the dbuf layer does not know actual file sizes, this logic is not applied to the final buffer of a dnode. Since uncacheable buffers should no longer stay in the ARC for long, this patch also tries to optimize I/O by allocating ARC physical buffers as linear to allow buffer sharing. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: George Wilson <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes #14243
1 parent c935fe2 commit ed2f7ba

File tree

11 files changed

+243
-139
lines changed

11 files changed

+243
-139
lines changed

include/os/linux/zfs/sys/trace_arc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
108108
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
109109
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
110110
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
111+
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__uncached);
111112
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
112113
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
113114
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
@@ -392,6 +393,7 @@ DEFINE_DTRACE_PROBE1(arc__evict);
392393
DEFINE_DTRACE_PROBE1(arc__delete);
393394
DEFINE_DTRACE_PROBE1(new_state__mru);
394395
DEFINE_DTRACE_PROBE1(new_state__mfu);
396+
DEFINE_DTRACE_PROBE1(new_state__uncached);
395397
DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync);
396398
DEFINE_DTRACE_PROBE1(l2arc__hit);
397399
DEFINE_DTRACE_PROBE1(l2arc__miss);

include/sys/arc.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ typedef enum arc_flags
115115
ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
116116
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
117117
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
118+
ARC_FLAG_UNCACHED = 1 << 5, /* evict after use */
118119
ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */
119120

120121
/*
@@ -228,6 +229,7 @@ typedef enum arc_state_type {
228229
ARC_STATE_MFU,
229230
ARC_STATE_MFU_GHOST,
230231
ARC_STATE_L2C_ONLY,
232+
ARC_STATE_UNCACHED,
231233
ARC_STATE_NUMTYPES
232234
} arc_state_type_t;
233235

@@ -301,8 +303,8 @@ int arc_referenced(arc_buf_t *buf);
301303
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
302304
arc_read_done_func_t *done, void *priv, zio_priority_t priority,
303305
int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
304-
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
305-
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
306+
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
307+
arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
306308
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
307309
arc_write_done_func_t *physdone, arc_write_done_func_t *done,
308310
void *priv, zio_priority_t priority, int zio_flags,

include/sys/arc_impl.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ extern "C" {
4646
* ARC_mru_ghost - recently used, no longer in cache
4747
* ARC_mfu - frequently used, currently cached
4848
* ARC_mfu_ghost - frequently used, no longer in cache
49+
* ARC_uncached - uncacheable prefetch, to be evicted
4950
* ARC_l2c_only - exists in L2ARC but not other states
5051
* When there are no active references to the buffer, they are
5152
* linked onto a list in one of these arc states. These are
@@ -542,6 +543,7 @@ typedef struct arc_stats {
542543
kstat_named_t arcstat_mru_ghost_hits;
543544
kstat_named_t arcstat_mfu_hits;
544545
kstat_named_t arcstat_mfu_ghost_hits;
546+
kstat_named_t arcstat_uncached_hits;
545547
kstat_named_t arcstat_deleted;
546548
/*
547549
* Number of buffers that could not be evicted because the hash lock
@@ -744,6 +746,21 @@ typedef struct arc_stats {
744746
* ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
745747
*/
746748
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
749+
/*
750+
* Total number of bytes that are going to be evicted from ARC due to
751+
* ARC_FLAG_UNCACHED being set.
752+
*/
753+
kstat_named_t arcstat_uncached_size;
754+
/*
755+
* Number of data bytes that are going to be evicted from ARC due to
756+
* ARC_FLAG_UNCACHED being set.
757+
*/
758+
kstat_named_t arcstat_uncached_evictable_data;
759+
/*
760+
* Number of metadata bytes that are going to be evicted from ARC
761+
* due to ARC_FLAG_UNCACHED being set.
762+
*/
763+
kstat_named_t arcstat_uncached_evictable_metadata;
747764
kstat_named_t arcstat_l2_hits;
748765
kstat_named_t arcstat_l2_misses;
749766
/*
@@ -900,6 +917,7 @@ typedef struct arc_sums {
900917
wmsum_t arcstat_mru_ghost_hits;
901918
wmsum_t arcstat_mfu_hits;
902919
wmsum_t arcstat_mfu_ghost_hits;
920+
wmsum_t arcstat_uncached_hits;
903921
wmsum_t arcstat_deleted;
904922
wmsum_t arcstat_mutex_miss;
905923
wmsum_t arcstat_access_skip;
@@ -1006,6 +1024,7 @@ typedef struct arc_evict_waiter {
10061024
#define arc_mfu (&ARC_mfu)
10071025
#define arc_mfu_ghost (&ARC_mfu_ghost)
10081026
#define arc_l2c_only (&ARC_l2c_only)
1027+
#define arc_uncached (&ARC_uncached)
10091028

10101029
extern taskq_t *arc_prune_taskq;
10111030
extern arc_stats_t arc_stats;

include/sys/dbuf.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ extern "C" {
5555
#define DB_RF_NEVERWAIT (1 << 4)
5656
#define DB_RF_CACHED (1 << 5)
5757
#define DB_RF_NO_DECRYPT (1 << 6)
58+
#define DB_RF_PARTIAL_FIRST (1 << 7)
59+
#define DB_RF_PARTIAL_MORE (1 << 8)
5860

5961
/*
6062
* The simplified state transition diagram for dbufs looks like:
@@ -321,6 +323,9 @@ typedef struct dmu_buf_impl {
321323
uint8_t db_pending_evict;
322324

323325
uint8_t db_dirtycnt;
326+
327+
/* The buffer was partially read. More reads may follow. */
328+
uint8_t db_partial_read;
324329
} dmu_buf_impl_t;
325330

326331
#define DBUF_HASH_MUTEX(h, idx) \

include/sys/dnode.h

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -457,15 +457,11 @@ void dnode_free_interior_slots(dnode_t *dn);
457457
#define DNODE_IS_DIRTY(_dn) \
458458
((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
459459

460-
#define DNODE_IS_CACHEABLE(_dn) \
460+
#define DNODE_LEVEL_IS_CACHEABLE(_dn, _level) \
461461
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
462-
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
462+
(((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) && \
463463
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
464464

465-
#define DNODE_META_IS_CACHEABLE(_dn) \
466-
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
467-
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
468-
469465
/*
470466
* Used for dnodestats kstat.
471467
*/

module/os/freebsd/zfs/sysctl_os.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,10 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
366366
&ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
367367
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
368368
&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
369-
"size of anonymous state");
369+
"size of metadata in anonymous state");
370370
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
371371
&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
372-
"size of anonymous state");
372+
"size of data in anonymous state");
373373
/* END CSTYLED */
374374

375375
extern arc_state_t ARC_mru;
@@ -424,6 +424,19 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
424424
"size of data in mfu ghost state");
425425
/* END CSTYLED */
426426

427+
extern arc_state_t ARC_uncached;
428+
429+
/* BEGIN CSTYLED */
430+
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD,
431+
&ARC_uncached.arcs_size.rc_count, 0, "size of uncached state");
432+
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
433+
&ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
434+
"size of metadata in uncached state");
435+
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
436+
&ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
437+
"size of data in uncached state");
438+
/* END CSTYLED */
439+
427440
extern arc_state_t ARC_l2c_only;
428441

429442
/* BEGIN CSTYLED */

0 commit comments

Comments
 (0)