Skip to content

Implement parallel ARC eviction #16486

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 36 additions & 4 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <[email protected]>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
Expand All @@ -17,9 +17,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd November 1, 2024
.Dd May 7, 2025
.Dt ZFS 4
.Os
.
Expand Down Expand Up @@ -735,6 +733,40 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.
.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
Sets the number of ARC eviction threads to be used.
.Pp
If set greater than 0, ZFS will dedicate up to that many threads to ARC
eviction.
Each thread will process one sub-list at a time,
until the eviction target is reached or all sub-lists have been processed.
When set to 0, ZFS will compute a reasonable number of eviction threads based
on the number of CPUs.
.TS
box;
lb l l .
CPUs Threads
_
1-4 1
5-8 2
9-15 3
16-31 4
32-63 6
64-95 8
96-127 9
128-159	11
160-191 12
192-223 13
224-255 14
256+ 16
.TE
.Pp
More threads may improve the responsiveness of ZFS to memory pressure.
This can be important for performance when eviction from the ARC becomes
a bottleneck for reads and writes.
.Pp
This parameter can only be set at module load time.
.
.It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
If set to a non zero value, it will replace the
.Sy arc_grow_retry
Expand Down
214 changes: 197 additions & 17 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
* Copyright (c) 2020, George Amanakis. All rights reserved.
* Copyright (c) 2019, 2024, Klara Inc.
* Copyright (c) 2019, 2024, 2025, Klara, Inc.
* Copyright (c) 2019, Allan Jude
* Copyright (c) 2020, The FreeBSD Foundation [1]
* Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
Expand Down Expand Up @@ -336,6 +336,9 @@ static kmutex_t arc_evict_lock;
static boolean_t arc_evict_needed = B_FALSE;
static clock_t arc_last_uncached_flush;

static taskq_t *arc_evict_taskq;
static struct evict_arg *arc_evict_arg;

/*
* Count of bytes evicted since boot.
*/
Expand Down Expand Up @@ -469,6 +472,18 @@ static int zfs_arc_prune_task_threads = 1;
/* Used by spa_export/spa_destroy to flush the arc asynchronously */
static taskq_t *arc_flush_taskq;

/*
* Controls the number of ARC eviction threads to dispatch sublists to.
*
* Possible values:
* 0 (auto) compute the number of threads using a logarithmic formula.
* 1 (disabled) one thread - parallel eviction is disabled.
* 2+ (manual) set the number manually.
*
* See arc_evict_thread_init() for how "auto" is computed.
*/
static uint_t zfs_arc_evict_threads = 0;

/* The 7 states: */
arc_state_t ARC_anon;
arc_state_t ARC_mru;
Expand Down Expand Up @@ -4048,6 +4063,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
kmem_free(markers, sizeof (*markers) * count);
}

/*
 * Argument block for one parallel-eviction task.  The dispatcher in
 * arc_evict_state() fills in the inputs, dispatches arc_evict_task()
 * on the arc_evict taskq, and reads eva_evicted back after taskq_wait().
 */
typedef struct evict_arg {
	taskq_ent_t eva_tqent;		/* pre-allocated taskq entry */
	multilist_t *eva_ml;		/* ARC state sub-list set to evict from */
	arc_buf_hdr_t *eva_marker;	/* eviction marker for this sub-list */
	int eva_idx;			/* index of the sub-list to process */
	uint64_t eva_spa;		/* spa, passed to arc_evict_state_impl() */
	uint64_t eva_bytes;		/* per-task eviction target in bytes */
	uint64_t eva_evicted;		/* out: bytes actually evicted */
} evict_arg_t;

/*
 * Taskq callback for parallel ARC eviction: evict from the single
 * sub-list described by the argument block, recording the number of
 * bytes evicted so the dispatcher can total them up after taskq_wait().
 */
static void
arc_evict_task(void *arg)
{
	evict_arg_t *e = arg;

	e->eva_evicted = arc_evict_state_impl(e->eva_ml, e->eva_idx,
	    e->eva_marker, e->eva_spa, e->eva_bytes);
}

/*
 * Settle the number of ARC eviction threads and, when more than one is
 * wanted, create the "arc_evict" taskq plus the per-thread argument array.
 *
 * When zfs_arc_evict_threads is 0 ("auto"), choose a count that grows
 * roughly logarithmically with CPU count: a single thread below six CPUs
 * (two threads at 4-5 CPUs would be too few to be useful), otherwise
 * log2(ncpus) + ncpus/32, which reaches the default cap of 16 threads
 * at ~256 CPUs.  An explicit setting is clamped to the CPU count.
 *
 * Called once from arc_init(); the taskq and array are torn down in
 * arc_fini().
 */
static void
arc_evict_thread_init(void)
{
	if (zfs_arc_evict_threads == 0) {
		zfs_arc_evict_threads = (max_ncpus < 6) ? 1 :
		    (highbit64(max_ncpus) - 1) + max_ncpus / 32;
	} else if (zfs_arc_evict_threads > max_ncpus) {
		zfs_arc_evict_threads = max_ncpus;
	}

	/* A single thread evicts inline; no taskq machinery is needed. */
	if (zfs_arc_evict_threads <= 1)
		return;

	arc_evict_taskq = taskq_create("arc_evict",
	    zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
	    TASKQ_PREPOPULATE);
	arc_evict_arg = kmem_zalloc(
	    sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
}

/*
* The minimum number of bytes we can evict at once is a block size.
 * So, SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
* We use this value to compute a scaling factor for the eviction tasks.
*/
#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)

/*
* Evict buffers from the given arc state, until we've removed the
* specified number of bytes. Move the removed buffers to the
Expand All @@ -4069,9 +4140,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_t *ml = &state->arcs_list[type];
int num_sublists;
arc_buf_hdr_t **markers;
evict_arg_t *eva = NULL;

num_sublists = multilist_get_num_sublists(ml);

boolean_t use_evcttq = zfs_arc_evict_threads > 1;

/*
* If we've tried to evict from each sublist, made some
* progress, but still have not hit the target number of bytes
Expand All @@ -4093,25 +4167,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_sublist_unlock(mls);
}

if (use_evcttq) {
if (zthr_iscurthread(arc_evict_zthr))
eva = arc_evict_arg;
else
eva = kmem_alloc(sizeof (evict_arg_t) *
zfs_arc_evict_threads, KM_NOSLEEP);
if (eva) {
for (int i = 0; i < zfs_arc_evict_threads; i++) {
taskq_init_ent(&eva[i].eva_tqent);
eva[i].eva_ml = ml;
eva[i].eva_spa = spa;
}
} else {
/*
* Fall back to the regular single evict if it is not
* possible to allocate memory for the taskq entries.
*/
use_evcttq = B_FALSE;
}
}

/*
* Start eviction using a randomly selected sublist, this is to try and
* evenly balance eviction across all sublists. Always starting at the
* same sublist (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
uint64_t scan_evicted = 0;
int sublists_left = num_sublists;
int sublist_idx = multilist_get_random_index(ml);

/*
* While we haven't hit our target number of bytes to evict, or
* we're evicting all available buffers.
*/
while (total_evicted < bytes) {
int sublist_idx = multilist_get_random_index(ml);
uint64_t scan_evicted = 0;
uint64_t evict = MIN_EVICT_SIZE;
uint_t ntasks = zfs_arc_evict_threads;

/*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
* (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
for (int i = 0; i < num_sublists; i++) {
if (use_evcttq) {
if (sublists_left < ntasks)
ntasks = sublists_left;

if (ntasks < 2)
use_evcttq = B_FALSE;
}

if (use_evcttq) {
uint64_t left = bytes - total_evicted;

if (bytes == ARC_EVICT_ALL) {
evict = bytes;
} else if (left > ntasks * MIN_EVICT_SIZE) {
evict = DIV_ROUND_UP(left, ntasks);
} else {
ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
if (ntasks == 1)
use_evcttq = B_FALSE;
}
}

for (int i = 0; sublists_left > 0; i++, sublist_idx++,
sublists_left--) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;

/* we've reached the end, wrap to the beginning */
if (sublist_idx >= num_sublists)
sublist_idx = 0;

if (use_evcttq) {
if (i == ntasks)
break;

eva[i].eva_marker = markers[sublist_idx];
eva[i].eva_idx = sublist_idx;
eva[i].eva_bytes = evict;

taskq_dispatch_ent(arc_evict_taskq,
arc_evict_task, &eva[i], 0,
&eva[i].eva_tqent);

continue;
}

if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
Expand All @@ -4122,18 +4262,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,

scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
}

/* we've reached the end, wrap to the beginning */
if (++sublist_idx >= num_sublists)
sublist_idx = 0;
if (use_evcttq) {
taskq_wait(arc_evict_taskq);

for (int i = 0; i < ntasks; i++) {
scan_evicted += eva[i].eva_evicted;
total_evicted += eva[i].eva_evicted;
}
}

/*
* If we didn't evict anything during this scan, we have
* no reason to believe we'll evict more during another
* If we scanned all sublists and didn't evict anything, we
* have no reason to believe we'll evict more during another
* scan, so break the loop.
*/
if (scan_evicted == 0) {
if (scan_evicted == 0 && sublists_left == 0) {
/* This isn't possible, let's make that obvious */
ASSERT3S(bytes, !=, 0);

Expand All @@ -4150,13 +4295,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,

break;
}

/*
* If we scanned all sublists but still have more to do,
* reset the counts so we can go around again.
*/
if (sublists_left == 0) {
sublists_left = num_sublists;
sublist_idx = multilist_get_random_index(ml);
scan_evicted = 0;

/*
* Since we're about to reconsider all sublists,
* re-enable use of the evict threads if available.
*/
use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
}
}

if (eva != NULL && eva != arc_evict_arg)
kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);

for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}

if (markers != arc_state_evict_markers)
arc_state_free_markers(markers, num_sublists);

Expand Down Expand Up @@ -7795,6 +7960,7 @@ arc_set_limits(uint64_t allmem)
/* How to set default max varies by platform. */
arc_c_max = arc_default_max(arc_c_min, allmem);
}

void
arc_init(void)
{
Expand Down Expand Up @@ -7872,6 +8038,8 @@ arc_init(void)
arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

arc_evict_thread_init();

list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
offsetof(arc_async_flush_t, af_node));
mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
Expand Down Expand Up @@ -7972,11 +8140,20 @@ arc_fini(void)
list_destroy(&arc_prune_list);
mutex_destroy(&arc_prune_mtx);

if (arc_evict_taskq != NULL)
taskq_wait(arc_evict_taskq);

(void) zthr_cancel(arc_evict_zthr);
(void) zthr_cancel(arc_reap_zthr);
arc_state_free_markers(arc_state_evict_markers,
arc_state_evict_marker_count);

if (arc_evict_taskq != NULL) {
taskq_destroy(arc_evict_taskq);
kmem_free(arc_evict_arg,
sizeof (evict_arg_t) * zfs_arc_evict_threads);
}

mutex_destroy(&arc_evict_lock);
list_destroy(&arc_evict_waiters);

Expand Down Expand Up @@ -11100,3 +11277,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
"Number of arc_prune threads");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
"Number of threads to use for ARC eviction.");
Loading