
Commit f29f808

allanjude, robn, 0mp, and Alexander Stetsenko authored and committed
ARC: parallel eviction
On systems with enormous amounts of memory, the single arc_evict thread can become a bottleneck if reads and writes are stuck behind it, waiting for old data to be evicted before new data can take its place.

This commit adds support for evicting from multiple ARC lists in parallel, by farming the evict work out to some number of threads and then accumulating their results.

A new tuneable, zfs_arc_evict_threads, sets the number of threads. By default, it will scale based on the number of CPUs.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Youzhong Yang <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
Signed-off-by: Mateusz Piotrowski <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Co-authored-by: Rob Norris <[email protected]>
Co-authored-by: Mateusz Piotrowski <[email protected]>
Co-authored-by: Alexander Stetsenko <[email protected]>
Closes openzfs#16486
1 parent 2ea7136 commit f29f808

File tree

  man/man4/zfs.4
  module/zfs/arc.c

2 files changed: +233 -21 lines changed

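The commit message above describes the approach: split the outstanding eviction target across several workers, let each drain one sub-list, and then sum up what they evicted. The sketch below only illustrates that pattern in userspace; it uses POSIX threads and made-up names (evict_worker, evict_sublist, a fixed 1 GiB goal), whereas the actual change in module/zfs/arc.c below dispatches work onto a kernel taskq with taskq_dispatch_ent() and collects the results after taskq_wait().

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NWORKERS	4

/* Hypothetical per-worker job: evict roughly `target` bytes from one sub-list. */
struct evict_job {
	int sublist;		/* which sub-list this worker drains */
	uint64_t target;	/* per-worker share of the eviction goal */
	uint64_t evicted;	/* bytes this worker actually evicted */
};

/* Stand-in for the real eviction work: pretend each sub-list yields its target. */
static uint64_t
evict_sublist(int sublist, uint64_t target)
{
	(void) sublist;
	return (target);
}

static void *
evict_worker(void *arg)
{
	struct evict_job *job = arg;

	job->evicted = evict_sublist(job->sublist, job->target);
	return (NULL);
}

int
main(void)
{
	uint64_t goal = 1024ULL * 1024 * 1024;	/* pretend we must evict 1 GiB */
	uint64_t per_worker = (goal + NWORKERS - 1) / NWORKERS;
	struct evict_job jobs[NWORKERS];
	pthread_t tids[NWORKERS];
	uint64_t total = 0;

	/* Farm the work out: one job per worker, each on its own sub-list. */
	for (int i = 0; i < NWORKERS; i++) {
		jobs[i] = (struct evict_job){ .sublist = i, .target = per_worker };
		pthread_create(&tids[i], NULL, evict_worker, &jobs[i]);
	}

	/* Wait for all workers, then accumulate their results. */
	for (int i = 0; i < NWORKERS; i++) {
		pthread_join(tids[i], NULL);
		total += jobs[i].evicted;
	}

	printf("evicted %llu bytes total\n", (unsigned long long)total);
	return (0);
}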

man/man4/zfs.4

Lines changed: 36 additions & 4 deletions
@@ -3,7 +3,7 @@
 .\" Copyright (c) 2013 by Turbo Fredriksson <[email protected]>. All rights reserved.
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
-.\" Copyright (c) 2023, 2024 Klara, Inc.
+.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License"). You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
@@ -17,9 +17,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.\" Copyright (c) 2024, Klara, Inc.
-.\"
-.Dd November 1, 2024
+.Dd May 7, 2025
 .Dt ZFS 4
 .Os
 .
@@ -730,6 +728,40 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
 This batch-style operation prevents entire sub-lists from being evicted at once
 but comes at a cost of additional unlocking and locking.
 .
+.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
+Sets the number of ARC eviction threads to be used.
+.Pp
+If set greater than 0, ZFS will dedicate up to that many threads to ARC
+eviction.
+Each thread will process one sub-list at a time,
+until the eviction target is reached or all sub-lists have been processed.
+When set to 0, ZFS will compute a reasonable number of eviction threads based
+on the number of CPUs.
+.TS
+box;
+lb l l .
+	CPUs	Threads
+_
+	1-4	1
+	5-8	2
+	9-15	3
+	16-31	4
+	32-63	6
+	64-95	8
+	96-127	9
+	128-160	11
+	160-191	12
+	192-223	13
+	224-255	14
+	256+	16
+.TE
+.Pp
+More threads may improve the responsiveness of ZFS to memory pressure.
+This can be important for performance when eviction from the ARC becomes
+a bottleneck for reads and writes.
+.Pp
+This parameter can only be set at module load time.
+.
 .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
 If set to a non zero value, it will replace the
 .Sy arc_grow_retry
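The table above corresponds to the auto-scaling formula this commit adds in arc_evict_thread_init() (see the module/zfs/arc.c diff below): one thread below 6 CPUs, otherwise roughly log2(ncpus) + ncpus/32, reaching the 16-thread maximum at about 256 CPUs. Here is a minimal userspace sketch of that calculation, with a simple loop standing in for the kernel's highbit64():

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-in for the kernel's highbit64(): 1-based index of the highest set bit. */
static unsigned int
highbit64(uint64_t v)
{
	unsigned int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

/* Auto-scaled thread count, mirroring the zfs_arc_evict_threads == 0 case. */
static unsigned int
auto_evict_threads(unsigned int ncpus)
{
	if (ncpus < 6)
		return (1);
	return ((highbit64(ncpus) - 1) + ncpus / 32);
}

int
main(void)
{
	unsigned int cpus[] = { 4, 16, 32, 64, 96, 128, 256 };

	for (size_t i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++)
		printf("%3u CPUs -> %2u eviction threads\n",
		    cpus[i], auto_evict_threads(cpus[i]));
	return (0);
}

Its output — 1, 4, 6, 8, 9, 11, and 16 threads for 4, 16, 32, 64, 96, 128, and 256 CPUs — lines up with the corresponding rows of the table.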

module/zfs/arc.c

Lines changed: 197 additions & 17 deletions
@@ -27,7 +27,7 @@
  * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
- * Copyright (c) 2019, 2024, Klara Inc.
+ * Copyright (c) 2019, 2024, 2025, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -337,6 +337,9 @@ static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;
 
+static taskq_t *arc_evict_taskq;
+static struct evict_arg *arc_evict_arg;
+
 /*
  * Count of bytes evicted since boot.
  */
@@ -470,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
 static taskq_t *arc_flush_taskq;
 
+/*
+ * Controls the number of ARC eviction threads to dispatch sublists to.
+ *
+ * Possible values:
+ *	0 (auto) compute the number of threads using a logarithmic formula.
+ *	1 (disabled) one thread - parallel eviction is disabled.
+ *	2+ (manual) set the number manually.
+ *
+ * See arc_evict_thread_init() for how "auto" is computed.
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -4049,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
+typedef struct evict_arg {
+	taskq_ent_t eva_tqent;
+	multilist_t *eva_ml;
+	arc_buf_hdr_t *eva_marker;
+	int eva_idx;
+	uint64_t eva_spa;
+	uint64_t eva_bytes;
+	uint64_t eva_evicted;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
+	    eva->eva_marker, eva->eva_spa, eva->eva_bytes);
+}
+
+static void
+arc_evict_thread_init(void)
+{
+	if (zfs_arc_evict_threads == 0) {
+		/*
+		 * Compute number of threads we want to use for eviction.
+		 *
+		 * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
+		 * default max of 16 threads at ~256 CPUs.
+		 *
+		 * However, that formula goes to two threads at 4 CPUs, which
+		 * is still rather too low to be really useful, so we just go
+		 * with 1 thread at fewer than 6 cores.
+		 */
+		if (max_ncpus < 6)
+			zfs_arc_evict_threads = 1;
+		else
+			zfs_arc_evict_threads =
+			    (highbit64(max_ncpus) - 1) + max_ncpus / 32;
+	} else if (zfs_arc_evict_threads > max_ncpus)
+		zfs_arc_evict_threads = max_ncpus;
+
+	if (zfs_arc_evict_threads > 1) {
+		arc_evict_taskq = taskq_create("arc_evict",
+		    zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
+		    TASKQ_PREPOPULATE);
+		arc_evict_arg = kmem_zalloc(
+		    sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
+	}
+}
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	MIN_EVICT_SIZE	(SPA_MAXBLOCKSIZE)
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
@@ -4070,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 	multilist_t *ml = &state->arcs_list[type];
 	int num_sublists;
 	arc_buf_hdr_t **markers;
+	evict_arg_t *eva = NULL;
 
 	num_sublists = multilist_get_num_sublists(ml);
 
+	boolean_t use_evcttq = zfs_arc_evict_threads > 1;
+
 	/*
 	 * If we've tried to evict from each sublist, made some
 	 * progress, but still have not hit the target number of bytes
@@ -4094,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		multilist_sublist_unlock(mls);
 	}
 
+	if (use_evcttq) {
+		if (zthr_iscurthread(arc_evict_zthr))
+			eva = arc_evict_arg;
+		else
+			eva = kmem_alloc(sizeof (evict_arg_t) *
+			    zfs_arc_evict_threads, KM_NOSLEEP);
+		if (eva) {
+			for (int i = 0; i < zfs_arc_evict_threads; i++) {
+				taskq_init_ent(&eva[i].eva_tqent);
+				eva[i].eva_ml = ml;
+				eva[i].eva_spa = spa;
+			}
+		} else {
+			/*
+			 * Fall back to the regular single evict if it is not
+			 * possible to allocate memory for the taskq entries.
+			 */
+			use_evcttq = B_FALSE;
+		}
+	}
+
+	/*
+	 * Start eviction using a randomly selected sublist, this is to try and
+	 * evenly balance eviction across all sublists. Always starting at the
+	 * same sublist (e.g. index 0) would cause evictions to favor certain
+	 * sublists over others.
+	 */
+	uint64_t scan_evicted = 0;
+	int sublists_left = num_sublists;
+	int sublist_idx = multilist_get_random_index(ml);
+
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes) {
-		int sublist_idx = multilist_get_random_index(ml);
-		uint64_t scan_evicted = 0;
+		uint64_t evict = MIN_EVICT_SIZE;
+		uint_t ntasks = zfs_arc_evict_threads;
 
-		/*
-		 * Start eviction using a randomly selected sublist,
-		 * this is to try and evenly balance eviction across all
-		 * sublists. Always starting at the same sublist
-		 * (e.g. index 0) would cause evictions to favor certain
-		 * sublists over others.
-		 */
-		for (int i = 0; i < num_sublists; i++) {
+		if (use_evcttq) {
+			if (sublists_left < ntasks)
+				ntasks = sublists_left;
+
+			if (ntasks < 2)
+				use_evcttq = B_FALSE;
+		}
+
+		if (use_evcttq) {
+			uint64_t left = bytes - total_evicted;
+
+			if (bytes == ARC_EVICT_ALL) {
+				evict = bytes;
+			} else if (left > ntasks * MIN_EVICT_SIZE) {
+				evict = DIV_ROUND_UP(left, ntasks);
+			} else {
+				ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
+				if (ntasks == 1)
+					use_evcttq = B_FALSE;
+			}
+		}
+
+		for (int i = 0; sublists_left > 0; i++, sublist_idx++,
+		    sublists_left--) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
+			/* we've reached the end, wrap to the beginning */
+			if (sublist_idx >= num_sublists)
+				sublist_idx = 0;
+
+			if (use_evcttq) {
+				if (i == ntasks)
+					break;
+
+				eva[i].eva_marker = markers[sublist_idx];
+				eva[i].eva_idx = sublist_idx;
+				eva[i].eva_bytes = evict;
+
+				taskq_dispatch_ent(arc_evict_taskq,
+				    arc_evict_task, &eva[i], 0,
+				    &eva[i].eva_tqent);
+
+				continue;
+			}
+
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
@@ -4123,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
+		}
 
-			/* we've reached the end, wrap to the beginning */
-			if (++sublist_idx >= num_sublists)
-				sublist_idx = 0;
+		if (use_evcttq) {
+			taskq_wait(arc_evict_taskq);
+
+			for (int i = 0; i < ntasks; i++) {
+				scan_evicted += eva[i].eva_evicted;
+				total_evicted += eva[i].eva_evicted;
+			}
 		}
 
 		/*
-		 * If we didn't evict anything during this scan, we have
-		 * no reason to believe we'll evict more during another
+		 * If we scanned all sublists and didn't evict anything, we
+		 * have no reason to believe we'll evict more during another
 		 * scan, so break the loop.
 		 */
-		if (scan_evicted == 0) {
+		if (scan_evicted == 0 && sublists_left == 0) {
 			/* This isn't possible, let's make that obvious */
 			ASSERT3S(bytes, !=, 0);
 
@@ -4151,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
 			break;
 		}
+
+		/*
+		 * If we scanned all sublists but still have more to do,
+		 * reset the counts so we can go around again.
+		 */
+		if (sublists_left == 0) {
+			sublists_left = num_sublists;
+			sublist_idx = multilist_get_random_index(ml);
+			scan_evicted = 0;
+
+			/*
+			 * Since we're about to reconsider all sublists,
+			 * re-enable use of the evict threads if available.
+			 */
+			use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
+		}
 	}
 
+	if (eva != NULL && eva != arc_evict_arg)
+		kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);
+
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
+
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);
 
@@ -7824,6 +7989,7 @@ arc_set_limits(uint64_t allmem)
 	/* How to set default max varies by platform. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
+
 void
 arc_init(void)
 {
@@ -7901,6 +8067,8 @@ arc_init(void)
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
+	arc_evict_thread_init();
+
 	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
 	    offsetof(arc_async_flush_t, af_node));
 	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -8001,11 +8169,20 @@ arc_fini(void)
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
 
+	if (arc_evict_taskq != NULL)
+		taskq_wait(arc_evict_taskq);
+
 	(void) zthr_cancel(arc_evict_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
 	arc_state_free_markers(arc_state_evict_markers,
 	    arc_state_evict_marker_count);
 
+	if (arc_evict_taskq != NULL) {
+		taskq_destroy(arc_evict_taskq);
+		kmem_free(arc_evict_arg,
+		    sizeof (evict_arg_t) * zfs_arc_evict_threads);
+	}
+
 	mutex_destroy(&arc_evict_lock);
 	list_destroy(&arc_evict_waiters);
 
@@ -11129,3 +11306,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
+	"Number of threads to use for ARC eviction.");
