Commit 9f650a0

robn, allanjude, 0mp, and Alexander Stetsenko committed
ARC: parallel eviction
On systems with enormous amounts of memory, the single arc_evict thread can become a bottleneck if reads and writes are stuck behind it, waiting for old data to be evicted before new data can take its place.

This commit adds support for evicting from multiple ARC lists in parallel, by farming the evict work out to some number of threads and then accumulating their results.

A new tuneable, zfs_arc_evict_threads, sets the number of threads. By default, it will scale based on the number of CPUs.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <[email protected]>
Co-authored-by: Mateusz Piotrowski <[email protected]>
Co-authored-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
Signed-off-by: Mateusz Piotrowski <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
1 parent b1ccab1 commit 9f650a0
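In outline, the change is a fan-out/join: the eviction path fills in one evict_arg_t per task, dispatches each to a dedicated taskq, waits for the queue to drain, and then sums the per-task results. A distilled sketch of that pattern, using only names and taskq calls that appear in the diff below (the single-threaded fallback and the marker bookkeeping are omitted):

    /*
     * Fan out: give each task one sublist and a byte target, then
     * join on the taskq and accumulate what each task evicted.
     */
    for (int i = 0; i < ntasks; i++) {
        eva[i].eva_idx = sublist_idx;   /* sublist this task will scan */
        eva[i].eva_bytes = evict;       /* per-task eviction target */
        taskq_dispatch_ent(arc_evict_taskq, arc_evict_task, &eva[i], 0,
            &eva[i].eva_tqent);
        sublist_idx = (sublist_idx + 1) % num_sublists;
    }
    taskq_wait(arc_evict_taskq);        /* wait for all dispatched tasks */
    for (int i = 0; i < ntasks; i++)
        total_evicted += eva[i].eva_evicted;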

File tree

2 files changed: +232 −21 lines changed


man/man4/zfs.4

Lines changed: 36 additions & 4 deletions
@@ -3,7 +3,7 @@
 .\" Copyright (c) 2013 by Turbo Fredriksson <[email protected]>. All rights reserved.
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
-.\" Copyright (c) 2023, 2024 Klara, Inc.
+.\" Copyright (c) 2023, 2024, 2025, Klara, Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License"). You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
@@ -17,9 +17,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.\" Copyright (c) 2024, Klara, Inc.
-.\"
-.Dd November 1, 2024
+.Dd May 7, 2025
 .Dt ZFS 4
 .Os
 .
@@ -735,6 +733,40 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
 This batch-style operation prevents entire sub-lists from being evicted at once
 but comes at a cost of additional unlocking and locking.
 .
+.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int
+Sets the number of ARC eviction threads to be used.
+.Pp
+If set greater than 0, ZFS will dedicate up to that many threads to ARC
+eviction.
+Each thread will process one sub-list at a time,
+until the eviction target is reached or all sub-lists have been processed.
+When set to 0, ZFS will compute a reasonable number of eviction threads based
+on the number of CPUs.
+.TS
+box;
+lb l l .
+CPUs	Threads
+_
+1-4	1
+5-8	2
+9-15	3
+16-31	4
+32-63	6
+64-95	8
+96-127	9
+128-159	11
+160-191	12
+192-223	13
+224-255	14
+256+	16
+.TE
+.Pp
+More threads may improve the responsiveness of ZFS to memory pressure.
+This can be important for performance when eviction from the ARC becomes
+a bottleneck for reads and writes.
+.Pp
+This parameter can only be set at module load time.
+.
 .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
 If set to a non zero value, it will replace the
 .Sy arc_grow_retry
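The thread counts in the table above follow the auto-scaling rule in arc_evict_thread_init() (see the arc.c diff below): one thread below 6 CPUs, otherwise (highbit64(ncpus) - 1) + ncpus / 32. A minimal userspace sketch that reproduces the rule; the highbit64() stand-in (1-based index of the highest set bit, mirroring the kernel helper) and the spot-check values are illustrative only:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for the kernel's highbit64(): 1-based highest set bit. */
    static int
    highbit64(uint64_t v)
    {
        return (v == 0 ? 0 : 64 - __builtin_clzll(v));
    }

    static uint64_t
    evict_threads(uint64_t ncpus)
    {
        if (ncpus < 6)
            return (1);
        return ((highbit64(ncpus) - 1) + ncpus / 32);
    }

    int
    main(void)
    {
        /* Spot-check a few rows of the table above. */
        uint64_t cpus[] = { 4, 16, 32, 96, 256 };
        for (int i = 0; i < 5; i++)
            printf("%3llu CPUs -> %llu threads\n",
                (unsigned long long)cpus[i],
                (unsigned long long)evict_threads(cpus[i]));
        return (0);
    }

Compiled with any compiler providing __builtin_clzll (GCC/Clang), this prints 1, 4, 6, 9, and 16 threads respectively, matching the corresponding table rows.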

module/zfs/arc.c

Lines changed: 196 additions & 17 deletions
@@ -27,7 +27,7 @@
  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
- * Copyright (c) 2019, 2024, Klara Inc.
+ * Copyright (c) 2019, 2024, 2025, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -336,6 +336,10 @@ static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;
 
+static taskq_t *arc_evict_taskq;
+typedef struct evict_arg evict_arg_t;
+static evict_arg_t *arc_evict_arg;
+
 /*
  * Count of bytes evicted since boot.
  */
@@ -469,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
 static taskq_t *arc_flush_taskq;
 
+/*
+ * Controls the number of ARC eviction threads to dispatch sublists to.
+ *
+ * Possible values:
+ *   0  (auto)     compute the number of threads using a logarithmic formula.
+ *   1  (disabled) one thread - parallel eviction is disabled.
+ *   2+ (manual)   set the number manually.
+ *
+ * See arc_evict_thread_init() for how "auto" is computed.
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -4048,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
     kmem_free(markers, sizeof (*markers) * count);
 }
 
+typedef struct evict_arg {
+    taskq_ent_t eva_tqent;
+    multilist_t *eva_ml;
+    arc_buf_hdr_t *eva_marker;
+    int eva_idx;
+    uint64_t eva_spa;
+    uint64_t eva_bytes;
+    uint64_t eva_evicted;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+    evict_arg_t *eva = arg;
+    eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
+        eva->eva_marker, eva->eva_spa, eva->eva_bytes);
+}
+
+static void
+arc_evict_thread_init(void)
+{
+    if (zfs_arc_evict_threads == 0) {
+        /*
+         * Compute the number of threads we want to use for eviction.
+         *
+         * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
+         * default max of 16 threads at ~256 CPUs.
+         *
+         * However, that formula goes to two threads at 4 CPUs, which
+         * is still rather too low to be really useful, so we just go
+         * with 1 thread at fewer than 6 cores.
+         */
+        if (max_ncpus < 6)
+            zfs_arc_evict_threads = 1;
+        else
+            zfs_arc_evict_threads =
+                (highbit64(max_ncpus) - 1) + max_ncpus / 32;
+    } else if (zfs_arc_evict_threads > max_ncpus)
+        zfs_arc_evict_threads = max_ncpus;
+
+    if (zfs_arc_evict_threads > 1) {
+        arc_evict_taskq = taskq_create("arc_evict",
+            zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
+            TASKQ_PREPOPULATE);
+        arc_evict_arg = kmem_zalloc(
+            sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
+    }
+}
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes.  Move the removed buffers to the
@@ -4069,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
     multilist_t *ml = &state->arcs_list[type];
     int num_sublists;
     arc_buf_hdr_t **markers;
+    evict_arg_t *eva = NULL;
 
     num_sublists = multilist_get_num_sublists(ml);
 
+    boolean_t use_evcttq = zfs_arc_evict_threads > 1;
+
     /*
      * If we've tried to evict from each sublist, made some
      * progress, but still have not hit the target number of bytes
@@ -4093,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
         multilist_sublist_unlock(mls);
     }
 
+    if (use_evcttq) {
+        if (zthr_iscurthread(arc_evict_zthr))
+            eva = arc_evict_arg;
+        else
+            eva = kmem_alloc(sizeof (evict_arg_t) *
+                zfs_arc_evict_threads, KM_NOSLEEP);
+        if (eva) {
+            for (int i = 0; i < zfs_arc_evict_threads; i++) {
+                taskq_init_ent(&eva[i].eva_tqent);
+                eva[i].eva_ml = ml;
+                eva[i].eva_spa = spa;
+            }
+        } else {
+            /*
+             * Fall back to the regular single evict if it is not
+             * possible to allocate memory for the taskq entries.
+             */
+            use_evcttq = B_FALSE;
+        }
+    }
+
+    /*
+     * Start eviction using a randomly selected sublist, this is to try and
+     * evenly balance eviction across all sublists.  Always starting at the
+     * same sublist (e.g. index 0) would cause evictions to favor certain
+     * sublists over others.
+     */
+    uint64_t scan_evicted = 0;
+    int sublists_left = num_sublists;
+    int sublist_idx = multilist_get_random_index(ml);
+
     /*
      * While we haven't hit our target number of bytes to evict, or
      * we're evicting all available buffers.
      */
     while (total_evicted < bytes) {
-        int sublist_idx = multilist_get_random_index(ml);
-        uint64_t scan_evicted = 0;
+        uint64_t evict = MIN_EVICT_SIZE;
+        uint_t ntasks = zfs_arc_evict_threads;
 
-        /*
-         * Start eviction using a randomly selected sublist,
-         * this is to try and evenly balance eviction across all
-         * sublists.  Always starting at the same sublist
-         * (e.g. index 0) would cause evictions to favor certain
-         * sublists over others.
-         */
-        for (int i = 0; i < num_sublists; i++) {
+        if (use_evcttq) {
+            if (sublists_left < ntasks)
+                ntasks = sublists_left;
+
+            if (ntasks < 2)
+                use_evcttq = B_FALSE;
+        }
+
+        if (use_evcttq) {
+            uint64_t left = bytes - total_evicted;
+
+            if (bytes == ARC_EVICT_ALL) {
+                evict = bytes;
+            } else if (left > ntasks * MIN_EVICT_SIZE) {
+                evict = DIV_ROUND_UP(left, ntasks);
+            } else {
+                ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
+                if (ntasks == 1)
+                    use_evcttq = B_FALSE;
+            }
+        }
+
+        for (int i = 0; sublists_left > 0; i++, sublist_idx++,
+            sublists_left--) {
             uint64_t bytes_remaining;
             uint64_t bytes_evicted;
 
+            /* we've reached the end, wrap to the beginning */
+            if (sublist_idx >= num_sublists)
+                sublist_idx = 0;
+
+            if (use_evcttq) {
+                if (i == ntasks)
+                    break;
+
+                eva[i].eva_marker = markers[sublist_idx];
+                eva[i].eva_idx = sublist_idx;
+                eva[i].eva_bytes = evict;
+
+                taskq_dispatch_ent(arc_evict_taskq,
+                    arc_evict_task, &eva[i], 0,
+                    &eva[i].eva_tqent);
+
+                continue;
+            }
+
             if (total_evicted < bytes)
                 bytes_remaining = bytes - total_evicted;
             else
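The evict/ntasks computation in the hunk above splits the remaining byte target evenly across the tasks, but never asks a task to evict less than MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE, i.e. 16 MiB): when plenty is left, the target is divided evenly; when little is left, tasks are shed instead. A hypothetical standalone rendering of that split, with the ARC_EVICT_ALL case omitted:

    #include <stdio.h>
    #include <stdint.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
    #define MIN_EVICT_SIZE      (16ULL << 20)   /* SPA_MAXBLOCKSIZE */

    /*
     * Given the bytes still to evict, return the per-task target and,
     * via *ntasksp, the number of tasks actually worth dispatching.
     */
    static uint64_t
    split_target(uint64_t left, unsigned *ntasksp)
    {
        uint64_t evict = MIN_EVICT_SIZE;

        if (left > *ntasksp * MIN_EVICT_SIZE) {
            /* Plenty of work: divide the remainder evenly. */
            evict = DIV_ROUND_UP(left, *ntasksp);
        } else {
            /* Not enough work for every task: shed tasks instead. */
            *ntasksp = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
        }
        return (evict);
    }

    int
    main(void)
    {
        unsigned ntasks = 8;
        uint64_t evict = split_target(1ULL << 30, &ntasks); /* 1 GiB left */
        printf("%u tasks x %llu MiB\n", ntasks,     /* 8 tasks x 128 MiB */
            (unsigned long long)(evict >> 20));

        ntasks = 8;
        evict = split_target(40ULL << 20, &ntasks); /* 40 MiB left */
        printf("%u tasks x %llu MiB\n", ntasks,     /* 3 tasks x 16 MiB */
            (unsigned long long)(evict >> 20));
        return (0);
    }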
@@ -4122,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
             scan_evicted += bytes_evicted;
             total_evicted += bytes_evicted;
+        }
 
-            /* we've reached the end, wrap to the beginning */
-            if (++sublist_idx >= num_sublists)
-                sublist_idx = 0;
+        if (use_evcttq) {
+            taskq_wait(arc_evict_taskq);
+
+            for (int i = 0; i < ntasks; i++) {
+                scan_evicted += eva[i].eva_evicted;
+                total_evicted += eva[i].eva_evicted;
+            }
         }
 
         /*
-         * If we didn't evict anything during this scan, we have
-         * no reason to believe we'll evict more during another
+         * If we scanned all sublists and didn't evict anything, we
+         * have no reason to believe we'll evict more during another
          * scan, so break the loop.
          */
-        if (scan_evicted == 0) {
+        if (scan_evicted == 0 && sublists_left == 0) {
             /* This isn't possible, let's make that obvious */
             ASSERT3S(bytes, !=, 0);
 
@@ -4150,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
             break;
         }
+
+        /*
+         * If we scanned all sublists but still have more to do,
+         * reset the counts so we can go around again.
+         */
+        if (sublists_left == 0) {
+            sublists_left = num_sublists;
+            sublist_idx = multilist_get_random_index(ml);
+            scan_evicted = 0;
+
+            /*
+             * Since we're about to reconsider all sublists,
+             * re-enable use of the evict threads if available.
+             */
+            use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
+        }
     }
 
+    if (eva != NULL && eva != arc_evict_arg)
+        kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);
+
     for (int i = 0; i < num_sublists; i++) {
         multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
         multilist_sublist_remove(mls, markers[i]);
         multilist_sublist_unlock(mls);
     }
+
     if (markers != arc_state_evict_markers)
         arc_state_free_markers(markers, num_sublists);
 
@@ -7795,6 +7961,7 @@ arc_set_limits(uint64_t allmem)
     /* How to set default max varies by platform. */
     arc_c_max = arc_default_max(arc_c_min, allmem);
 }
+
 void
 arc_init(void)
 {
@@ -7872,6 +8039,8 @@ arc_init(void)
     arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
         defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
+    arc_evict_thread_init();
+
     list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
         offsetof(arc_async_flush_t, af_node));
     mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -7955,6 +8124,13 @@ arc_fini(void)
         arc_ksp = NULL;
     }
 
+    if (arc_evict_taskq != NULL) {
+        taskq_wait(arc_evict_taskq);
+        taskq_destroy(arc_evict_taskq);
+        kmem_free(arc_evict_arg,
+            sizeof (evict_arg_t) * zfs_arc_evict_threads);
+    }
+
     taskq_wait(arc_prune_taskq);
     taskq_destroy(arc_prune_taskq);
 
@@ -11100,3 +11276,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
     "Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
+    "Number of threads to use for ARC eviction.");
