Commit e99733e

Alexander Stetsenko, allanjude, and 0mp committed
Implement parallel ARC eviction
Read and write performance can become limited by the arc_evict process being
single threaded. Additional data cannot be added to the ARC until sufficient
existing data is evicted.

On many-core systems with TBs of RAM, a single thread becomes a significant
bottleneck. With this change we see a 25% increase in read and write throughput.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <[email protected]>
Co-authored-by: Mateusz Piotrowski <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
Signed-off-by: Mateusz Piotrowski <[email protected]>
1 parent aefc2da commit e99733e

File tree: 2 files changed, +182 −8 lines changed

man/man4/zfs.4

Lines changed: 18 additions & 0 deletions
@@ -724,6 +724,24 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
 This batch-style operation prevents entire sub-lists from being evicted at once
 but comes at a cost of additional unlocking and locking.
 .
+.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq uint
+Sets the maximum number of ARC eviction threads to be used.
+.Pp
+When set to 0,
+ZFS uses the number of threads depending on the number of CPU cores.
+The minimum number of threads is 1 and applies to systems from 1 to 5 CPU cores.
+Systems with 6 CPU cores get 2 eviction threads.
+ZFS on systems larger than that uses log2 of the CPU count
+plus the CPU count shifted 6 bits.
+This way the number of eviction threads scales up more on high CPU counts.
+Currently, ZFS will not scale automatically beyond 16 threads.
+.Pp
+More threads may improve the responsiveness of ZFS to memory pressure.
+This can be important for performance when eviction from the ARC becomes
+a bottleneck for reads and writes.
+.Pp
+Note that the thread count cannot be changed during runtime.
+.
 .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
 If set to a non zero value, it will replace the
 .Sy arc_grow_retry

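For illustration, here is a minimal userspace sketch of the auto-scaling rule described above. It mirrors the arc_init() computation added in module/zfs/arc.c below; ilog2() stands in for the kernel's arc_ilog2() and ncpus for max_ncpus, so treat it as a sketch rather than the module's actual code.

#include <stdio.h>

/* floor(log2(a)), matching the recursive arc_ilog2() helper added below */
static unsigned
ilog2(unsigned a)
{
	unsigned r = 0;

	while (a > 1) {
		a >>= 1;
		r++;
	}
	return (r);
}

/* Auto-selected eviction thread count for a given CPU count, capped at 16 */
static unsigned
evict_threads(unsigned ncpus)
{
	unsigned base = ncpus > 6 ? 2 : 1;
	unsigned scaled = ilog2(ncpus) + (ncpus >> 6);
	unsigned n = scaled > base ? scaled : base;

	return (n < 16 ? n : 16);
}

int
main(void)
{
	unsigned cpus[] = { 1, 4, 6, 8, 32, 64, 128, 256, 1024 };

	for (unsigned i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++)
		printf("%4u CPUs -> %u eviction threads\n",
		    cpus[i], evict_threads(cpus[i]));
	return (0);
}

Run standalone, this prints, for example, 2 threads for 6 CPUs, 7 for 64 CPUs, and the 16-thread cap on very large systems.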
module/zfs/arc.c

Lines changed: 164 additions & 8 deletions
@@ -465,6 +465,19 @@ static uint_t zfs_arc_lotsfree_percent = 10;
  */
 static int zfs_arc_prune_task_threads = 1;
 
+/*
+ * Number of arc_evict threads
+ */
+static uint_t zfs_arc_evict_threads = 0;
+static uint_t zfs_arc_evict_threads_live = 0;
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	MIN_EVICT_PERTASK_SHIFT	(SPA_MAXBLOCKSHIFT)
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -3890,7 +3903,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
-		multilist_sublist_move_forward(mls, marker);
 
		/*
		 * The only case where the b_spa field should ever be
@@ -3900,11 +3912,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
-		if (hdr->b_spa == 0)
+		if (hdr->b_spa == 0) {
+			multilist_sublist_move_forward(mls, marker);
			continue;
+		}
 
		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
+			multilist_sublist_move_forward(mls, marker);
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}
@@ -3939,6 +3954,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
			evict_count--;
 
		} else {
+			multilist_sublist_move_forward(mls, marker);
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}
@@ -4026,6 +4042,35 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
	kmem_free(markers, sizeof (*markers) * count);
 }
 
+taskq_t *arc_evict_taskq;
+
+typedef struct evict_arg {
+	taskq_ent_t tqe;
+	multilist_t *ml;
+	int idx;
+	arc_buf_hdr_t *marker;
+	uint64_t spa;
+	uint64_t bytes;
+	volatile uint64_t *evicted_ptr;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	volatile uint64_t *evictedp = eva->evicted_ptr;
+	multilist_t *ml = eva->ml;
+	arc_buf_hdr_t *marker = eva->marker;
+	int idx = eva->idx;
+	uint64_t spa = eva->spa;
+	uint64_t evict = eva->bytes;
+	uint64_t bytes_evicted;
+
+	bytes_evicted = arc_evict_state_impl(ml, idx, marker, spa, evict);
+
+	atomic_add_64(evictedp, bytes_evicted);
+}
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
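The evict_arg_t structure and arc_evict_task() above implement a scatter/gather pattern: each dispatched task evicts its assigned share and atomically folds the result into a shared counter, which the caller reads after waiting for the taskq to drain. Below is a minimal userspace analogy of that pattern, with pthreads and C11 atomics standing in for taskq_dispatch_ent(), atomic_add_64(), and taskq_wait(); the names and byte counts here are illustrative only.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Per-task argument, analogous to evict_arg_t: a work amount plus a
 * pointer to the shared counter the task adds its result into.
 */
typedef struct {
	uint64_t bytes;
	_Atomic uint64_t *evicted_ptr;
} task_arg_t;

/* Analogous to arc_evict_task(): do the work, then publish the result. */
static void *
task(void *arg)
{
	task_arg_t *ta = arg;
	uint64_t done = ta->bytes;	/* pretend we evicted everything asked */

	atomic_fetch_add(ta->evicted_ptr, done);
	return (NULL);
}

int
main(void)
{
	enum { NTASKS = 4 };
	_Atomic uint64_t scan_evicted = 0;
	pthread_t tids[NTASKS];
	task_arg_t args[NTASKS];

	for (int i = 0; i < NTASKS; i++) {
		args[i].bytes = (uint64_t)(i + 1) * 1024;
		args[i].evicted_ptr = &scan_evicted;
		pthread_create(&tids[i], NULL, task, &args[i]);
	}

	/* Analogous to taskq_wait(arc_evict_taskq). */
	for (int i = 0; i < NTASKS; i++)
		pthread_join(tids[i], NULL);

	printf("total evicted: %llu bytes\n",
	    (unsigned long long)atomic_load(&scan_evicted));
	return (0);
}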
@@ -4045,10 +4090,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 {
	uint64_t total_evicted = 0;
	multilist_t *ml = &state->arcs_list[type];
-	int num_sublists;
	arc_buf_hdr_t **markers;
+	unsigned num_sublists = multilist_get_num_sublists(ml);
 
-	num_sublists = multilist_get_num_sublists(ml);
+	if (bytes == 0)
+		return (total_evicted);
 
	/*
	 * If we've tried to evict from each sublist, made some
@@ -4071,25 +4117,108 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
		multilist_sublist_unlock(mls);
	}
 
+	evict_arg_t *evarg = kmem_alloc(sizeof (*evarg) * num_sublists,
+	    KM_SLEEP);
	/*
	 * While we haven't hit our target number of bytes to evict, or
	 * we're evicting all available buffers.
	 */
	while (total_evicted < bytes) {
		int sublist_idx = multilist_get_random_index(ml);
+		boolean_t usetskq = zfs_arc_evict_threads_live > 1;
		uint64_t scan_evicted = 0;
 
+		uint64_t left = (bytes == ARC_EVICT_ALL ? bytes :
+		    bytes - total_evicted);
+
+		/*
+		 * How we scale
+		 *
+		 * Example 1, # of chunks less than # of tasks.
+		 * We have:
+		 * - 4 tasks
+		 * - 3 chunks
+		 * - 3 full col
+		 * - 0 low cols.
+		 *
+		 * The first low col index is 3.
+		 * The tasks #0-#2 evict 1 chunk each.
+		 *
+		 *   0 | 1 | 2 | 3 |
+		 * +===+===+===+===+
+		 * | x | x | x |   |
+		 * +---+---+---+---+
+		 *
+		 * Example 2, # of chunks more than # of tasks.
+		 * We have:
+		 * - 4 tasks
+		 * - 9 chunks
+		 * - 1 full col
+		 * - 3 low cols
+		 *
+		 * The first low col index is 1.
+		 * The task #0 evicts 3 chunks, the others evict 2 chunks each.
+		 *
+		 *   0 | 1 | 2 | 3 |
+		 * +===+===+===+===+
+		 * | x | x | x | x |
+		 * +---+---+---+---+
+		 * | x | x | x | x |
+		 * +---+---+---+---+
+		 * | x |   |   |   |
+		 * +---+---+---+---+
+		 */
+
+		/*
+		 * Compute number of tasks to run (n), low col index (k)
+		 * and normal and low bytes per task.
+		 */
+		uint64_t nchunks = ((left - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1;
+		unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
+		uint64_t fullrows = nchunks / n;
+		unsigned lastrowcols = nchunks % n;
+		unsigned k = (lastrowcols ? lastrowcols : n);
+
+		uint64_t bytes_pertask_low =
+		    fullrows << MIN_EVICT_PERTASK_SHIFT;
+		uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
+		    (1 << MIN_EVICT_PERTASK_SHIFT) : 0);
+
		/*
		 * Start eviction using a randomly selected sublist,
		 * this is to try and evenly balance eviction across all
		 * sublists. Always starting at the same sublist
		 * (e.g. index 0) would cause evictions to favor certain
		 * sublists over others.
		 */
-		for (int i = 0; i < num_sublists; i++) {
+		for (unsigned i = 0; i < n; i++, sublist_idx++) {
			uint64_t bytes_remaining;
			uint64_t bytes_evicted;
 
+			/* we've reached the end, wrap to the beginning */
+			if (sublist_idx >= num_sublists)
+				sublist_idx = 0;
+
+			if (usetskq) {
+				uint64_t evict = i < k ? bytes_pertask :
+				    bytes_pertask_low;
+
+				ASSERT3S(n, <=, num_sublists);
+
+				memset(&evarg[i].tqe, 0, sizeof (evarg[i].tqe));
+				evarg[i].ml = ml;
+				evarg[i].marker = markers[sublist_idx];
+				evarg[i].spa = spa;
+				evarg[i].evicted_ptr = &scan_evicted;
+				evarg[i].idx = sublist_idx;
+				evarg[i].bytes = evict;
+
+				taskq_dispatch_ent(arc_evict_taskq,
+				    arc_evict_task,
+				    &evarg[i], 0, &evarg[i].tqe);
+				continue;
+			}
+
			if (total_evicted < bytes)
				bytes_remaining = bytes - total_evicted;
			else
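A worked example of the splitting arithmetic above, as a standalone sketch: EVICT_SHIFT stands in for MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT, i.e. 16 MiB chunks), and the eviction target and sublist count are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT, 16 MiB chunks). */
#define	EVICT_SHIFT	24

int
main(void)
{
	uint64_t left = 150ULL << 20;	/* 150 MiB still to evict */
	unsigned num_sublists = 4;	/* tasks available this pass */

	/* Same arithmetic as the loop above. */
	uint64_t nchunks = ((left - 1) >> EVICT_SHIFT) + 1;
	unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
	uint64_t fullrows = nchunks / n;
	unsigned lastrowcols = nchunks % n;
	unsigned k = (lastrowcols ? lastrowcols : n);

	uint64_t bytes_pertask_low = fullrows << EVICT_SHIFT;
	uint64_t bytes_pertask = bytes_pertask_low +
	    (lastrowcols ? (1ULL << EVICT_SHIFT) : 0);

	printf("chunks=%llu tasks=%u\n", (unsigned long long)nchunks, n);
	for (unsigned i = 0; i < n; i++)
		printf("task %u budget %llu MiB\n", i,
		    (unsigned long long)((i < k ? bytes_pertask :
		    bytes_pertask_low) >> 20));
	return (0);
}

With a 150 MiB target split across 4 tasks this gives 10 chunks: tasks 0 and 1 get a 48 MiB budget each and tasks 2 and 3 get 32 MiB each. The per-task budgets round up to whole 16 MiB chunks, so they can sum to slightly more than the remaining target.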
@@ -4100,10 +4229,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
			scan_evicted += bytes_evicted;
			total_evicted += bytes_evicted;
+		}
 
-			/* we've reached the end, wrap to the beginning */
-			if (++sublist_idx >= num_sublists)
-				sublist_idx = 0;
+		if (usetskq) {
+			taskq_wait(arc_evict_taskq);
+			total_evicted += scan_evicted;
		}
 
		/*
@@ -4130,11 +4260,14 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
		}
	}
 
+	kmem_free(evarg, sizeof (*evarg) * num_sublists);
+
	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_remove(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}
+
	if (markers != arc_state_evict_markers)
		arc_state_free_markers(markers, num_sublists);
 
@@ -7673,6 +7806,13 @@ arc_set_limits(uint64_t allmem)
	/* How to set default max varies by platform. */
	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
+
+static inline size_t
+arc_ilog2(int a)
+{
+	return (a > 1 ? 1 + arc_ilog2(a >> 1) : 0);
+}
+
 void
 arc_init(void)
 {
@@ -7743,12 +7883,22 @@ arc_init(void)
 
	buf_init();
 
+	if (zfs_arc_evict_threads == 0)
+		zfs_arc_evict_threads_live = MIN(MAX(max_ncpus > 6 ? 2 : 1,
+		    arc_ilog2(max_ncpus) + (max_ncpus >> 6)), 16);
+	else
+		zfs_arc_evict_threads_live = zfs_arc_evict_threads;
+
	list_create(&arc_prune_list, sizeof (arc_prune_t),
	    offsetof(arc_prune_t, p_node));
	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	arc_evict_taskq = taskq_create("arc_evict",
+	    MIN(zfs_arc_evict_threads_live, max_ncpus), defclsyspri,
+	    MIN(zfs_arc_evict_threads_live, max_ncpus), max_ncpus,
+	    TASKQ_PREPOPULATE);
 
	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -7823,6 +7973,9 @@ arc_fini(void)
		arc_ksp = NULL;
	}
 
+	taskq_wait(arc_evict_taskq);
+	taskq_destroy(arc_evict_taskq);
+
	taskq_wait(arc_prune_taskq);
	taskq_destroy(arc_prune_taskq);
 
@@ -10849,3 +11002,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
	"Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RW,
+	"Maximum number of arc_evict threads");
