
Commit 4cd510d

Alexander Stetsenko authored and allanjude committed
Implement parallel ARC eviction
Read and write performance can become limited by the arc_evict process being single threaded. Additional data cannot be added to the ARC until sufficient existing data is evicted. On many-core systems with TBs of RAM, a single thread becomes a significant bottleneck. With this change we see a 25% increase in read and write throughput.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
1 parent b3b7491 commit 4cd510d


2 files changed: 168 additions, 8 deletions

man/man4/zfs.4

Lines changed: 12 additions & 0 deletions
@@ -667,6 +667,14 @@ with 8-byte pointers.
 For configurations with a known larger average block size,
 this value can be increased to reduce the memory footprint.
 .
+.It Sy zfs_arc_evict_parallel Ns = Ns Sy 0 Pq uint
+When set to 1, ZFS will use up to
+.Sy zfs_arc_evict_threads
+threads to evict data from the ARC in parallel, improving the responsiveness
+of ZFS to memory pressure.
+This can be important for performance when eviction from the ARC becomes
+a bottleneck for reads and writes.
+.
 .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint
 When
 .Fn arc_is_overflowing ,
@@ -690,6 +698,10 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
 This batch-style operation prevents entire sub-lists from being evicted at once
 but comes at a cost of additional unlocking and locking.
 .
+.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq uint
+Sets the maximum number of ARC eviction threads to be used.
+When set to 0, ZFS uses half the available CPUs or 16, whichever is less.
+.
 .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
 If set to a non zero value, it will replace the
 .Sy arc_grow_retry
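As a quick illustration of the zfs_arc_evict_threads default described above (half the available CPUs, capped at 16), the sizing rule can be sketched as a small standalone program. The CPU counts below are made up and default_evict_threads is a hypothetical helper for illustration only, not ZFS code.

#include <stdio.h>

/* Sizing rule from the man page text: half the available CPUs, at most 16. */
static unsigned
default_evict_threads(unsigned ncpus)
{
	unsigned half = ncpus / 2;
	return (half < 16 ? half : 16);
}

int
main(void)
{
	/* 8 CPUs -> 4 threads, 16 -> 8, 32 -> 16, 64 -> 16 (capped) */
	for (unsigned ncpus = 8; ncpus <= 64; ncpus *= 2)
		printf("%u CPUs -> %u eviction threads\n",
		    ncpus, default_evict_threads(ncpus));
	return (0);
}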

module/zfs/arc.c

Lines changed: 156 additions & 8 deletions
@@ -464,6 +464,20 @@ static uint_t zfs_arc_lotsfree_percent = 10;
  */
 static int zfs_arc_prune_task_threads = 1;
 
+/*
+ * Number of arc_evict threads
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT)
+
+static uint_t zfs_arc_evict_parallel = 0;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -3885,7 +3899,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
-		multilist_sublist_move_forward(mls, marker);
 
		/*
		 * The only case where the b_spa field should ever be
@@ -3895,11 +3908,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
-		if (hdr->b_spa == 0)
+		if (hdr->b_spa == 0) {
+			multilist_sublist_move_forward(mls, marker);
			continue;
+		}
 
		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
+			multilist_sublist_move_forward(mls, marker);
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}
@@ -3934,6 +3950,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
				evict_count--;
 
		} else {
+			multilist_sublist_move_forward(mls, marker);
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}
@@ -4021,6 +4038,35 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
	kmem_free(markers, sizeof (*markers) * count);
 }
 
+taskq_t *arc_evict_taskq;
+
+typedef struct evict_arg {
+	taskq_ent_t tqe;
+	multilist_t *ml;
+	int idx;
+	arc_buf_hdr_t *marker;
+	uint64_t spa;
+	uint64_t bytes;
+	volatile uint64_t *evicted_ptr;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	volatile uint64_t *evictedp = eva->evicted_ptr;
+	multilist_t *ml = eva->ml;
+	arc_buf_hdr_t *marker = eva->marker;
+	int idx = eva->idx;
+	uint64_t spa = eva->spa;
+	uint64_t evict = eva->bytes;
+	uint64_t bytes_evicted;
+
+	bytes_evicted = arc_evict_state_impl(ml, idx, marker, spa, evict);
+
+	atomic_add_64(evictedp, bytes_evicted);
+}
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
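A note on the pattern above: each evict_arg_t embeds its own taskq_ent_t, which lets the dispatch loop (shown later in arc_evict_state()) hand work to the taskq without allocating per dispatch, and every task folds its result into a single shared counter via atomic_add_64(). The snippet below is a minimal userspace analogue of that fan-out/aggregate shape using pthreads and C11 atomics; evict_job_t, evict_worker, and NWORKERS are invented names, the "eviction" is faked, and none of this is the kernel code.

/*
 * Userspace analogue of the fan-out/aggregate pattern: several workers
 * each handle a share of the target and add their result to one shared
 * counter; the caller waits for all of them (pthreads stand in for the
 * kernel taskq).
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NWORKERS 4

typedef struct {
	uint64_t bytes_target;		/* this worker's share of the target */
	_Atomic uint64_t *evicted;	/* shared result accumulator */
} evict_job_t;

static void *
evict_worker(void *arg)
{
	evict_job_t *job = arg;
	/* Pretend we evicted exactly what we were asked for. */
	uint64_t done = job->bytes_target;
	atomic_fetch_add(job->evicted, done);
	return (NULL);
}

int
main(void)
{
	_Atomic uint64_t evicted = 0;
	evict_job_t jobs[NWORKERS];
	pthread_t tids[NWORKERS];

	for (int i = 0; i < NWORKERS; i++) {
		jobs[i].bytes_target = 16 << 20;	/* 16 MiB each */
		jobs[i].evicted = &evicted;
		pthread_create(&tids[i], NULL, evict_worker, &jobs[i]);
	}
	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tids[i], NULL);

	printf("evicted %llu bytes\n", (unsigned long long)evicted);
	return (0);
}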
@@ -4040,10 +4086,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 {
	uint64_t total_evicted = 0;
	multilist_t *ml = &state->arcs_list[type];
-	int num_sublists;
	arc_buf_hdr_t **markers;
+	unsigned num_sublists = multilist_get_num_sublists(ml);
 
-	num_sublists = multilist_get_num_sublists(ml);
+	if (bytes == 0)
+		return (total_evicted);
 
	/*
	 * If we've tried to evict from each sublist, made some
@@ -4066,25 +4113,107 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
		multilist_sublist_unlock(mls);
	}
 
+	evict_arg_t *evarg = kmem_alloc(sizeof (*evarg) * num_sublists,
+	    KM_SLEEP);
	/*
	 * While we haven't hit our target number of bytes to evict, or
	 * we're evicting all available buffers.
	 */
	while (total_evicted < bytes) {
		int sublist_idx = multilist_get_random_index(ml);
+		boolean_t usetskq = zfs_arc_evict_parallel;
		uint64_t scan_evicted = 0;
 
+		uint64_t left = (bytes == ARC_EVICT_ALL ? bytes :
+		    bytes - total_evicted);
+
+		/*
+		 * How we scale
+		 *
+		 * Example 1, # of chunks less than # of tasks.
+		 * We have:
+		 * - 4 tasks
+		 * - 3 chunks
+		 * - 3 full col
+		 * - 0 low cols.
+		 *
+		 * The first low col index is 3.
+		 * The tasks #0-#2 evict 1 chunk each.
+		 *
+		 * 0 | 1 | 2 | 3 |
+		 * +===+===+===+===+
+		 * | x | x | x |   |
+		 * +---+---+---+---+
+		 *
+		 * Example 2, # of chunks more than # of tasks.
+		 * We have:
+		 * - 4 tasks
+		 * - 9 chunks
+		 * - 1 full col
+		 * - 3 low cols
+		 *
+		 * The first low col index is 1.
+		 * The task #0 evicts 3 chunks, the others evict 2 chunks each.
+		 *
+		 * 0 | 1 | 2 | 3 |
+		 * +===+===+===+===+
+		 * | x | x | x | x |
+		 * +---+---+---+---+
+		 * | x | x | x | x |
+		 * +---+---+---+---+
+		 * | x |   |   |   |
+		 * +---+---+---+---+
+		 */
+
+		/*
+		 * Compute number of tasks to run (n), low col index (k)
+		 * and normal and low bytes per task.
+		 */
+		uint64_t nchunks = ((left - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1;
+		unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
+		uint64_t fullrows = nchunks / n;
+		unsigned lastrowcols = nchunks % n;
+		unsigned k = (lastrowcols ? lastrowcols : n);
+
+		uint64_t bytes_pertask_low = fullrows << MIN_EVICT_PERTASK_SHIFT;
+		uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
+		    (1 << MIN_EVICT_PERTASK_SHIFT) : 0);
+
		/*
		 * Start eviction using a randomly selected sublist,
		 * this is to try and evenly balance eviction across all
		 * sublists. Always starting at the same sublist
		 * (e.g. index 0) would cause evictions to favor certain
		 * sublists over others.
		 */
-		for (int i = 0; i < num_sublists; i++) {
+		for (unsigned i = 0; i < n; i++, sublist_idx++) {
			uint64_t bytes_remaining;
			uint64_t bytes_evicted;
 
+			/* we've reached the end, wrap to the beginning */
+			if (sublist_idx >= num_sublists)
+				sublist_idx = 0;
+
+			if (usetskq) {
+				uint64_t evict = i < k ? bytes_pertask :
+				    bytes_pertask_low;
+
+				ASSERT3S(n, <=, num_sublists);
+
+				memset(&evarg[i].tqe, 0, sizeof (evarg[i].tqe));
+				evarg[i].ml = ml;
+				evarg[i].marker = markers[sublist_idx];
+				evarg[i].spa = spa;
+				evarg[i].evicted_ptr = &scan_evicted;
+				evarg[i].idx = sublist_idx;
+				evarg[i].bytes = evict;
+
+				taskq_dispatch_ent(arc_evict_taskq,
+				    arc_evict_task,
+				    &evarg[i], 0, &evarg[i].tqe);
+				continue;
+			}
+
			if (total_evicted < bytes)
				bytes_remaining = bytes - total_evicted;
			else
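To make the chunk arithmetic above concrete, here is a standalone recomputation for one hypothetical iteration, assuming SPA_MAXBLOCKSHIFT is 24 (16 MiB chunks); the 100 MiB target and the 4 sublists are made-up inputs, and the program simply repeats the patch's formulas outside the kernel.

#include <stdint.h>
#include <stdio.h>

#define MIN_EVICT_PERTASK_SHIFT 24	/* assumed value of SPA_MAXBLOCKSHIFT */

int
main(void)
{
	uint64_t left = 100ULL << 20;	/* 100 MiB still to evict */
	unsigned num_sublists = 4;	/* pretend 4 sublists/tasks */

	/* Same arithmetic as the patch: ceil(left / 16 MiB) chunks. */
	uint64_t nchunks = ((left - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1;
	unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
	uint64_t fullrows = nchunks / n;
	unsigned lastrowcols = nchunks % n;
	unsigned k = (lastrowcols ? lastrowcols : n);

	uint64_t bytes_pertask_low = fullrows << MIN_EVICT_PERTASK_SHIFT;
	uint64_t bytes_pertask = bytes_pertask_low +
	    (lastrowcols ? (1ULL << MIN_EVICT_PERTASK_SHIFT) : 0);

	/* 7 chunks over 4 tasks: tasks 0-2 get 32 MiB, task 3 gets 16 MiB. */
	for (unsigned i = 0; i < n; i++) {
		uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low;
		printf("task %u: %llu MiB\n", i,
		    (unsigned long long)(evict >> 20));
	}
	return (0);
}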
@@ -4095,10 +4224,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
			scan_evicted += bytes_evicted;
			total_evicted += bytes_evicted;
+		}
 
-			/* we've reached the end, wrap to the beginning */
-			if (++sublist_idx >= num_sublists)
-				sublist_idx = 0;
+		if (usetskq) {
+			taskq_wait(arc_evict_taskq);
+			total_evicted += scan_evicted;
		}
 
		/*
@@ -4125,11 +4255,14 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
		}
	}
 
+	kmem_free(evarg, sizeof (*evarg) * num_sublists);
+
	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_remove(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}
+
	if (markers != arc_state_evict_markers)
		arc_state_free_markers(markers, num_sublists);
 
@@ -7737,12 +7870,18 @@ arc_init(void)
 
	buf_init();
 
+	if (zfs_arc_evict_threads == 0)
+		zfs_arc_evict_threads = MIN(16, max_ncpus >> 1);
+
	list_create(&arc_prune_list, sizeof (arc_prune_t),
	    offsetof(arc_prune_t, p_node));
	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	arc_evict_taskq = taskq_create("arc_evict",
+	    MIN(zfs_arc_evict_threads, max_ncpus), defclsyspri,
+	    MIN(zfs_arc_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE);
 
	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -7817,6 +7956,9 @@ arc_fini(void)
		arc_ksp = NULL;
	}
 
+	taskq_wait(arc_evict_taskq);
+	taskq_destroy(arc_evict_taskq);
+
	taskq_wait(arc_prune_taskq);
	taskq_destroy(arc_prune_taskq);
 
@@ -10840,3 +10982,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
	"Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_parallel, UINT, ZMOD_RW,
+	"Evict from the ARC in parallel using a taskq");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RW,
+	"Maximum number of arc_evict threads");
