27
27
* Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
28
28
* Copyright (c) 2019, loli10K <[email protected] >. All rights reserved.
29
29
* Copyright (c) 2020, George Amanakis. All rights reserved.
30
- * Copyright (c) 2019, 2024, Klara Inc.
30
+ * Copyright (c) 2019, 2024, 2025, Klara, Inc.
31
31
* Copyright (c) 2019, Allan Jude
32
32
* Copyright (c) 2020, The FreeBSD Foundation [1]
33
33
* Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -336,6 +336,10 @@ static kmutex_t arc_evict_lock;
336
336
static boolean_t arc_evict_needed = B_FALSE ;
337
337
static clock_t arc_last_uncached_flush ;
338
338
339
/*
 * Taskq used to dispatch eviction sublists to multiple threads; created in
 * arc_evict_thread_init() only when zfs_arc_evict_threads > 1, otherwise
 * left NULL (parallel eviction disabled).
 */
static taskq_t *arc_evict_taskq;
typedef struct evict_arg evict_arg_t;
/*
 * Pre-allocated array of zfs_arc_evict_threads task arguments, reserved for
 * use by the arc_evict_zthr (other callers allocate their own transiently).
 */
static evict_arg_t *arc_evict_arg;

339
343
/*
340
344
* Count of bytes evicted since boot.
341
345
*/
@@ -469,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
469
473
/* Used by spa_export/spa_destroy to flush the arc asynchronously */
470
474
static taskq_t * arc_flush_taskq ;
471
475
476
/*
 * Controls the number of ARC eviction threads to dispatch sublists to.
 *
 * Possible values:
 *       0  (auto) compute the number of threads using a logarithmic formula.
 *       1  (disabled) one thread - parallel eviction is disabled.
 *      2+  (manual) set the number manually.
 *
 * See arc_evict_thread_init() for how "auto" is computed, and for the
 * clamping of manual values to max_ncpus.  Exposed read-only as the
 * zfs_arc_evict_threads module parameter.
 */
static uint_t zfs_arc_evict_threads = 0;
487
+
472
488
/* The 7 states: */
473
489
arc_state_t ARC_anon ;
474
490
arc_state_t ARC_mru ;
@@ -4048,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
4048
4064
kmem_free (markers , sizeof (* markers ) * count );
4049
4065
}
4050
4066
4067
/*
 * Per-task argument for arc_evict_task(); one entry per eviction thread,
 * filled in by arc_evict_state() before dispatch.
 */
typedef struct evict_arg {
	taskq_ent_t eva_tqent;		/* pre-initialized taskq entry */
	multilist_t *eva_ml;		/* multilist to evict from */
	arc_buf_hdr_t *eva_marker;	/* marker header for this sublist */
	int eva_idx;			/* sublist index within eva_ml */
	uint64_t eva_spa;		/* spa; passed to arc_evict_state_impl() */
	uint64_t eva_bytes;		/* target bytes for this task */
	uint64_t eva_evicted;		/* out: bytes actually evicted */
} evict_arg_t;
4076
+
4077
+ static void
4078
+ arc_evict_task (void * arg )
4079
+ {
4080
+ evict_arg_t * eva = arg ;
4081
+ eva -> eva_evicted = arc_evict_state_impl (eva -> eva_ml , eva -> eva_idx ,
4082
+ eva -> eva_marker , eva -> eva_spa , eva -> eva_bytes );
4083
+ }
4084
+
4085
+ static void
4086
+ arc_evict_thread_init (void )
4087
+ {
4088
+ if (zfs_arc_evict_threads == 0 ) {
4089
+ /*
4090
+ * Compute number of threads we want to use for eviction.
4091
+ *
4092
+ * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
4093
+ * default max of 16 threads at ~256 CPUs.
4094
+ *
4095
+ * However, that formula goes to two threads at 4 CPUs, which
4096
+ * is still rather to low to be really useful, so we just go
4097
+ * with 1 thread at fewer than 6 cores.
4098
+ */
4099
+ if (max_ncpus < 6 )
4100
+ zfs_arc_evict_threads = 1 ;
4101
+ else
4102
+ zfs_arc_evict_threads =
4103
+ (highbit64 (max_ncpus ) - 1 ) + max_ncpus / 32 ;
4104
+ } else if (zfs_arc_evict_threads > max_ncpus )
4105
+ zfs_arc_evict_threads = max_ncpus ;
4106
+
4107
+ if (zfs_arc_evict_threads > 1 ) {
4108
+ arc_evict_taskq = taskq_create ("arc_evict" ,
4109
+ zfs_arc_evict_threads , defclsyspri , 0 , INT_MAX ,
4110
+ TASKQ_PREPOPULATE );
4111
+ arc_evict_arg = kmem_zalloc (
4112
+ sizeof (evict_arg_t ) * zfs_arc_evict_threads , KM_SLEEP );
4113
+ }
4114
+ }
4115
+
4116
/*
 * The minimum number of bytes we can evict at once is a block size.
 * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
 * We use this value to compute a scaling factor for the eviction tasks,
 * i.e. how many tasks are worth dispatching for a given byte target.
 */
#define	MIN_EVICT_SIZE	(SPA_MAXBLOCKSIZE)
4122
+
4051
4123
/*
4052
4124
* Evict buffers from the given arc state, until we've removed the
4053
4125
* specified number of bytes. Move the removed buffers to the
@@ -4069,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4069
4141
multilist_t * ml = & state -> arcs_list [type ];
4070
4142
int num_sublists ;
4071
4143
arc_buf_hdr_t * * markers ;
4144
+ evict_arg_t * eva = NULL ;
4072
4145
4073
4146
num_sublists = multilist_get_num_sublists (ml );
4074
4147
4148
+ boolean_t use_evcttq = zfs_arc_evict_threads > 1 ;
4149
+
4075
4150
/*
4076
4151
* If we've tried to evict from each sublist, made some
4077
4152
* progress, but still have not hit the target number of bytes
@@ -4093,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4093
4168
multilist_sublist_unlock (mls );
4094
4169
}
4095
4170
4171
+ if (use_evcttq ) {
4172
+ if (zthr_iscurthread (arc_evict_zthr ))
4173
+ eva = arc_evict_arg ;
4174
+ else
4175
+ eva = kmem_alloc (sizeof (evict_arg_t ) *
4176
+ zfs_arc_evict_threads , KM_NOSLEEP );
4177
+ if (eva ) {
4178
+ for (int i = 0 ; i < zfs_arc_evict_threads ; i ++ ) {
4179
+ taskq_init_ent (& eva [i ].eva_tqent );
4180
+ eva [i ].eva_ml = ml ;
4181
+ eva [i ].eva_spa = spa ;
4182
+ }
4183
+ } else {
4184
+ /*
4185
+ * Fall back to the regular single evict if it is not
4186
+ * possible to allocate memory for the taskq entries.
4187
+ */
4188
+ use_evcttq = B_FALSE ;
4189
+ }
4190
+ }
4191
+
4192
+ /*
4193
+ * Start eviction using a randomly selected sublist, this is to try and
4194
+ * evenly balance eviction across all sublists. Always starting at the
4195
+ * same sublist (e.g. index 0) would cause evictions to favor certain
4196
+ * sublists over others.
4197
+ */
4198
+ uint64_t scan_evicted = 0 ;
4199
+ int sublists_left = num_sublists ;
4200
+ int sublist_idx = multilist_get_random_index (ml );
4201
+
4096
4202
/*
4097
4203
* While we haven't hit our target number of bytes to evict, or
4098
4204
* we're evicting all available buffers.
4099
4205
*/
4100
4206
while (total_evicted < bytes ) {
4101
- int sublist_idx = multilist_get_random_index ( ml ) ;
4102
- uint64_t scan_evicted = 0 ;
4207
+ uint64_t evict = MIN_EVICT_SIZE ;
4208
+ uint_t ntasks = zfs_arc_evict_threads ;
4103
4209
4104
- /*
4105
- * Start eviction using a randomly selected sublist,
4106
- * this is to try and evenly balance eviction across all
4107
- * sublists. Always starting at the same sublist
4108
- * (e.g. index 0) would cause evictions to favor certain
4109
- * sublists over others.
4110
- */
4111
- for (int i = 0 ; i < num_sublists ; i ++ ) {
4210
+ if (use_evcttq ) {
4211
+ if (sublists_left < ntasks )
4212
+ ntasks = sublists_left ;
4213
+
4214
+ if (ntasks < 2 )
4215
+ use_evcttq = B_FALSE ;
4216
+ }
4217
+
4218
+ if (use_evcttq ) {
4219
+ uint64_t left = bytes - total_evicted ;
4220
+
4221
+ if (bytes == ARC_EVICT_ALL ) {
4222
+ evict = bytes ;
4223
+ } else if (left > ntasks * MIN_EVICT_SIZE ) {
4224
+ evict = DIV_ROUND_UP (left , ntasks );
4225
+ } else {
4226
+ ntasks = DIV_ROUND_UP (left , MIN_EVICT_SIZE );
4227
+ if (ntasks == 1 )
4228
+ use_evcttq = B_FALSE ;
4229
+ }
4230
+ }
4231
+
4232
+ for (int i = 0 ; sublists_left > 0 ; i ++ , sublist_idx ++ ,
4233
+ sublists_left -- ) {
4112
4234
uint64_t bytes_remaining ;
4113
4235
uint64_t bytes_evicted ;
4114
4236
4237
+ /* we've reached the end, wrap to the beginning */
4238
+ if (sublist_idx >= num_sublists )
4239
+ sublist_idx = 0 ;
4240
+
4241
+ if (use_evcttq ) {
4242
+ if (i == ntasks )
4243
+ break ;
4244
+
4245
+ eva [i ].eva_marker = markers [sublist_idx ];
4246
+ eva [i ].eva_idx = sublist_idx ;
4247
+ eva [i ].eva_bytes = evict ;
4248
+
4249
+ taskq_dispatch_ent (arc_evict_taskq ,
4250
+ arc_evict_task , & eva [i ], 0 ,
4251
+ & eva [i ].eva_tqent );
4252
+
4253
+ continue ;
4254
+ }
4255
+
4115
4256
if (total_evicted < bytes )
4116
4257
bytes_remaining = bytes - total_evicted ;
4117
4258
else
@@ -4122,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4122
4263
4123
4264
scan_evicted += bytes_evicted ;
4124
4265
total_evicted += bytes_evicted ;
4266
+ }
4125
4267
4126
- /* we've reached the end, wrap to the beginning */
4127
- if (++ sublist_idx >= num_sublists )
4128
- sublist_idx = 0 ;
4268
+ if (use_evcttq ) {
4269
+ taskq_wait (arc_evict_taskq );
4270
+
4271
+ for (int i = 0 ; i < ntasks ; i ++ ) {
4272
+ scan_evicted += eva [i ].eva_evicted ;
4273
+ total_evicted += eva [i ].eva_evicted ;
4274
+ }
4129
4275
}
4130
4276
4131
4277
/*
4132
- * If we didn't evict anything during this scan , we have
4133
- * no reason to believe we'll evict more during another
4278
+ * If we scanned all sublists and didn't evict anything, we
4279
+ * have no reason to believe we'll evict more during another
4134
4280
* scan, so break the loop.
4135
4281
*/
4136
- if (scan_evicted == 0 ) {
4282
+ if (scan_evicted == 0 && sublists_left == 0 ) {
4137
4283
/* This isn't possible, let's make that obvious */
4138
4284
ASSERT3S (bytes , != , 0 );
4139
4285
@@ -4150,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4150
4296
4151
4297
break ;
4152
4298
}
4299
+
4300
+ /*
4301
+ * If we scanned all sublists but still have more to do,
4302
+ * reset the counts so we can go around again.
4303
+ */
4304
+ if (sublists_left == 0 ) {
4305
+ sublists_left = num_sublists ;
4306
+ sublist_idx = multilist_get_random_index (ml );
4307
+ scan_evicted = 0 ;
4308
+
4309
+ /*
4310
+ * Since we're about to reconsider all sublists,
4311
+ * re-enable use of the evict threads if available.
4312
+ */
4313
+ use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL );
4314
+ }
4153
4315
}
4154
4316
4317
+ if (eva != NULL && eva != arc_evict_arg )
4318
+ kmem_free (eva , sizeof (evict_arg_t ) * zfs_arc_evict_threads );
4319
+
4155
4320
for (int i = 0 ; i < num_sublists ; i ++ ) {
4156
4321
multilist_sublist_t * mls = multilist_sublist_lock_idx (ml , i );
4157
4322
multilist_sublist_remove (mls , markers [i ]);
4158
4323
multilist_sublist_unlock (mls );
4159
4324
}
4325
+
4160
4326
if (markers != arc_state_evict_markers )
4161
4327
arc_state_free_markers (markers , num_sublists );
4162
4328
@@ -7795,6 +7961,7 @@ arc_set_limits(uint64_t allmem)
7795
7961
/* How to set default max varies by platform. */
7796
7962
arc_c_max = arc_default_max (arc_c_min , allmem );
7797
7963
}
7964
+
7798
7965
void
7799
7966
arc_init (void )
7800
7967
{
@@ -7872,6 +8039,8 @@ arc_init(void)
7872
8039
arc_prune_taskq = taskq_create ("arc_prune" , zfs_arc_prune_task_threads ,
7873
8040
defclsyspri , 100 , INT_MAX , TASKQ_PREPOPULATE | TASKQ_DYNAMIC );
7874
8041
8042
+ arc_evict_thread_init ();
8043
+
7875
8044
list_create (& arc_async_flush_list , sizeof (arc_async_flush_t ),
7876
8045
offsetof(arc_async_flush_t , af_node ));
7877
8046
mutex_init (& arc_async_flush_lock , NULL , MUTEX_DEFAULT , NULL );
@@ -7955,6 +8124,13 @@ arc_fini(void)
7955
8124
arc_ksp = NULL ;
7956
8125
}
7957
8126
8127
+ if (arc_evict_taskq != NULL ) {
8128
+ taskq_wait (arc_evict_taskq );
8129
+ taskq_destroy (arc_evict_taskq );
8130
+ kmem_free (arc_evict_arg ,
8131
+ sizeof (evict_arg_t ) * zfs_arc_evict_threads );
8132
+ }
8133
+
7958
8134
taskq_wait (arc_prune_taskq );
7959
8135
taskq_destroy (arc_prune_taskq );
7960
8136
@@ -11100,3 +11276,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
11100
11276
11101
11277
ZFS_MODULE_PARAM (zfs_arc , zfs_arc_ , prune_task_threads , INT , ZMOD_RW ,
11102
11278
"Number of arc_prune threads" );
11279
+
11280
+ ZFS_MODULE_PARAM (zfs_arc , zfs_arc_ , evict_threads , UINT , ZMOD_RD ,
11281
+ "Number of threads to use for ARC eviction." );
0 commit comments