@@ -27,7 +27,7 @@
  * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2019, loli10K <[email protected]>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
- * Copyright (c) 2019, 2024, Klara Inc.
+ * Copyright (c) 2019, 2024, 2025, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
@@ -337,6 +337,9 @@ static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;
 
+static taskq_t *arc_evict_taskq;
+static struct evict_arg *arc_evict_arg;
+
 /*
  * Count of bytes evicted since boot.
  */
@@ -470,6 +473,18 @@ static int zfs_arc_prune_task_threads = 1;
 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
 static taskq_t *arc_flush_taskq;
 
+/*
+ * Controls the number of ARC eviction threads to dispatch sublists to.
+ *
+ * Possible values:
+ *   0 (auto) compute the number of threads using a logarithmic formula.
+ *   1 (disabled) one thread - parallel eviction is disabled.
+ *   2+ (manual) set the number manually.
+ *
+ * See arc_evict_thread_init() for how "auto" is computed.
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
@@ -4049,6 +4064,62 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
     kmem_free(markers, sizeof (*markers) * count);
 }
 
+typedef struct evict_arg {
+    taskq_ent_t eva_tqent;
+    multilist_t *eva_ml;
+    arc_buf_hdr_t *eva_marker;
+    int eva_idx;
+    uint64_t eva_spa;
+    uint64_t eva_bytes;
+    uint64_t eva_evicted;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+    evict_arg_t *eva = arg;
+    eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx,
+        eva->eva_marker, eva->eva_spa, eva->eva_bytes);
+}
+
+static void
+arc_evict_thread_init(void)
+{
+    if (zfs_arc_evict_threads == 0) {
+        /*
+         * Compute the number of threads we want to use for eviction.
+         *
+         * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
+         * default max of 16 threads at ~256 CPUs.
+         *
+         * However, that formula goes to two threads at 4 CPUs, which
+         * is still rather too low to be really useful, so we just go
+         * with 1 thread at fewer than 6 cores.
+         */
+        if (max_ncpus < 6)
+            zfs_arc_evict_threads = 1;
+        else
+            zfs_arc_evict_threads =
+                (highbit64(max_ncpus) - 1) + max_ncpus / 32;
+    } else if (zfs_arc_evict_threads > max_ncpus)
+        zfs_arc_evict_threads = max_ncpus;
+
+    if (zfs_arc_evict_threads > 1) {
+        arc_evict_taskq = taskq_create("arc_evict",
+            zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
+            TASKQ_PREPOPULATE);
+        arc_evict_arg = kmem_zalloc(
+            sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
+    }
+}
+
+/*
+ * The minimum number of bytes we can evict at once is a block size,
+ * so SPA_MAXBLOCKSIZE is a reasonable minimal value per eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define MIN_EVICT_SIZE  (SPA_MAXBLOCKSIZE)
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
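Note: the "auto" policy in arc_evict_thread_init() above is easy to sanity-check in userland. A minimal sketch, where floor_log2() stands in for the kernel's highbit64(n) - 1 and the helper names are illustrative, not part of the patch:

    #include <stdio.h>

    /* Illustrative model of the "auto" thread-count policy above. */
    static unsigned
    floor_log2(unsigned n)
    {
        unsigned b = 0;
        while (n > 1) {
            n >>= 1;
            b++;
        }
        return (b);
    }

    static unsigned
    auto_evict_threads(unsigned ncpus)
    {
        if (ncpus < 6)
            return (1);
        return (floor_log2(ncpus) + ncpus / 32);
    }

    int
    main(void)
    {
        unsigned cpus[] = { 4, 6, 32, 128, 256 };
        for (int i = 0; i < 5; i++)
            printf("%3u CPUs -> %2u eviction thread(s)\n",
                cpus[i], auto_evict_threads(cpus[i]));
        return (0);
    }

At 4 CPUs this prints 1 thread, at 32 CPUs 6, and at 256 CPUs the 16-thread default maximum mentioned in the comment.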
@@ -4070,9 +4141,12 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
     multilist_t *ml = &state->arcs_list[type];
     int num_sublists;
     arc_buf_hdr_t **markers;
+    evict_arg_t *eva = NULL;
 
     num_sublists = multilist_get_num_sublists(ml);
 
+    boolean_t use_evcttq = zfs_arc_evict_threads > 1;
+
     /*
      * If we've tried to evict from each sublist, made some
      * progress, but still have not hit the target number of bytes
@@ -4094,25 +4168,91 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
         multilist_sublist_unlock(mls);
     }
 
+    if (use_evcttq) {
+        if (zthr_iscurthread(arc_evict_zthr))
+            eva = arc_evict_arg;
+        else
+            eva = kmem_alloc(sizeof (evict_arg_t) *
+                zfs_arc_evict_threads, KM_NOSLEEP);
+        if (eva) {
+            for (int i = 0; i < zfs_arc_evict_threads; i++) {
+                taskq_init_ent(&eva[i].eva_tqent);
+                eva[i].eva_ml = ml;
+                eva[i].eva_spa = spa;
+            }
+        } else {
+            /*
+             * Fall back to the regular single evict if it is not
+             * possible to allocate memory for the taskq entries.
+             */
+            use_evcttq = B_FALSE;
+        }
+    }
+
+    /*
+     * Start eviction using a randomly selected sublist; this is to try
+     * to evenly balance eviction across all sublists. Always starting
+     * at the same sublist (e.g. index 0) would cause evictions to
+     * favor certain sublists over others.
+     */
+    uint64_t scan_evicted = 0;
+    int sublists_left = num_sublists;
+    int sublist_idx = multilist_get_random_index(ml);
+
     /*
      * While we haven't hit our target number of bytes to evict, or
      * we're evicting all available buffers.
      */
     while (total_evicted < bytes) {
-        int sublist_idx = multilist_get_random_index(ml);
-        uint64_t scan_evicted = 0;
+        uint64_t evict = MIN_EVICT_SIZE;
+        uint_t ntasks = zfs_arc_evict_threads;
 
-        /*
-         * Start eviction using a randomly selected sublist,
-         * this is to try and evenly balance eviction across all
-         * sublists. Always starting at the same sublist
-         * (e.g. index 0) would cause evictions to favor certain
-         * sublists over others.
-         */
-        for (int i = 0; i < num_sublists; i++) {
+        if (use_evcttq) {
+            if (sublists_left < ntasks)
+                ntasks = sublists_left;
+
+            if (ntasks < 2)
+                use_evcttq = B_FALSE;
+        }
+
+        if (use_evcttq) {
+            uint64_t left = bytes - total_evicted;
+
+            if (bytes == ARC_EVICT_ALL) {
+                evict = bytes;
+            } else if (left > ntasks * MIN_EVICT_SIZE) {
+                evict = DIV_ROUND_UP(left, ntasks);
+            } else {
+                ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE);
+                if (ntasks == 1)
+                    use_evcttq = B_FALSE;
+            }
+        }
+
+        for (int i = 0; sublists_left > 0; i++, sublist_idx++,
+            sublists_left--) {
             uint64_t bytes_remaining;
             uint64_t bytes_evicted;
 
+            /* we've reached the end, wrap to the beginning */
+            if (sublist_idx >= num_sublists)
+                sublist_idx = 0;
+
+            if (use_evcttq) {
+                if (i == ntasks)
+                    break;
+
+                eva[i].eva_marker = markers[sublist_idx];
+                eva[i].eva_idx = sublist_idx;
+                eva[i].eva_bytes = evict;
+
+                taskq_dispatch_ent(arc_evict_taskq,
+                    arc_evict_task, &eva[i], 0,
+                    &eva[i].eva_tqent);
+
+                continue;
+            }
+
             if (total_evicted < bytes)
                 bytes_remaining = bytes - total_evicted;
             else
@@ -4123,18 +4263,23 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
             scan_evicted += bytes_evicted;
             total_evicted += bytes_evicted;
+        }
 
-            /* we've reached the end, wrap to the beginning */
-            if (++sublist_idx >= num_sublists)
-                sublist_idx = 0;
+        if (use_evcttq) {
+            taskq_wait(arc_evict_taskq);
+
+            for (int i = 0; i < ntasks; i++) {
+                scan_evicted += eva[i].eva_evicted;
+                total_evicted += eva[i].eva_evicted;
+            }
         }
 
         /*
-         * If we didn't evict anything during this scan, we have
-         * no reason to believe we'll evict more during another
+         * If we scanned all sublists and didn't evict anything, we
+         * have no reason to believe we'll evict more during another
          * scan, so break the loop.
          */
-        if (scan_evicted == 0) {
+        if (scan_evicted == 0 && sublists_left == 0) {
             /* This isn't possible, let's make that obvious */
             ASSERT3S(bytes, !=, 0);
@@ -4151,13 +4296,33 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
             break;
         }
+
+        /*
+         * If we scanned all sublists but still have more to do,
+         * reset the counts so we can go around again.
+         */
+        if (sublists_left == 0) {
+            sublists_left = num_sublists;
+            sublist_idx = multilist_get_random_index(ml);
+            scan_evicted = 0;
+
+            /*
+             * Since we're about to reconsider all sublists,
+             * re-enable use of the evict threads if available.
+             */
+            use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
+        }
     }
 
+    if (eva != NULL && eva != arc_evict_arg)
+        kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);
+
     for (int i = 0; i < num_sublists; i++) {
         multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
         multilist_sublist_remove(mls, markers[i]);
         multilist_sublist_unlock(mls);
     }
+
     if (markers != arc_state_evict_markers)
         arc_state_free_markers(markers, num_sublists);
 
@@ -7824,6 +7989,7 @@ arc_set_limits(uint64_t allmem)
     /* How to set default max varies by platform. */
     arc_c_max = arc_default_max(arc_c_min, allmem);
 }
+
 void
 arc_init(void)
 {
@@ -7901,6 +8067,8 @@ arc_init(void)
     arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
         defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
+    arc_evict_thread_init();
+
     list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
         offsetof(arc_async_flush_t, af_node));
     mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -8001,11 +8169,20 @@ arc_fini(void)
     list_destroy(&arc_prune_list);
     mutex_destroy(&arc_prune_mtx);
 
+    if (arc_evict_taskq != NULL)
+        taskq_wait(arc_evict_taskq);
+
     (void) zthr_cancel(arc_evict_zthr);
     (void) zthr_cancel(arc_reap_zthr);
     arc_state_free_markers(arc_state_evict_markers,
         arc_state_evict_marker_count);
 
+    if (arc_evict_taskq != NULL) {
+        taskq_destroy(arc_evict_taskq);
+        kmem_free(arc_evict_arg,
+            sizeof (evict_arg_t) * zfs_arc_evict_threads);
+    }
+
     mutex_destroy(&arc_evict_lock);
     list_destroy(&arc_evict_waiters);
@@ -11129,3 +11306,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
     "Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
+    "Number of threads to use for ARC eviction.");