@@ -465,6 +465,19 @@ static uint_t zfs_arc_lotsfree_percent = 10;
465
465
*/
466
466
static int zfs_arc_prune_task_threads = 1 ;
467
467
468
/*
 * Number of arc_evict threads.
 * zfs_arc_evict_threads is the user-visible tunable (0 means auto-size
 * from the CPU count at arc_init() time); zfs_arc_evict_threads_live
 * holds the thread count actually in effect.
 */
static uint_t zfs_arc_evict_threads = 0;
static uint_t zfs_arc_evict_threads_live = 0;

/*
 * The minimum number of bytes we can evict at once is a block size.
 * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
 * We use this value to compute a scaling factor for the eviction tasks.
 */
#define	MIN_EVICT_PERTASK_SHIFT	(SPA_MAXBLOCKSHIFT)
468
481
/* The 7 states: */
469
482
arc_state_t ARC_anon ;
470
483
arc_state_t ARC_mru ;
@@ -3890,7 +3903,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
3890
3903
* specifically implemented to ensure this is the case
3891
3904
* (only 'marker' will be removed and re-inserted).
3892
3905
*/
3893
- multilist_sublist_move_forward (mls , marker );
3894
3906
3895
3907
/*
3896
3908
* The only case where the b_spa field should ever be
@@ -3900,11 +3912,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
3900
3912
* dsl_pool_close() and zio_inject_fault()), so we must
3901
3913
* skip any markers we see from these other threads.
3902
3914
*/
3903
- if (hdr -> b_spa == 0 )
3915
+ if (hdr -> b_spa == 0 ) {
3916
+ multilist_sublist_move_forward (mls , marker );
3904
3917
continue ;
3918
+ }
3905
3919
3906
3920
/* we're only interested in evicting buffers of a certain spa */
3907
3921
if (spa != 0 && hdr -> b_spa != spa ) {
3922
+ multilist_sublist_move_forward (mls , marker );
3908
3923
ARCSTAT_BUMP (arcstat_evict_skip );
3909
3924
continue ;
3910
3925
}
@@ -3939,6 +3954,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
3939
3954
evict_count -- ;
3940
3955
3941
3956
} else {
3957
+ multilist_sublist_move_forward (mls , marker );
3942
3958
ARCSTAT_BUMP (arcstat_mutex_miss );
3943
3959
}
3944
3960
}
@@ -4026,6 +4042,35 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
4026
4042
kmem_free (markers , sizeof (* markers ) * count );
4027
4043
}
4028
4044
4045
+ taskq_t * arc_evict_taskq ;
4046
+
4047
+ typedef struct evict_arg {
4048
+ taskq_ent_t tqe ;
4049
+ multilist_t * ml ;
4050
+ int idx ;
4051
+ arc_buf_hdr_t * marker ;
4052
+ uint64_t spa ;
4053
+ uint64_t bytes ;
4054
+ volatile uint64_t * evicted_ptr ;
4055
+ } evict_arg_t ;
4056
+
4057
+ static void
4058
+ arc_evict_task (void * arg )
4059
+ {
4060
+ evict_arg_t * eva = arg ;
4061
+ volatile uint64_t * evictedp = eva -> evicted_ptr ;
4062
+ multilist_t * ml = eva -> ml ;
4063
+ arc_buf_hdr_t * marker = eva -> marker ;
4064
+ int idx = eva -> idx ;
4065
+ uint64_t spa = eva -> spa ;
4066
+ uint64_t evict = eva -> bytes ;
4067
+ uint64_t bytes_evicted ;
4068
+
4069
+ bytes_evicted = arc_evict_state_impl (ml , idx , marker , spa , evict );
4070
+
4071
+ atomic_add_64 (evictedp , bytes_evicted );
4072
+ }
4073
+
4029
4074
/*
4030
4075
* Evict buffers from the given arc state, until we've removed the
4031
4076
* specified number of bytes. Move the removed buffers to the
@@ -4045,10 +4090,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4045
4090
{
4046
4091
uint64_t total_evicted = 0 ;
4047
4092
multilist_t * ml = & state -> arcs_list [type ];
4048
- int num_sublists ;
4049
4093
arc_buf_hdr_t * * markers ;
4094
+ unsigned num_sublists = multilist_get_num_sublists (ml );
4050
4095
4051
- num_sublists = multilist_get_num_sublists (ml );
4096
+ if (bytes == 0 )
4097
+ return (total_evicted );
4052
4098
4053
4099
/*
4054
4100
* If we've tried to evict from each sublist, made some
@@ -4071,25 +4117,108 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4071
4117
multilist_sublist_unlock (mls );
4072
4118
}
4073
4119
4120
+ evict_arg_t * evarg = kmem_alloc (sizeof (* evarg ) * num_sublists ,
4121
+ KM_SLEEP );
4074
4122
/*
4075
4123
* While we haven't hit our target number of bytes to evict, or
4076
4124
* we're evicting all available buffers.
4077
4125
*/
4078
4126
while (total_evicted < bytes ) {
4079
4127
int sublist_idx = multilist_get_random_index (ml );
4128
+ boolean_t usetskq = zfs_arc_evict_threads_live > 1 ;
4080
4129
uint64_t scan_evicted = 0 ;
4081
4130
4131
+ uint64_t left = (bytes == ARC_EVICT_ALL ? bytes :
4132
+ bytes - total_evicted );
4133
+
4134
+ /*
4135
+ * How we scale
4136
+ *
4137
+ * Example 1, # of chunks less than # of tasks.
4138
+ * We have:
4139
+ * - 4 tasks
4140
+ * - 3 chunks
4141
+ * - 3 full col
4142
+ * - 0 low cols.
4143
+ *
4144
+ * The first low col index is 3.
4145
+ * The tasks #0-#2 evict 1 chunk each.
4146
+ *
4147
+ * 0 | 1 | 2 | 3 |
4148
+ * +===+===+===+===+
4149
+ * | x | x | x | |
4150
+ * +---+---+---+---+
4151
+ *
4152
+ * Example 2, # of chunks more than # of tasks.
4153
+ * We have:
4154
+ * - 4 tasks
4155
+ * - 9 chunks
4156
+ * - 1 full col
4157
+ * - 3 low cols
4158
+ *
4159
+ * The first low col index is 1.
4160
+ * The task #0 evicts 3 chunks, the others evict 2 chunks each.
4161
+ *
4162
+ * 0 | 1 | 2 | 3 |
4163
+ * +===+===+===+===+
4164
+ * | x | x | x | x |
4165
+ * +---+---+---+---+
4166
+ * | x | x | x | x |
4167
+ * +---+---+---+---+
4168
+ * | x | | | |
4169
+ * +---+---+---+---+
4170
+ */
4171
+
4172
+ /*
4173
+ * Compute number of tasks to run (n), low col index (k)
4174
+ * and normal and low bytes per task.
4175
+ */
4176
+ uint64_t nchunks = ((left - 1 ) >> MIN_EVICT_PERTASK_SHIFT ) + 1 ;
4177
+ unsigned n = nchunks < num_sublists ? nchunks : num_sublists ;
4178
+ uint64_t fullrows = nchunks / n ;
4179
+ unsigned lastrowcols = nchunks % n ;
4180
+ unsigned k = (lastrowcols ? lastrowcols : n );
4181
+
4182
+ uint64_t bytes_pertask_low =
4183
+ fullrows << MIN_EVICT_PERTASK_SHIFT ;
4184
+ uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
4185
+ (1 << MIN_EVICT_PERTASK_SHIFT ) : 0 );
4186
+
4082
4187
/*
4083
4188
* Start eviction using a randomly selected sublist,
4084
4189
* this is to try and evenly balance eviction across all
4085
4190
* sublists. Always starting at the same sublist
4086
4191
* (e.g. index 0) would cause evictions to favor certain
4087
4192
* sublists over others.
4088
4193
*/
4089
- for (int i = 0 ; i < num_sublists ; i ++ ) {
4194
+ for (unsigned i = 0 ; i < n ; i ++ , sublist_idx ++ ) {
4090
4195
uint64_t bytes_remaining ;
4091
4196
uint64_t bytes_evicted ;
4092
4197
4198
+ /* we've reached the end, wrap to the beginning */
4199
+ if (sublist_idx >= num_sublists )
4200
+ sublist_idx = 0 ;
4201
+
4202
+ if (usetskq ) {
4203
+ uint64_t evict = i < k ? bytes_pertask :
4204
+ bytes_pertask_low ;
4205
+
4206
+ ASSERT3S (n , <=, num_sublists );
4207
+
4208
+ memset (& evarg [i ].tqe , 0 , sizeof (evarg [i ].tqe ));
4209
+ evarg [i ].ml = ml ;
4210
+ evarg [i ].marker = markers [sublist_idx ];
4211
+ evarg [i ].spa = spa ;
4212
+ evarg [i ].evicted_ptr = & scan_evicted ;
4213
+ evarg [i ].idx = sublist_idx ;
4214
+ evarg [i ].bytes = evict ;
4215
+
4216
+ taskq_dispatch_ent (arc_evict_taskq ,
4217
+ arc_evict_task ,
4218
+ & evarg [i ], 0 , & evarg [i ].tqe );
4219
+ continue ;
4220
+ }
4221
+
4093
4222
if (total_evicted < bytes )
4094
4223
bytes_remaining = bytes - total_evicted ;
4095
4224
else
@@ -4100,10 +4229,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4100
4229
4101
4230
scan_evicted += bytes_evicted ;
4102
4231
total_evicted += bytes_evicted ;
4232
+ }
4103
4233
4104
- /* we've reached the end, wrap to the beginning */
4105
- if ( ++ sublist_idx >= num_sublists )
4106
- sublist_idx = 0 ;
4234
+ if ( usetskq ) {
4235
+ taskq_wait ( arc_evict_taskq );
4236
+ total_evicted += scan_evicted ;
4107
4237
}
4108
4238
4109
4239
/*
@@ -4130,11 +4260,14 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
4130
4260
}
4131
4261
}
4132
4262
4263
+ kmem_free (evarg , sizeof (* evarg ) * num_sublists );
4264
+
4133
4265
for (int i = 0 ; i < num_sublists ; i ++ ) {
4134
4266
multilist_sublist_t * mls = multilist_sublist_lock_idx (ml , i );
4135
4267
multilist_sublist_remove (mls , markers [i ]);
4136
4268
multilist_sublist_unlock (mls );
4137
4269
}
4270
+
4138
4271
if (markers != arc_state_evict_markers )
4139
4272
arc_state_free_markers (markers , num_sublists );
4140
4273
@@ -7673,6 +7806,13 @@ arc_set_limits(uint64_t allmem)
7673
7806
/* How to set default max varies by platform. */
7674
7807
arc_c_max = arc_default_max (arc_c_min , allmem );
7675
7808
}
7809
+
7810
/*
 * Integer floor(log2(a)) for a > 1; returns 0 for a <= 1 (including
 * zero and negative inputs, which never recurse/loop).  Used by
 * arc_init() to scale the default eviction thread count with the
 * number of CPUs.  Written iteratively: the recursive form buys
 * nothing and burns a stack frame per bit.
 */
static inline size_t
arc_ilog2(int a)
{
	size_t bits = 0;

	while (a > 1) {
		a >>= 1;
		bits++;
	}
	return (bits);
}
7815
+
7676
7816
void
7677
7817
arc_init (void )
7678
7818
{
@@ -7743,12 +7883,22 @@ arc_init(void)
7743
7883
7744
7884
buf_init ();
7745
7885
7886
+ if (zfs_arc_evict_threads == 0 )
7887
+ zfs_arc_evict_threads_live = MIN (MAX (max_ncpus > 6 ? 2 : 1 ,
7888
+ arc_ilog2 (max_ncpus ) + (max_ncpus >> 6 )), 16 );
7889
+ else
7890
+ zfs_arc_evict_threads_live = zfs_arc_evict_threads ;
7891
+
7746
7892
list_create (& arc_prune_list , sizeof (arc_prune_t ),
7747
7893
offsetof(arc_prune_t , p_node ));
7748
7894
mutex_init (& arc_prune_mtx , NULL , MUTEX_DEFAULT , NULL );
7749
7895
7750
7896
arc_prune_taskq = taskq_create ("arc_prune" , zfs_arc_prune_task_threads ,
7751
7897
defclsyspri , 100 , INT_MAX , TASKQ_PREPOPULATE | TASKQ_DYNAMIC );
7898
+ arc_evict_taskq = taskq_create ("arc_evict" ,
7899
+ MIN (zfs_arc_evict_threads_live , max_ncpus ), defclsyspri ,
7900
+ MIN (zfs_arc_evict_threads_live , max_ncpus ), max_ncpus ,
7901
+ TASKQ_PREPOPULATE );
7752
7902
7753
7903
arc_ksp = kstat_create ("zfs" , 0 , "arcstats" , "misc" , KSTAT_TYPE_NAMED ,
7754
7904
sizeof (arc_stats ) / sizeof (kstat_named_t ), KSTAT_FLAG_VIRTUAL );
@@ -7823,6 +7973,9 @@ arc_fini(void)
7823
7973
arc_ksp = NULL ;
7824
7974
}
7825
7975
7976
+ taskq_wait (arc_evict_taskq );
7977
+ taskq_destroy (arc_evict_taskq );
7978
+
7826
7979
taskq_wait (arc_prune_taskq );
7827
7980
taskq_destroy (arc_prune_taskq );
7828
7981
@@ -10849,3 +11002,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
10849
11002
10850
11003
ZFS_MODULE_PARAM (zfs_arc , zfs_arc_ , prune_task_threads , INT , ZMOD_RW ,
10851
11004
"Number of arc_prune threads" );
11005
+
11006
+ ZFS_MODULE_PARAM (zfs_arc , zfs_arc_ , evict_threads , UINT , ZMOD_RW ,
11007
+ "Maximum number of arc_evict threads" );
0 commit comments