@@ -464,6 +464,20 @@ static uint_t zfs_arc_lotsfree_percent = 10;
  */
 static int zfs_arc_prune_task_threads = 1;
 
+/*
+ * Number of arc_evict threads
+ */
+static uint_t zfs_arc_evict_threads = 0;
+
+/*
+ * The minimum number of bytes we can evict at once is a block size,
+ * so SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
+ * We use this value to compute a scaling factor for the eviction tasks.
+ */
+#define	MIN_EVICT_PERTASK_SHIFT	(SPA_MAXBLOCKSHIFT)
+
+static uint_t zfs_arc_evict_parallel = 0;
+
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
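
For a sense of the granularity involved: `MIN_EVICT_PERTASK_SHIFT` makes `SPA_MAXBLOCKSIZE` (16 MiB, given the current `SPA_MAXBLOCKSHIFT` of 24) the smallest unit of work handed to one eviction task. A minimal userland sketch of the resulting ceiling division, with the shift value hard-coded and the helper name `evict_chunks` made up for illustration:

```c
#include <stdint.h>
#include <stdio.h>

/* Assumed value: SPA_MAXBLOCKSHIFT is 24 (16 MiB) in current OpenZFS. */
#define	MIN_EVICT_PERTASK_SHIFT	24

/* Ceiling division: how many max-block-sized chunks cover `left` bytes. */
static uint64_t
evict_chunks(uint64_t left)
{
	return (((left - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1);
}

int
main(void)
{
	/* 1 byte up to 16 MiB is one chunk; 16 MiB plus 1 byte is two. */
	printf("%llu\n", (unsigned long long)evict_chunks(1));
	printf("%llu\n", (unsigned long long)evict_chunks(16ULL << 20));
	printf("%llu\n", (unsigned long long)evict_chunks((16ULL << 20) + 1));
	return (0);
}
```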
@@ -3885,7 +3899,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 		 * specifically implemented to ensure this is the case
 		 * (only 'marker' will be removed and re-inserted).
 		 */
-		multilist_sublist_move_forward(mls, marker);
 
 		/*
 		 * The only case where the b_spa field should ever be
@@ -3895,11 +3908,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 		 * dsl_pool_close() and zio_inject_fault()), so we must
 		 * skip any markers we see from these other threads.
 		 */
-		if (hdr->b_spa == 0)
+		if (hdr->b_spa == 0) {
+			multilist_sublist_move_forward(mls, marker);
 			continue;
+		}
 
 		/* we're only interested in evicting buffers of a certain spa */
 		if (spa != 0 && hdr->b_spa != spa) {
+			multilist_sublist_move_forward(mls, marker);
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
@@ -3934,6 +3950,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 			evict_count--;
 
 		} else {
+			multilist_sublist_move_forward(mls, marker);
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
 	}
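
The three hunks above all serve one change: the marker is no longer advanced unconditionally at the top of the loop, only on the paths where the header survives the scan (foreign marker, wrong spa, mutex miss). An evicted header is removed from the sublist, so the next lookup relative to the marker already lands on the following header; advancing the marker as well would skip one. A rough userland sketch of that cursor pattern, using a plain singly linked list rather than the real multilist API:

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for arc_buf_hdr_t on a sublist. */
struct node {
	struct node *next;
	int survives;	/* skipped by the scan (foreign marker, wrong spa) */
};

/*
 * Cursor-based scan: `cur` plays the role of the marker.  Unlinking an
 * evicted node advances the scan by itself; only surviving nodes need
 * the cursor explicitly moved past them, which is what the
 * multilist_sublist_move_forward() calls above correspond to.
 */
static int
scan(struct node **cur)
{
	int evicted = 0;

	while (*cur != NULL) {
		struct node *n = *cur;

		if (n->survives) {
			cur = &n->next;	/* move_forward() analogue */
			continue;
		}
		*cur = n->next;		/* removal alone advances the scan */
		free(n);
		evicted++;
	}
	return (evicted);
}

int
main(void)
{
	struct node *head = NULL, **tail = &head;

	for (int i = 0; i < 6; i++) {
		struct node *n = calloc(1, sizeof (*n));
		n->survives = (i % 3 == 0);	/* nodes 0 and 3 survive */
		*tail = n;
		tail = &n->next;
	}
	printf("evicted %d\n", scan(&head));	/* prints "evicted 4" */
	return (0);
}
```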
@@ -4021,6 +4038,35 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
+taskq_t *arc_evict_taskq;
+
+typedef struct evict_arg {
+	taskq_ent_t tqe;
+	multilist_t *ml;
+	int idx;
+	arc_buf_hdr_t *marker;
+	uint64_t spa;
+	uint64_t bytes;
+	volatile uint64_t *evicted_ptr;
+} evict_arg_t;
+
+static void
+arc_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	volatile uint64_t *evictedp = eva->evicted_ptr;
+	multilist_t *ml = eva->ml;
+	arc_buf_hdr_t *marker = eva->marker;
+	int idx = eva->idx;
+	uint64_t spa = eva->spa;
+	uint64_t evict = eva->bytes;
+	uint64_t bytes_evicted;
+
+	bytes_evicted = arc_evict_state_impl(ml, idx, marker, spa, evict);
+
+	atomic_add_64(evictedp, bytes_evicted);
+}
+
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
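
Each task funnels its result into the caller's accumulator through atomic_add_64(), so no lock is needed on that path. A hedged userland analogue of this fan-out/aggregate pattern using pthreads and C11 atomics (taskq is a kernel facility, so every name here is illustrative):

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define	NTASKS	4

/* Hypothetical analogue of evict_arg_t: per-task work plus a shared sink. */
struct task_arg {
	uint64_t bytes;			/* per-task eviction target */
	_Atomic uint64_t *evicted_ptr;	/* shared result accumulator */
};

static void *
evict_task(void *arg)
{
	struct task_arg *ta = arg;

	/* stand-in for arc_evict_state_impl(); pretend we evicted it all */
	atomic_fetch_add(ta->evicted_ptr, ta->bytes);
	return (NULL);
}

int
main(void)
{
	pthread_t tids[NTASKS];
	struct task_arg args[NTASKS];
	_Atomic uint64_t evicted = 0;

	for (int i = 0; i < NTASKS; i++) {
		args[i].bytes = 1 << 20;
		args[i].evicted_ptr = &evicted;
		pthread_create(&tids[i], NULL, evict_task, &args[i]);
	}
	for (int i = 0; i < NTASKS; i++)	/* taskq_wait() analogue */
		pthread_join(tids[i], NULL);
	printf("evicted %llu bytes\n", (unsigned long long)evicted);
	return (0);
}
```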
@@ -4040,10 +4086,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 {
 	uint64_t total_evicted = 0;
 	multilist_t *ml = &state->arcs_list[type];
-	int num_sublists;
 	arc_buf_hdr_t **markers;
+	unsigned num_sublists = multilist_get_num_sublists(ml);
 
-	num_sublists = multilist_get_num_sublists(ml);
+	if (bytes == 0)
+		return (total_evicted);
 
 	/*
 	 * If we've tried to evict from each sublist, made some
@@ -4066,25 +4113,107 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		multilist_sublist_unlock(mls);
 	}
 
+	evict_arg_t *evarg = kmem_alloc(sizeof (*evarg) * num_sublists,
+	    KM_SLEEP);
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
+		boolean_t usetskq = zfs_arc_evict_parallel;
 		uint64_t scan_evicted = 0;
 
+		uint64_t left = (bytes == ARC_EVICT_ALL ? bytes :
+		    bytes - total_evicted);
+
+		/*
+		 * How we scale
+		 *
+		 * Example 1, # of chunks less than # of tasks.
+		 * We have:
+		 *	- 4 tasks
+		 *	- 3 chunks
+		 *	- 3 full cols
+		 *	- 0 low cols.
+		 *
+		 * The first low col index is 3.
+		 * The tasks #0-#2 evict 1 chunk each.
+		 *
+		 *   0 |   1 |   2 |   3 |
+		 * +===+===+===+===+
+		 * | x | x | x |   |
+		 * +---+---+---+---+
+		 *
+		 * Example 2, # of chunks more than # of tasks.
+		 * We have:
+		 *	- 4 tasks
+		 *	- 9 chunks
+		 *	- 1 full col
+		 *	- 3 low cols
+		 *
+		 * The first low col index is 1.
+		 * The task #0 evicts 3 chunks, the others evict 2 chunks each.
+		 *
+		 *   0 |   1 |   2 |   3 |
+		 * +===+===+===+===+
+		 * | x | x | x | x |
+		 * +---+---+---+---+
+		 * | x | x | x | x |
+		 * +---+---+---+---+
+		 * | x |   |   |   |
+		 * +---+---+---+---+
+		 */
+
+		/*
+		 * Compute number of tasks to run (n), low col index (k)
+		 * and normal and low bytes per task.
+		 */
+		uint64_t nchunks = ((left - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1;
+		unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
+		uint64_t fullrows = nchunks / n;
+		unsigned lastrowcols = nchunks % n;
+		unsigned k = (lastrowcols ? lastrowcols : n);
+
+		uint64_t bytes_pertask_low = fullrows << MIN_EVICT_PERTASK_SHIFT;
+		uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
+		    (1 << MIN_EVICT_PERTASK_SHIFT) : 0);
+
 		/*
 		 * Start eviction using a randomly selected sublist,
 		 * this is to try and evenly balance eviction across all
 		 * sublists. Always starting at the same sublist
 		 * (e.g. index 0) would cause evictions to favor certain
 		 * sublists over others.
 		 */
-		for (int i = 0; i < num_sublists; i++) {
+		for (unsigned i = 0; i < n; i++, sublist_idx++) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
+			/* we've reached the end, wrap to the beginning */
+			if (sublist_idx >= num_sublists)
+				sublist_idx = 0;
+
+			if (usetskq) {
+				uint64_t evict = i < k ? bytes_pertask :
+				    bytes_pertask_low;
+
+				ASSERT3S(n, <=, num_sublists);
+
+				memset(&evarg[i].tqe, 0, sizeof (evarg[i].tqe));
+				evarg[i].ml = ml;
+				evarg[i].marker = markers[sublist_idx];
+				evarg[i].spa = spa;
+				evarg[i].evicted_ptr = &scan_evicted;
+				evarg[i].idx = sublist_idx;
+				evarg[i].bytes = evict;
+
+				taskq_dispatch_ent(arc_evict_taskq,
+				    arc_evict_task,
+				    &evarg[i], 0, &evarg[i].tqe);
+				continue;
+			}
+
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
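
The sizing arithmetic above can be checked in isolation. A standalone sketch (the `plan()` helper is hypothetical) that reproduces Example 1 and Example 2 from the comment, counting chunks instead of bytes:

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Mirror the task-sizing math: split `nchunks` chunks over at most
 * `ntasks` tasks so that tasks below index k get one extra chunk.
 */
static void
plan(uint64_t nchunks, unsigned ntasks)
{
	unsigned n = nchunks < ntasks ? (unsigned)nchunks : ntasks;
	uint64_t fullrows = nchunks / n;
	unsigned lastrowcols = nchunks % n;
	unsigned k = (lastrowcols ? lastrowcols : n);

	for (unsigned i = 0; i < n; i++) {
		uint64_t chunks = fullrows +
		    ((lastrowcols && i < k) ? 1 : 0);
		printf("task %u: %llu chunk(s)\n", i,
		    (unsigned long long)chunks);
	}
}

int
main(void)
{
	plan(3, 4);	/* Example 1: tasks 0-2 evict 1 chunk each */
	plan(9, 4);	/* Example 2: task 0 evicts 3, tasks 1-3 evict 2 */
	return (0);
}
```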
@@ -4095,10 +4224,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 
 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
+		}
 
-		/* we've reached the end, wrap to the beginning */
-		if (++sublist_idx >= num_sublists)
-			sublist_idx = 0;
+		if (usetskq) {
+			taskq_wait(arc_evict_taskq);
+			total_evicted += scan_evicted;
 		}
 
 		/*
@@ -4125,11 +4255,14 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
 		}
 	}
 
+	kmem_free(evarg, sizeof (*evarg) * num_sublists);
+
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
+
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);
@@ -7737,12 +7870,18 @@ arc_init(void)
 
 	buf_init();
 
+	if (zfs_arc_evict_threads == 0)
+		zfs_arc_evict_threads = MIN(16, max_ncpus >> 1);
+
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	arc_evict_taskq = taskq_create("arc_evict",
+	    MIN(zfs_arc_evict_threads, max_ncpus), defclsyspri,
+	    MIN(zfs_arc_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE);
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
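
With the default zfs_arc_evict_threads of 0, arc_init() picks half the CPUs, capped at 16. A quick userland sketch of the defaults this arithmetic yields (MIN() defined locally to match the kernel macro); note that on a single-CPU system max_ncpus >> 1 evaluates to 0, so the tunable would have to be set explicitly there:

```c
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	/* default sizing: half the CPUs, capped at 16 */
	int cpus[] = { 1, 2, 4, 8, 16, 32, 64, 256 };

	for (unsigned i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++)
		printf("%3d cpus -> %2d evict threads\n", cpus[i],
		    MIN(16, cpus[i] >> 1));
	return (0);
}
```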
@@ -7817,6 +7956,9 @@ arc_fini(void)
 		arc_ksp = NULL;
 	}
 
+	taskq_wait(arc_evict_taskq);
+	taskq_destroy(arc_evict_taskq);
+
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
@@ -10840,3 +10982,9 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_parallel, UINT, ZMOD_RW,
+	"Evict from the ARC in parallel using a taskq");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RW,
+	"Maximum number of arc_evict threads");