@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
 static kmem_cache_t *dbuf_kmem_cache;
 kmem_cache_t *dbuf_dirty_kmem_cache;
 static taskq_t *dbu_evict_taskq;
+static taskq_t *dbuf_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
@@ -237,6 +238,24 @@ static uint_t dbuf_metadata_cache_shift = 6;
 /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
 static uint_t dbuf_mutex_cache_shift = 0;
 
+/*
+ * Controls the number of dbuf eviction threads.
+ * Possible values:
+ *  0  (auto)     compute the number of threads using a logarithmic formula.
+ *  1  (disabled) one thread - parallel eviction is disabled.
+ *  2+ (manual)   set the number manually, limited by dbuf_evict_threads_max.
+ */
+static uint_t dbuf_evict_threads = 1;
+
+/*
+ * The number of allocated dbuf eviction threads. This limits the maximum value
+ * of dbuf_evict_threads.
+ * The number is set up at module load time and depends on the initial value of
+ * dbuf_evict_threads. If dbuf_evict_threads is set to auto, a logarithmic
+ * function is used to compute this value. Otherwise, it is set to max_ncpus.
+ */
+static uint_t dbuf_evict_threads_max;
+
 static unsigned long dbuf_cache_target_bytes(void);
 static unsigned long dbuf_metadata_cache_target_bytes(void);
 
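The two tunables above combine with the number of multilist sublists to determine how many eviction tasks are dispatched; the selection logic lives in dbuf_evict() further down in this diff. As a rough user-space illustration of that selection (a sketch, not code from the patch; effective_evict_threads() and the local MIN macro are made up for the example):

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static unsigned
effective_evict_threads(unsigned evict_threads, unsigned evict_threads_max,
    unsigned num_sublists, int have_taskq)
{
	if (!have_taskq)
		return (1);	/* no taskq was created: single-threaded */
	/* 0 means "auto": use everything allocated at module load time */
	unsigned want = (evict_threads == 0) ? evict_threads_max :
	    MIN(evict_threads, evict_threads_max);
	/* never dispatch more tasks than there are multilist sublists */
	return (MIN(num_sublists, want));
}

int
main(void)
{
	/* e.g. 8 allocated threads, 64 sublists */
	printf("%u\n", effective_evict_threads(0, 8, 64, 1));	/* auto -> 8 */
	printf("%u\n", effective_evict_threads(4, 8, 64, 1));	/* manual -> 4 */
	printf("%u\n", effective_evict_threads(32, 8, 64, 1));	/* clamped -> 8 */
	return (0);
}

With the patch default of dbuf_evict_threads = 1, the computed count is always 1, so eviction stays single-threaded until the tunable is raised or set to 0 (auto).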
@@ -768,26 +787,47 @@ dbuf_cache_above_lowater(void)
 }
 
 /*
- * Evict the oldest eligible dbuf from the dbuf cache.
+ * Evict the oldest eligible dbufs from the dbuf cache.
+ * Use the multilist sublist (mls) with the provided index #idx.
  */
 static void
-dbuf_evict_one(void)
+dbuf_evict_many(uint64_t bytes, unsigned int idx)
 {
-	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
+	int64_t evicted = 0;
+	dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+	marker->db_objset = NULL;
+
+	ASSERT3U(idx, <, multilist_get_num_sublists(
+	    &dbuf_caches[DB_DBUF_CACHE].cache));
+
 	multilist_sublist_t *mls = multilist_sublist_lock_idx(
 	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
-	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
-		db = multilist_sublist_prev(mls, db);
-	}
+	multilist_sublist_insert_after(mls, db, marker);
+
+	while (db != NULL && evicted < bytes) {
+		int skip = 0;
+		while (db != NULL && (db->db_objset == NULL ||
+		    mutex_tryenter(&db->db_mtx) == 0)) {
+			db = multilist_sublist_prev(mls, db);
+			if (skip == 0)
+				skip = 1;
+		}
 
-	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
-	    multilist_sublist_t *, mls);
+		if (db == NULL)
+			break;
+
+		if (skip) {
+			multilist_sublist_remove(mls, marker);
+			multilist_sublist_insert_before(mls, db, marker);
+		}
+
+		DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+		    multilist_sublist_t *, mls);
 
-	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
 		uint64_t size = db->db.db_size;
@@ -803,9 +843,100 @@ dbuf_evict_one(void)
 		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 		DBUF_STAT_BUMP(cache_total_evicts);
+		evicted += size + usize;
+
+		mls = multilist_sublist_lock_idx(
+		    &dbuf_caches[DB_DBUF_CACHE].cache, idx);
+		db = multilist_sublist_prev(mls, marker);
+	}
+
+	multilist_sublist_remove(mls, marker);
+	multilist_sublist_unlock(mls);
+	kmem_cache_free(dbuf_kmem_cache, marker);
+}
+
+typedef struct evict_arg {
+	taskq_ent_t tqe;
+	unsigned idx;
+	uint64_t bytes;
+} evict_arg_t;
+
+static void
+dbuf_evict_task(void *arg)
+{
+	evict_arg_t *eva = arg;
+	dbuf_evict_many(eva->bytes, eva->idx);
+}
+
+/*
+ * The minimum number of bytes we can evict at once is a block size.
+ * So, SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
+ */
+#define	MIN_EVICT_SIZE	(SPA_MAXBLOCKSIZE)
+
+static void
+dbuf_evict(void)
+{
+	int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
+	    dbuf_cache_lowater_bytes());
+
+	if (bytes <= 0)
+		return;
+
+	evict_arg_t *evarg = NULL;
+	int num_sublists = multilist_get_num_sublists(
+	    &dbuf_caches[DB_DBUF_CACHE].cache);
+
+	uint_t nthreads = (dbuf_evict_taskq == NULL ? 1 : MIN(num_sublists,
+	    (dbuf_evict_threads == 0 ? dbuf_evict_threads_max :
+	    MIN(dbuf_evict_threads, dbuf_evict_threads_max))));
+
+	boolean_t use_evcttq = nthreads > 1;
+
+
+	if (use_evcttq) {
+		evarg = kmem_zalloc(sizeof (*evarg) * nthreads, KM_NOSLEEP);
+		/*
+		 * Fall back to a regular single-threaded eviction.
+		 */
+		if (evarg == NULL)
+			use_evcttq = B_FALSE;
+	}
+
+	unsigned idx = multilist_get_random_index(
+	    &dbuf_caches[DB_DBUF_CACHE].cache);
+
+	if (!use_evcttq)
+		return (dbuf_evict_many(bytes, idx));
+
+	/*
+	 * Go to the parallel eviction.
+	 */
+	uint64_t evict;
+	uint_t ntasks;
+
+	if (bytes > nthreads * MIN_EVICT_SIZE) {
+		evict = DIV_ROUND_UP(bytes, nthreads);
+		ntasks = nthreads;
 	} else {
-		multilist_sublist_unlock(mls);
+		evict = MIN_EVICT_SIZE;
+		ntasks = DIV_ROUND_UP(bytes, MIN_EVICT_SIZE);
 	}
+
+	for (unsigned i = 0; i < ntasks; i++) {
+		evarg[i].idx = idx;
+		evarg[i].bytes = evict;
+
+		taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
+		    &evarg[i], 0, &evarg[i].tqe);
+
+		/* wrap idx */
+		if (++idx >= num_sublists)
+			idx = 0;
+	}
+
+	taskq_wait(dbuf_evict_taskq);
+	kmem_free(evarg, sizeof (*evarg) * nthreads);
 }
 
 /*
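To see how dbuf_evict() sizes the per-task quota, here is a small user-space sketch of the same arithmetic (split_evict_work() is a hypothetical helper for the illustration, and 16 MiB is used as a stand-in for SPA_MAXBLOCKSIZE):

#include <stdio.h>
#include <stdint.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define	MIN_EVICT_SIZE		(16ULL << 20)	/* stand-in for SPA_MAXBLOCKSIZE */

static void
split_evict_work(uint64_t bytes, unsigned nthreads,
    uint64_t *per_task, unsigned *ntasks)
{
	if (bytes > (uint64_t)nthreads * MIN_EVICT_SIZE) {
		/* plenty of work: give every thread an even share */
		*per_task = DIV_ROUND_UP(bytes, nthreads);
		*ntasks = nthreads;
	} else {
		/* little work: each task gets MIN_EVICT_SIZE, use fewer tasks */
		*per_task = MIN_EVICT_SIZE;
		*ntasks = (unsigned)DIV_ROUND_UP(bytes, MIN_EVICT_SIZE);
	}
}

int
main(void)
{
	uint64_t per_task;
	unsigned ntasks;

	split_evict_work(1ULL << 30, 8, &per_task, &ntasks);	/* 1 GiB backlog */
	printf("%u tasks x %llu bytes\n", ntasks, (unsigned long long)per_task);

	split_evict_work(20ULL << 20, 8, &per_task, &ntasks);	/* 20 MiB backlog */
	printf("%u tasks x %llu bytes\n", ntasks, (unsigned long long)per_task);
	return (0);
}

A 1 GiB backlog across 8 threads becomes 8 tasks of 128 MiB each, while a 20 MiB backlog dispatches only 2 tasks, so small overshoots above the low-water mark never fan out to every thread.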
@@ -839,7 +970,7 @@ dbuf_evict_thread(void *unused)
 		 * minimize lock contention.
 		 */
 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
-			dbuf_evict_one();
+			dbuf_evict();
 		}
 
 		mutex_enter(&dbuf_evict_lock);
@@ -866,7 +997,7 @@ dbuf_evict_notify(uint64_t size)
 	 */
 	if (size > dbuf_cache_target_bytes()) {
 		if (size > dbuf_cache_hiwater_bytes())
-			dbuf_evict_one();
+			dbuf_evict();
 		cv_signal(&dbuf_evict_cv);
 	}
 }
@@ -980,6 +1111,27 @@ dbuf_init(void)
 	 * configuration is not required.
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+	if (max_ncpus > 1) {
+		if (dbuf_evict_threads == 0) {
+			/*
+			 * Limit the maximum number of threads to 16.
+			 * We reach the limit when max_ncpus == 256.
+			 */
+			uint_t nthreads = MIN((highbit64(max_ncpus) - 1) +
+			    max_ncpus / 32, 16);
+			dbuf_evict_threads_max = max_ncpus < 4 ? 1 :
+			    nthreads;
+		} else {
+			dbuf_evict_threads_max = max_ncpus / 2;
+		}
+
+		if (dbuf_evict_threads_max > 1) {
+			dbuf_evict_taskq = taskq_create("dbuf_evict",
+			    dbuf_evict_threads_max,
+			    defclsyspri, 0, INT_MAX, TASKQ_PREPOPULATE);
+		}
+	}
+
 
 	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
 		multilist_create(&dbuf_caches[dcs].cache,
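The "auto" sizing above grows roughly with log2(max_ncpus) plus a small linear term and is capped at 16 threads. A standalone sketch of the formula (highbit64() is reimplemented locally for the illustration, and auto_evict_threads_max() is a hypothetical name):

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static int
highbit64(uint64_t v)
{
	int bit = 0;

	/* index of the highest set bit, counting from 1; 0 for v == 0 */
	while (v != 0) {
		bit++;
		v >>= 1;
	}
	return (bit);
}

static unsigned
auto_evict_threads_max(unsigned max_ncpus)
{
	if (max_ncpus < 4)
		return (1);
	return (MIN((unsigned)(highbit64(max_ncpus) - 1) + max_ncpus / 32, 16));
}

int
main(void)
{
	/* grows roughly with log2(ncpus) and saturates at 16 around 256 CPUs */
	unsigned cpus[] = { 2, 4, 8, 16, 32, 64, 128, 256, 512 };
	for (size_t i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++)
		printf("%3u CPUs -> %2u threads\n", cpus[i],
		    auto_evict_threads_max(cpus[i]));
	return (0);
}

At 256 CPUs the formula yields (9 - 1) + 256/32 = 16, which is where the cap mentioned in the comment comes from; below 4 CPUs a single thread is used.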
@@ -1047,6 +1199,10 @@ dbuf_fini(void)
 	kmem_cache_destroy(dbuf_kmem_cache);
 	kmem_cache_destroy(dbuf_dirty_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
+	if (dbuf_evict_taskq != NULL) {
+		taskq_wait(dbuf_evict_taskq);
+		taskq_destroy(dbuf_evict_taskq);
+	}
 
 	mutex_enter(&dbuf_evict_lock);
 	dbuf_evict_thread_exit = B_TRUE;
@@ -4106,7 +4262,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
  *	dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
  *	^						|
  *	|						|
- *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *	+-----dbuf_destroy()<--dbuf_evict()<------------+
  *
  */
 void
@@ -5440,3 +5596,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
 	"Set size of dbuf cache mutex array as log2 shift.");
+
+ZFS_MODULE_PARAM(zfs_arc, dbuf_, evict_threads, UINT, ZMOD_RW,
+	"Controls the number of dbuf eviction threads");
+
+ZFS_MODULE_PARAM(zfs_arc, dbuf_, evict_threads_max, UINT, ZMOD_RD,
+	"The number of allocated dbuf eviction threads");