Skip to content

Commit b996a50

Browse files
Alexander Stetsenko, allanjude, and 0mp
authored and committed
Implement parallel dbuf eviction
In the previous code, dbuf_evict_thread() would call dbuf_evict_one() in a loop while dbuf_cache_above_lowater(). dbuf_evict_one() would select a random sublist from the dbuf cache, then walk it from the tail forward, attempting to acquire the lock on each object until it succeeded, then evict that object and return. As the name suggests, it would evict only a single object from the cache. However, evicting one object is not likely to bring us below the desired low water mark, so dbuf_evict_one() will be called again, where it will loop over all of the same busy objects again, until it finds one it can evict. This has been replaced with dbuf_evict_many(), which takes a specific sublist as a parameter, as well as a desired amount of data to evict. It then walks the sublist from the tail forward, evicting what it can until the number of bytes evicted satisfies the input parameter or the head of the sublist is reached. The dbuf_evict_thread now runs in parallel as well, allowing it to keep up with demand more easily. For the dbuf cache, if the single thread was not able to keep up, ZFS would shift the work of evicting some items to each incoming I/O thread. While that is still the case, it should be seen much less often now that dbuf_evict is more efficient and no longer bottlenecked to a single thread. Sponsored-by: Expensify, Inc. Sponsored-by: Klara, Inc. Co-authored-by: Allan Jude <[email protected]> Co-authored-by: Mateusz Piotrowski <[email protected]> Signed-off-by: Alexander Stetsenko <[email protected]> Signed-off-by: Allan Jude <[email protected]> Signed-off-by: Mateusz Piotrowski <[email protected]>
1 parent 919bc4d commit b996a50

File tree

2 files changed

+195
-15
lines changed

2 files changed

+195
-15
lines changed

man/man4/zfs.4

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
.\" own identifying information:
1717
.\" Portions Copyright [yyyy] [name of copyright owner]
1818
.\"
19-
.\" Copyright (c) 2024, Klara, Inc.
20-
.\"
2119
.Dd November 1, 2024
2220
.Dt ZFS 4
2321
.Os
@@ -75,6 +73,26 @@ When set to
7573
.Sy 0
7674
the array is dynamically sized based on total system memory.
7775
.
76+
.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq int
77+
Controls the number of dbuf eviction threads to be used.
78+
.Pp
79+
When set to 0, ZFS will compute the number of required eviction threads
80+
depending on the number of CPU cores (ncpu_max).
81+
The minimum number of threads is 1 and applies to systems from 1 to 5 CPU cores.
82+
Systems with 6 CPU cores get 2 eviction threads.
83+
ZFS on systems larger than that uses log2 of the CPU count
84+
plus one for each 64 CPUs.
85+
This way the number of eviction threads scales up more on high CPU counts.
86+
Currently, ZFS will not scale automatically beyond 16 threads.
87+
.Pp
88+
When set to 1, the parallel dbuf eviction is disabled.
89+
Only one thread will be used to evict dbufs.
90+
.Pp
91+
When set to a value greater than 1, the value will be used as an exact number
92+
of eviction threads.
93+
If changed live, it will be limited by number of threads allocated on module
94+
load.
95+
.
7896
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
7997
dnode slots allocated in a single operation as a power of 2.
8098
The default value minimizes lock contention for the bulk operation performed.

module/zfs/dbuf.c

Lines changed: 175 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
183183
static kmem_cache_t *dbuf_kmem_cache;
184184
kmem_cache_t *dbuf_dirty_kmem_cache;
185185
static taskq_t *dbu_evict_taskq;
186+
static taskq_t *dbuf_evict_taskq;
186187

187188
static kthread_t *dbuf_cache_evict_thread;
188189
static kmutex_t dbuf_evict_lock;
@@ -237,6 +238,24 @@ static uint_t dbuf_metadata_cache_shift = 6;
237238
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
238239
static uint_t dbuf_mutex_cache_shift = 0;
239240

241+
/*
242+
* Controls the number of dbuf eviction threads.
243+
* Possible values:
244+
* 0 (auto) compute the number of threads using a logarithmic formula.
245+
* 1 (disabled) one thread - parallel eviction is disabled.
246+
* 2+ (manual) set the number manually, limited by dbuf_evict_threads_max.
247+
*/
248+
static uint_t dbuf_evict_threads = 1;
249+
250+
/*
251+
* The number of allocated dbuf eviction threads. This limits the maximum value
252+
* of dbuf_evict_threads.
253+
* The number is set up at module load time and depends on the initial value of
254+
* dbuf_evict_threads. If dbuf_evict_threads is set to auto, a logarithmic
255+
* function is used to compute this value. Otherwise, it is set to max_ncpus.
256+
*/
257+
static uint_t dbuf_evict_threads_max;
258+
240259
static unsigned long dbuf_cache_target_bytes(void);
241260
static unsigned long dbuf_metadata_cache_target_bytes(void);
242261

@@ -768,26 +787,47 @@ dbuf_cache_above_lowater(void)
768787
}
769788

770789
/*
771-
* Evict the oldest eligible dbuf from the dbuf cache.
790+
* Evict the oldest eligible dbufs from the dbuf cache.
791+
* Use the multilist sublist (mls) with the provided index #idx.
772792
*/
773793
static void
774-
dbuf_evict_one(void)
794+
dbuf_evict_many(uint64_t bytes, unsigned int idx)
775795
{
776-
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
796+
int64_t evicted = 0;
797+
dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
798+
marker->db_objset = NULL;
799+
800+
ASSERT3U(idx, <, multilist_get_num_sublists(
801+
&dbuf_caches[DB_DBUF_CACHE].cache));
802+
777803
multilist_sublist_t *mls = multilist_sublist_lock_idx(
778804
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
779805

780806
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
781807

782808
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
783-
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
784-
db = multilist_sublist_prev(mls, db);
785-
}
809+
multilist_sublist_insert_after(mls, db, marker);
810+
811+
while (db != NULL && evicted < bytes) {
812+
int skip = 0;
813+
while (db != NULL && (db->db_objset == NULL ||
814+
mutex_tryenter(&db->db_mtx) == 0)) {
815+
db = multilist_sublist_prev(mls, db);
816+
if (skip == 0)
817+
skip = 1;
818+
}
786819

787-
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
788-
multilist_sublist_t *, mls);
820+
if (db == NULL)
821+
break;
822+
823+
if (skip) {
824+
multilist_sublist_remove(mls, marker);
825+
multilist_sublist_insert_before(mls, db, marker);
826+
}
827+
828+
DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
829+
multilist_sublist_t *, mls);
789830

790-
if (db != NULL) {
791831
multilist_sublist_remove(mls, db);
792832
multilist_sublist_unlock(mls);
793833
uint64_t size = db->db.db_size;
@@ -803,9 +843,100 @@ dbuf_evict_one(void)
803843
db->db_caching_status = DB_NO_CACHE;
804844
dbuf_destroy(db);
805845
DBUF_STAT_BUMP(cache_total_evicts);
846+
evicted += size + usize;
847+
848+
mls = multilist_sublist_lock_idx(
849+
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
850+
db = multilist_sublist_prev(mls, marker);
851+
}
852+
853+
multilist_sublist_remove(mls, marker);
854+
multilist_sublist_unlock(mls);
855+
kmem_cache_free(dbuf_kmem_cache, marker);
856+
}
857+
858+
typedef struct evict_arg {
859+
taskq_ent_t tqe;
860+
unsigned idx;
861+
uint64_t bytes;
862+
} evict_arg_t;
863+
864+
static void
865+
dbuf_evict_task(void *arg)
866+
{
867+
evict_arg_t *eva = arg;
868+
dbuf_evict_many(eva->bytes, eva->idx);
869+
}
870+
871+
/*
872+
* The minimum number of bytes we can evict at once is a block size.
873+
* So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
874+
*/
875+
#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
876+
877+
static void
878+
dbuf_evict(void)
879+
{
880+
int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
881+
dbuf_cache_lowater_bytes());
882+
883+
if (bytes <= 0)
884+
return;
885+
886+
evict_arg_t *evarg = NULL;
887+
int num_sublists = multilist_get_num_sublists(
888+
&dbuf_caches[DB_DBUF_CACHE].cache);
889+
890+
uint_t nthreads = (dbuf_evict_taskq == NULL ? 1 : MIN(num_sublists,
891+
(dbuf_evict_threads == 0 ? dbuf_evict_threads_max :
892+
MIN(dbuf_evict_threads, dbuf_evict_threads_max))));
893+
894+
boolean_t use_evcttq = nthreads > 1;
895+
896+
897+
if (use_evcttq) {
898+
evarg = kmem_zalloc(sizeof (*evarg) * nthreads, KM_NOSLEEP);
899+
/*
900+
* Fall back to a regular single-threaded eviction.
901+
*/
902+
if (evarg == NULL)
903+
use_evcttq = B_FALSE;
904+
}
905+
906+
unsigned idx = multilist_get_random_index(
907+
&dbuf_caches[DB_DBUF_CACHE].cache);
908+
909+
if (!use_evcttq)
910+
return (dbuf_evict_many(bytes, idx));
911+
912+
/*
913+
* Go to the parallel eviction.
914+
*/
915+
uint64_t evict;
916+
uint_t ntasks;
917+
918+
if (bytes > nthreads * MIN_EVICT_SIZE) {
919+
evict = DIV_ROUND_UP(bytes, nthreads);
920+
ntasks = nthreads;
806921
} else {
807-
multilist_sublist_unlock(mls);
922+
evict = MIN_EVICT_SIZE;
923+
ntasks = DIV_ROUND_UP(bytes, MIN_EVICT_SIZE);
808924
}
925+
926+
for (unsigned i = 0; i < ntasks; i++) {
927+
evarg[i].idx = idx;
928+
evarg[i].bytes = evict;
929+
930+
taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
931+
&evarg[i], 0, &evarg[i].tqe);
932+
933+
/* wrap idx */
934+
if (++idx >= num_sublists)
935+
idx = 0;
936+
}
937+
938+
taskq_wait(dbuf_evict_taskq);
939+
kmem_free(evarg, sizeof (*evarg) * nthreads);
809940
}
810941

811942
/*
@@ -839,7 +970,7 @@ dbuf_evict_thread(void *unused)
839970
* minimize lock contention.
840971
*/
841972
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
842-
dbuf_evict_one();
973+
dbuf_evict();
843974
}
844975

845976
mutex_enter(&dbuf_evict_lock);
@@ -866,7 +997,7 @@ dbuf_evict_notify(uint64_t size)
866997
*/
867998
if (size > dbuf_cache_target_bytes()) {
868999
if (size > dbuf_cache_hiwater_bytes())
869-
dbuf_evict_one();
1000+
dbuf_evict();
8701001
cv_signal(&dbuf_evict_cv);
8711002
}
8721003
}
@@ -980,6 +1111,27 @@ dbuf_init(void)
9801111
* configuration is not required.
9811112
*/
9821113
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
1114+
if (max_ncpus > 1) {
1115+
if (dbuf_evict_threads == 0) {
1116+
/*
1117+
* Limit the maximum number of threads by 16.
1118+
* We reach the limit when max_ncpu == 256.
1119+
*/
1120+
uint_t nthreads = MIN((highbit64(max_ncpus) - 1) +
1121+
max_ncpus / 32, 16);
1122+
dbuf_evict_threads_max = max_ncpus < 4 ? 1 :
1123+
nthreads;
1124+
} else {
1125+
dbuf_evict_threads_max = max_ncpus / 2;
1126+
}
1127+
1128+
if (dbuf_evict_threads_max > 1) {
1129+
dbuf_evict_taskq = taskq_create("dbuf_evict",
1130+
dbuf_evict_threads_max,
1131+
defclsyspri, 0, INT_MAX, TASKQ_PREPOPULATE);
1132+
}
1133+
}
1134+
9831135

9841136
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
9851137
multilist_create(&dbuf_caches[dcs].cache,
@@ -1047,6 +1199,10 @@ dbuf_fini(void)
10471199
kmem_cache_destroy(dbuf_kmem_cache);
10481200
kmem_cache_destroy(dbuf_dirty_kmem_cache);
10491201
taskq_destroy(dbu_evict_taskq);
1202+
if (dbuf_evict_taskq != NULL) {
1203+
taskq_wait(dbuf_evict_taskq);
1204+
taskq_destroy(dbuf_evict_taskq);
1205+
}
10501206

10511207
mutex_enter(&dbuf_evict_lock);
10521208
dbuf_evict_thread_exit = B_TRUE;
@@ -4106,7 +4262,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
41064262
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
41074263
* ^ |
41084264
* | |
4109-
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
4265+
* +-----dbuf_destroy()<--dbuf_evict()<------------+
41104266
*
41114267
*/
41124268
void
@@ -5440,3 +5596,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
54405596

54415597
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
54425598
"Set size of dbuf cache mutex array as log2 shift.");
5599+
5600+
ZFS_MODULE_PARAM(zfs_arc, dbuf_, evict_threads, UINT, ZMOD_RW,
5601+
"Controls the number of dbuf eviction threads");
5602+
5603+
ZFS_MODULE_PARAM(zfs_arc, dbuf_, evict_threads_max, UINT, ZMOD_RD,
5604+
"The number of allocated dbuf eviction threads");

0 commit comments

Comments
 (0)