Implement parallel dbuf eviction #16487

Open · wants to merge 1 commit into base: master

22 changes: 20 additions & 2 deletions man/man4/zfs.4
@@ -17,8 +17,6 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.\" Copyright (c) 2024, Klara, Inc.
.\"
.Dd November 1, 2024
.Dt ZFS 4
.Os
@@ -76,6 +74,26 @@ When set to
.Sy 0
the array is dynamically sized based on total system memory.
.
.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq int
Controls the number of dbuf eviction threads to be used.
.Pp
When set to 0, ZFS computes the number of eviction threads from the number of
CPU cores (max_ncpus).
Systems with fewer than 4 CPU cores use a single eviction thread.
Larger systems use the base-2 logarithm of the CPU count plus one thread for
every 32 CPUs, so the number of eviction threads scales up on high CPU counts.
Currently, ZFS will not scale automatically beyond 16 threads.
.Pp
When set to 1, parallel dbuf eviction is disabled and only one thread is used
to evict dbufs.
.Pp
When set to a value greater than 1, the value is used as the exact number of
eviction threads.
If changed at runtime, the value is limited by the number of threads allocated
at module load (dbuf_evict_threads_max).
.
.It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
dnode slots allocated in a single operation as a power of 2.
The default value minimizes lock contention for the bulk operation performed.
197 changes: 183 additions & 14 deletions module/zfs/dbuf.c
@@ -184,6 +184,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
static kmem_cache_t *dbuf_kmem_cache;
kmem_cache_t *dbuf_dirty_kmem_cache;
static taskq_t *dbu_evict_taskq;
static taskq_t *dbuf_evict_taskq;

static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
@@ -238,6 +239,24 @@ static uint_t dbuf_metadata_cache_shift = 6;
/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
static uint_t dbuf_mutex_cache_shift = 0;

/*
* Controls the number of dbuf eviction threads.
* Possible values:
* 0 (auto) compute the number of threads using a logarithmic formula.
* 1 (disabled) one thread - parallel eviction is disabled.
* 2+ (manual) set the number manually, limited by dbuf_evict_threads_max.
*/
static uint_t dbuf_evict_threads = 0;

/*
* The number of allocated dbuf eviction threads. This limits the maximum value
* of dbuf_evict_threads.
* The number is set up at module load time and depends on the initial value of
* dbuf_evict_threads. If dbuf_evict_threads is set to auto, a logarithmic
* function is used to compute this value. Otherwise, it is set to half of max_ncpus.
*/
static uint_t dbuf_evict_threads_max;
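
For reference, the automatic sizing described in the comments above (and implemented in dbuf_init() later in this diff) can be sketched as a stand-alone C program. This is an illustration only, not part of the patch; highbit64_sketch() is a hypothetical stand-in for the kernel's highbit64(), which is assumed to return the 1-based index of the highest set bit.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for highbit64(): 1-based index of the highest set bit. */
static unsigned int
highbit64_sketch(uint64_t x)
{
	unsigned int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

/* Mirrors the auto-sizing in dbuf_init(): log2(ncpus) + ncpus / 32, capped at 16. */
static unsigned int
auto_evict_threads(unsigned int ncpus)
{
	unsigned int n;

	if (ncpus < 4)
		return (1);
	n = (highbit64_sketch(ncpus) - 1) + ncpus / 32;
	return (n > 16 ? 16 : n);
}

int
main(void)
{
	unsigned int cpus[] = { 1, 4, 8, 32, 64, 128, 256, 1024 };

	for (size_t i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++)
		printf("%4u CPUs -> %2u eviction threads\n",
		    cpus[i], auto_evict_threads(cpus[i]));
	return (0);
}

With these assumptions, 8 CPUs map to 3 threads, 64 CPUs to 8, and the 16-thread cap is reached at 256 CPUs.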

static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);

@@ -769,26 +788,47 @@ dbuf_cache_above_lowater(void)
}

/*
* Evict the oldest eligible dbuf from the dbuf cache.
* Evict the oldest eligible dbufs from the dbuf cache.
* Use the multilist sublist (mls) with the provided index #idx.
*/
static void
dbuf_evict_one(void)
dbuf_evict_many(uint64_t bytes, unsigned int idx)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
int64_t evicted = 0;
dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
marker->db_objset = NULL;

ASSERT3U(idx, <, multilist_get_num_sublists(
&dbuf_caches[DB_DBUF_CACHE].cache));

multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);

ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
}
multilist_sublist_insert_after(mls, db, marker);

while (db != NULL && evicted < bytes) {
int skip = 0;
while (db != NULL && (db->db_objset == NULL ||
mutex_tryenter(&db->db_mtx) == 0)) {
db = multilist_sublist_prev(mls, db);
if (skip == 0)
skip = 1;
}

DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);
if (db == NULL)
break;

if (skip) {
multilist_sublist_remove(mls, marker);
multilist_sublist_insert_before(mls, db, marker);
}

DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
multilist_sublist_t *, mls);

if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
uint64_t size = db->db.db_size;
@@ -804,9 +844,106 @@ dbuf_evict_one(void)
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
DBUF_STAT_BUMP(cache_total_evicts);
} else {
multilist_sublist_unlock(mls);
evicted += size + usize;

mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
db = multilist_sublist_prev(mls, marker);
}

multilist_sublist_remove(mls, marker);
multilist_sublist_unlock(mls);
kmem_cache_free(dbuf_kmem_cache, marker);
}
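
The loop above keeps its position in the sublist with a marker entry (db_objset == NULL), so the sublist lock can be dropped while a dbuf is destroyed and the walk can resume afterwards. Below is a minimal, self-contained sketch of that marker idea on a plain doubly linked list; it is an illustration under simplified assumptions (no ZFS APIs, no locking, hypothetical names), not the patch's code.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a sublist entry; a dedicated marker node holds our place. */
typedef struct node {
	struct node *prev, *next;
	int value;
} node_t;

/* Doubly linked list with head/tail sentinels. */
typedef struct {
	node_t head, tail;
} list_t;

static void
list_init(list_t *l)
{
	l->head.prev = NULL;
	l->head.next = &l->tail;
	l->tail.prev = &l->head;
	l->tail.next = NULL;
}

static void
insert_before(node_t *pos, node_t *n)
{
	n->prev = pos->prev;
	n->next = pos;
	pos->prev->next = n;
	pos->prev = n;
}

static void
remove_node(node_t *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int
main(void)
{
	list_t l;
	node_t marker = { .value = -1 };	/* position holder, like the marker dbuf */

	list_init(&l);
	for (int i = 0; i < 5; i++) {		/* list becomes 4,3,2,1,0 from head to tail */
		node_t *n = malloc(sizeof (*n));
		n->value = i;
		insert_before(l.head.next, n);	/* insert at head */
	}

	/* Walk from the tail, keeping our place with the marker across "lock drops". */
	node_t *cur = l.tail.prev;
	while (cur != &l.head) {
		insert_before(cur, &marker);	/* remember the position just before cur */
		remove_node(cur);
		/* ...the sublist lock would be dropped here while cur is destroyed... */
		printf("evicted %d\n", cur->value);
		free(cur);
		/* ...lock reacquired; resume from whatever now precedes the marker. */
		cur = marker.prev;
		remove_node(&marker);
	}
	return (0);
}

In dbuf_evict_many() the same role is played by the marker dbuf allocated at the top of the function: the sublist is unlocked while the dbuf is destroyed, and the walk resumes from multilist_sublist_prev(mls, marker) after the sublist is relocked.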

typedef struct evict_arg {
taskq_ent_t tqe;
unsigned idx;
uint64_t bytes;
} evict_arg_t;

static void
dbuf_evict_task(void *arg)
{
evict_arg_t *eva = arg;
dbuf_evict_many(eva->bytes, eva->idx);
}

/*
* The minimum number of bytes we can evict at once is a block size.
* So, SPA_MAXBLOCKSIZE is a reasonable minimum value per eviction task.
*/
#define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)

static void
dbuf_evict(void)
{
int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) -
dbuf_cache_lowater_bytes());
Member:
Here you are making every thread evict everything extra. At best you might end up evicting the entire cache.


if (bytes <= 0)
return;

evict_arg_t *evarg = NULL;
int num_sublists = multilist_get_num_sublists(
&dbuf_caches[DB_DBUF_CACHE].cache);

uint_t nthreads = (dbuf_evict_taskq == NULL ? 1 : MIN(num_sublists,
(dbuf_evict_threads == 0 ? dbuf_evict_threads_max :
MIN(dbuf_evict_threads, dbuf_evict_threads_max))));

boolean_t use_evcttq = nthreads > 1;

int sublist_idx = multilist_get_random_index(
&dbuf_caches[DB_DBUF_CACHE].cache);

uint64_t evict = MIN_EVICT_SIZE;
uint_t ntasks = nthreads;

if (use_evcttq) {
if (bytes > nthreads * MIN_EVICT_SIZE) {
evict = DIV_ROUND_UP(bytes, nthreads);
} else {
ntasks = DIV_ROUND_UP(bytes, MIN_EVICT_SIZE);
if (ntasks == 1)
use_evcttq = B_FALSE;
}
}

if (use_evcttq) {
evarg = kmem_zalloc(sizeof (*evarg) * nthreads, KM_NOSLEEP);
if (evarg) {
for (int i = 0; i < nthreads; i++)
taskq_init_ent(&evarg[i].tqe);
} else {
/*
* Fall back to a regular single-threaded eviction.
*/
use_evcttq = B_FALSE;
}
}

if (!use_evcttq)
return (dbuf_evict_many(bytes, sublist_idx));

/*
* Proceed with parallel eviction.
*/

for (int i = 0; i < ntasks; i++) {
evarg[i].idx = sublist_idx;
evarg[i].bytes = evict;

taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task,
&evarg[i], 0, &evarg[i].tqe);

/* wrap sublist_idx */
if (++sublist_idx >= num_sublists)
sublist_idx = 0;
}

taskq_wait(dbuf_evict_taskq);
kmem_free(evarg, sizeof (*evarg) * nthreads);
}
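
To make the partitioning above concrete, here is a small stand-alone sketch of the same arithmetic: the excess over the low-water mark is split into per-task quotas of at least MIN_EVICT_SIZE, and the task count never exceeds the thread count. It is an illustration only; the 16 MiB value assumed for SPA_MAXBLOCKSIZE and the helper name are not from the patch.

#include <stdio.h>
#include <stdint.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define	MIN_EVICT_SIZE		(16ULL << 20)	/* assume SPA_MAXBLOCKSIZE = 16 MiB */

/* Split `bytes` of excess dbuf cache into per-task quotas, like dbuf_evict(). */
static void
plan_eviction(uint64_t bytes, unsigned int nthreads)
{
	uint64_t evict = MIN_EVICT_SIZE;
	unsigned int ntasks = nthreads;

	if (bytes > (uint64_t)nthreads * MIN_EVICT_SIZE)
		evict = DIV_ROUND_UP(bytes, nthreads);		/* big backlog: one equal share per thread */
	else
		ntasks = DIV_ROUND_UP(bytes, MIN_EVICT_SIZE);	/* small backlog: fewer tasks */

	printf("%5llu MiB over lowater, %2u threads -> %2u tasks x %llu MiB\n",
	    (unsigned long long)(bytes >> 20), nthreads, ntasks,
	    (unsigned long long)(evict >> 20));
}

int
main(void)
{
	plan_eviction(24ULL << 20, 8);		/* 24 MiB  -> 2 tasks x 16 MiB */
	plan_eviction(1024ULL << 20, 8);	/* 1 GiB   -> 8 tasks x 128 MiB */
	plan_eviction(100ULL << 20, 4);		/* 100 MiB -> 4 tasks x 25 MiB */
	return (0);
}

In the patch itself, a computed task count of 1, or a failed allocation of the argument array, falls back to a single synchronous dbuf_evict_many() call.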

/*
@@ -840,7 +977,7 @@ dbuf_evict_thread(void *unused)
* minimize lock contention.
*/
while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
dbuf_evict_one();
dbuf_evict();
}

mutex_enter(&dbuf_evict_lock);
@@ -867,7 +1004,7 @@ dbuf_evict_notify(uint64_t size)
*/
if (size > dbuf_cache_target_bytes()) {
if (size > dbuf_cache_hiwater_bytes())
dbuf_evict_one();
dbuf_evict();
Member, commenting on lines 1005 to +1007:
Let's assume we have 10 user threads calling this. I suppose each of them will try to create its own set of tasks to evict the same full amount of extra dbuf cache using all the same CPUs. In the best case it may end up with an empty dbuf cache. I am not sure I greatly like the design of one main eviction thread calling a bunch of other taskq tasks, but each client thread doing that definitely looks weird. I think if user threads have to do evictions, they should do it directly, just doing more than one buffer at a time to be more efficient, as you have said.

Member:
Same as I complained before, you are evicting everything extra from every additional caller thread if the high-water mark is reached. But now you are making that thread queue it to the same set of taskqs as the main eviction path (which may make some sense if the main eviction thread started only one task and is now waiting for it, but I am not sure that is enough). I think we could either remove this eviction path, or, if we keep it, make it always execute synchronously without the taskq and maybe evict only one buffer, the same as it was originally here.

cv_signal(&dbuf_evict_cv);
}
}
@@ -981,6 +1118,27 @@ dbuf_init(void)
* configuration is not required.
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
if (max_ncpus > 1) {
if (dbuf_evict_threads == 0) {
/*
* Limit the maximum number of threads to 16.
* The limit is reached when max_ncpus == 256.
*/
uint_t nthreads = MIN((highbit64(max_ncpus) - 1) +
max_ncpus / 32, 16);
dbuf_evict_threads_max = max_ncpus < 4 ? 1 :
nthreads;
} else {
dbuf_evict_threads_max = max_ncpus / 2;
}

if (dbuf_evict_threads_max > 1) {
dbuf_evict_taskq = taskq_create("dbuf_evict",
dbuf_evict_threads_max,
defclsyspri, 0, INT_MAX, TASKQ_PREPOPULATE);
}
}


for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
multilist_create(&dbuf_caches[dcs].cache,
@@ -1049,6 +1207,11 @@ dbuf_fini(void)
kmem_cache_destroy(dbuf_dirty_kmem_cache);
taskq_destroy(dbu_evict_taskq);

if (dbuf_evict_taskq != NULL) {
taskq_wait(dbuf_evict_taskq);
taskq_destroy(dbuf_evict_taskq);
}

mutex_enter(&dbuf_evict_lock);
dbuf_evict_thread_exit = B_TRUE;
while (dbuf_evict_thread_exit) {
@@ -4107,7 +4270,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag)
* dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
* ^ |
* | |
* +-----dbuf_destroy()<--dbuf_evict_one()<--------+
* +-----dbuf_destroy()<--dbuf_evict()<------------+
*
*/
void
@@ -5441,3 +5604,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
"Set size of dbuf cache mutex array as log2 shift.");

ZFS_MODULE_PARAM(zfs_arc, dbuf_, evict_threads, UINT, ZMOD_RW,
"Controls the number of dbuf eviction threads");

ZFS_MODULE_PARAM(zfs_arc, dbuf_, evict_threads_max, UINT, ZMOD_RD,
"The number of allocated dbuf eviction threads");