Skip to content

Commit bfea60d

Browse files
behlendorfandrewc12
authored andcommitted
Reduce dbuf_find() lock contention
Holding a dbuf is a common operation which can become highly contended in dbuf_find() when acquiring the dbuf hash mutex. This is particularly true on Linux when reading/writing volumes since by default up to 32 threads from the zvol_taskq may be taking a hold of the same dbuf. This should also be observable on FreeBSD as long as there are enough processes accessing the volume concurrently. This is further aggregrated by the fact that only the block id will be unique when calculating the dbuf hash for a single volume. The objset id, object id, and level will be the same for data blocks. This has been observed to result in a somehwat less than uniform hash distribution and a longer than expected max hash chain depth (~20) on a large memory system (256 GB) using volumes. This commit improves the siutation by switching the hash mutex to an rwlock to allow concurrent lookups, and increasing DBUF_RWLOCKS from 2048 to 8192 to further reduce the odds of a hash collision. Reviewed-by: Tony Hutter <[email protected]> Reviewed-by: Alexander Motin <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes openzfs#13405
1 parent 7dc74d1 commit bfea60d

File tree

3 files changed

+18
-19
lines changed

3 files changed

+18
-19
lines changed

include/sys/dbuf.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -321,13 +321,12 @@ typedef struct dmu_buf_impl {
321321
uint8_t db_dirtycnt;
322322
} dmu_buf_impl_t;
323323

324-
/* Note: the dbuf hash table is exposed only for the mdb module */
325-
#define DBUF_MUTEXES 2048
326-
#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
324+
#define DBUF_RWLOCKS 8192
325+
#define DBUF_HASH_RWLOCK(h, idx) (&(h)->hash_rwlocks[(idx) & (DBUF_RWLOCKS-1)])
327326
typedef struct dbuf_hash_table {
328327
uint64_t hash_table_mask;
329328
dmu_buf_impl_t **hash_table;
330-
kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
329+
krwlock_t hash_rwlocks[DBUF_RWLOCKS] ____cacheline_aligned;
331330
} dbuf_hash_table_t;
332331

333332
typedef void (*dbuf_prefetch_fn)(void *, boolean_t);

module/zfs/dbuf.c

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
339339
hv = dbuf_hash(os, obj, level, blkid);
340340
idx = hv & h->hash_table_mask;
341341

342-
mutex_enter(DBUF_HASH_MUTEX(h, idx));
342+
rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER);
343343
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
344344
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
345345
mutex_enter(&db->db_mtx);
346346
if (db->db_state != DB_EVICTING) {
347-
mutex_exit(DBUF_HASH_MUTEX(h, idx));
347+
rw_exit(DBUF_HASH_RWLOCK(h, idx));
348348
return (db);
349349
}
350350
mutex_exit(&db->db_mtx);
351351
}
352352
}
353-
mutex_exit(DBUF_HASH_MUTEX(h, idx));
353+
rw_exit(DBUF_HASH_RWLOCK(h, idx));
354354
return (NULL);
355355
}
356356

@@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
393393
hv = dbuf_hash(os, obj, level, blkid);
394394
idx = hv & h->hash_table_mask;
395395

396-
mutex_enter(DBUF_HASH_MUTEX(h, idx));
396+
rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER);
397397
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
398398
dbf = dbf->db_hash_next, i++) {
399399
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
400400
mutex_enter(&dbf->db_mtx);
401401
if (dbf->db_state != DB_EVICTING) {
402-
mutex_exit(DBUF_HASH_MUTEX(h, idx));
402+
rw_exit(DBUF_HASH_RWLOCK(h, idx));
403403
return (dbf);
404404
}
405405
mutex_exit(&dbf->db_mtx);
@@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
417417
mutex_enter(&db->db_mtx);
418418
db->db_hash_next = h->hash_table[idx];
419419
h->hash_table[idx] = db;
420-
mutex_exit(DBUF_HASH_MUTEX(h, idx));
420+
rw_exit(DBUF_HASH_RWLOCK(h, idx));
421421
uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
422422
DBUF_STAT_MAX(hash_elements_max, he);
423423

@@ -474,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
474474

475475
/*
476476
* We mustn't hold db_mtx to maintain lock ordering:
477-
* DBUF_HASH_MUTEX > db_mtx.
477+
* DBUF_HASH_RWLOCK > db_mtx.
478478
*/
479479
ASSERT(zfs_refcount_is_zero(&db->db_holds));
480480
ASSERT(db->db_state == DB_EVICTING);
481481
ASSERT(!MUTEX_HELD(&db->db_mtx));
482482

483-
mutex_enter(DBUF_HASH_MUTEX(h, idx));
483+
rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER);
484484
dbp = &h->hash_table[idx];
485485
while ((dbf = *dbp) != db) {
486486
dbp = &dbf->db_hash_next;
@@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
491491
if (h->hash_table[idx] &&
492492
h->hash_table[idx]->db_hash_next == NULL)
493493
DBUF_STAT_BUMPDOWN(hash_chains);
494-
mutex_exit(DBUF_HASH_MUTEX(h, idx));
494+
rw_exit(DBUF_HASH_RWLOCK(h, idx));
495495
atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
496496
}
497497

@@ -914,8 +914,8 @@ dbuf_init(void)
914914
sizeof (dmu_buf_impl_t),
915915
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
916916

917-
for (i = 0; i < DBUF_MUTEXES; i++)
918-
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
917+
for (i = 0; i < DBUF_RWLOCKS; i++)
918+
rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL);
919919

920920
dbuf_stats_init(h);
921921

@@ -981,8 +981,8 @@ dbuf_fini(void)
981981

982982
dbuf_stats_destroy();
983983

984-
for (i = 0; i < DBUF_MUTEXES; i++)
985-
mutex_destroy(&h->hash_mutexes[i]);
984+
for (i = 0; i < DBUF_RWLOCKS; i++)
985+
rw_destroy(&h->hash_rwlocks[i]);
986986
#if defined(_KERNEL)
987987
/*
988988
* Large allocations which do not require contiguous pages

module/zfs/dbuf_stats.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
137137
if (size)
138138
buf[0] = 0;
139139

140-
mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
140+
rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER);
141141
for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
142142
/*
143143
* Returning ENOMEM will cause the data and header functions
@@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
158158

159159
mutex_exit(&db->db_mtx);
160160
}
161-
mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
161+
rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx));
162162

163163
return (error);
164164
}

0 commit comments

Comments
 (0)