Skip to content

Commit 687ae94

Browse files
committed
ZAP: Massively switch to _by_dnode() interfaces
Before this change, ZAP called dnode_hold() for almost every block access, which was clearly visible in the profiler under heavy load, such as BRT. This patch makes ZAP always hold the dnode reference between zap_lockdir() and zap_unlockdir(), which avoids most dnode operations between those calls. It also adds several new _by_dnode() APIs to ZAP and uses them in the BRT code, and adds a dmu_prefetch_by_dnode() variant and uses it in the ZAP code. Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc.
1 parent c28f94f commit 687ae94

File tree

7 files changed

+198
-149
lines changed

7 files changed

+198
-149
lines changed

include/sys/dmu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,8 @@ extern uint_t zfs_max_recordsize;
902902
*/
903903
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
904904
uint64_t len, enum zio_priority pri);
905+
void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
906+
uint64_t len, enum zio_priority pri);
905907
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
906908

907909
typedef struct dmu_object_info {

include/sys/zap.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key,
253253
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
254254
int key_numints, int integer_size, uint64_t num_integers,
255255
const void *val, dmu_tx_t *tx);
256+
int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
257+
int key_numints, int integer_size, uint64_t num_integers,
258+
const void *val, dmu_tx_t *tx);
256259

257260
/*
258261
* Set the attribute with the given name to the given value. If an
@@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
267270
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
268271
int key_numints,
269272
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
273+
int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
274+
int key_numints,
275+
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
270276

271277
/*
272278
* Get the length (in integers) and the integer size of the specified
@@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
292298
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
293299
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
294300
int key_numints, dmu_tx_t *tx);
301+
int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
302+
int key_numints, dmu_tx_t *tx);
295303

296304
/*
297305
* Returns (in *count) the number of attributes in the specified zap

include/sys/zap_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ typedef struct zap {
145145
dmu_buf_user_t zap_dbu;
146146
objset_t *zap_objset;
147147
uint64_t zap_object;
148+
dnode_t *zap_dnode;
148149
struct dmu_buf *zap_dbuf;
149150
krwlock_t zap_rwlock;
150151
boolean_t zap_ismicro;

module/zfs/brt.c

Lines changed: 14 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
955955
if (mos_entries == 0)
956956
return;
957957

958-
BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
959-
(u_longlong_t)mos_entries, (u_longlong_t)vdevid,
960-
(u_longlong_t)bre->bre_offset);
961958
(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
962959
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
963960
}
964961

965-
static int
966-
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
967-
{
968-
int error;
969-
970-
ASSERT(RW_LOCK_HELD(&brt->brt_lock));
971-
ASSERT(brtvd->bv_mos_entries != 0);
972-
ASSERT(bre->bre_refcount > 0);
973-
974-
error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
975-
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
976-
sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
977-
BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
978-
"error=%d", (u_longlong_t)brtvd->bv_mos_entries,
979-
(u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
980-
(u_longlong_t)bre->bre_refcount, error);
981-
982-
return (error);
983-
}
984-
985-
static int
986-
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
987-
{
988-
int error;
989-
990-
ASSERT(RW_LOCK_HELD(&brt->brt_lock));
991-
ASSERT(brtvd->bv_mos_entries != 0);
992-
ASSERT0(bre->bre_refcount);
993-
994-
error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
995-
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
996-
BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
997-
"error=%d", (u_longlong_t)brtvd->bv_mos_entries,
998-
(u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
999-
(u_longlong_t)bre->bre_refcount, error);
1000-
1001-
return (error);
1002-
}
1003-
1004962
/*
1005963
* Return TRUE if we _can_ have BRT entry for this bp. It might be false
1006964
* positive, but gives us quick answer if we should look into BRT, which
@@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
15591517
}
15601518

15611519
static void
1562-
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
1520+
brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
15631521
{
1564-
1565-
ASSERT(RW_WRITE_HELD(&brt->brt_lock));
1566-
ASSERT(brtvd->bv_mos_entries != 0);
1567-
15681522
if (bre->bre_refcount == 0) {
1569-
int error;
1570-
1571-
error = brt_entry_remove(brt, brtvd, bre, tx);
1572-
ASSERT(error == 0 || error == ENOENT);
1573-
/*
1574-
* If error == ENOENT then zfs_clone_range() was done from a
1575-
* removed (but opened) file (open(), unlink()).
1576-
*/
1577-
ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
1523+
int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
1524+
BRT_KEY_WORDS, tx);
1525+
VERIFY(error == 0 || error == ENOENT);
15781526
} else {
1579-
VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
1527+
VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
1528+
BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
1529+
&bre->bre_refcount, tx));
15801530
}
15811531
}
15821532

@@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
15851535
{
15861536
brt_vdev_t *brtvd;
15871537
brt_entry_t *bre;
1538+
dnode_t *dn;
15881539
uint64_t vdevid;
15891540
void *c;
15901541

@@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
16081559
if (brtvd->bv_mos_brtvdev == 0)
16091560
brt_vdev_create(brt, brtvd, tx);
16101561

1562+
VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
1563+
FTAG, &dn));
1564+
16111565
c = NULL;
16121566
while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
1613-
brt_sync_entry(brt, brtvd, bre, tx);
1567+
brt_sync_entry(dn, bre, tx);
16141568
brt_entry_free(bre);
16151569
ASSERT(brt->brt_nentries > 0);
16161570
brt->brt_nentries--;
16171571
}
16181572

1573+
dnode_rele(dn, FTAG);
1574+
16191575
brt_vdev_sync(brt, brtvd, tx);
16201576

16211577
if (brtvd->bv_totalcount == 0)

module/zfs/dmu.c

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
712712
uint64_t len, zio_priority_t pri)
713713
{
714714
dnode_t *dn;
715-
int64_t level2 = level;
716-
uint64_t start, end, start2, end2;
717715

718716
if (dmu_prefetch_max == 0 || len == 0) {
719717
dmu_prefetch_dnode(os, object, pri);
@@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
723721
if (dnode_hold(os, object, FTAG, &dn) != 0)
724722
return;
725723

724+
dmu_prefetch_by_dnode(dn, level, offset, len, pri);
725+
726+
dnode_rele(dn, FTAG);
727+
}
728+
729+
void
730+
dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
731+
uint64_t len, zio_priority_t pri)
732+
{
733+
int64_t level2 = level;
734+
uint64_t start, end, start2, end2;
735+
726736
/*
727737
* Depending on len we may do two prefetches: blocks [start, end) at
728738
* level, and following blocks [start2, end2) at higher level2.
@@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
762772
for (uint64_t i = start2; i < end2; i++)
763773
dbuf_prefetch(dn, level2, i, pri, 0);
764774
rw_exit(&dn->dn_struct_rwlock);
765-
766-
dnode_rele(dn, FTAG);
767775
}
768776

769777
/*
@@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
25632571
EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
25642572
EXPORT_SYMBOL(dmu_buf_rele_array);
25652573
EXPORT_SYMBOL(dmu_prefetch);
2574+
EXPORT_SYMBOL(dmu_prefetch_by_dnode);
2575+
EXPORT_SYMBOL(dmu_prefetch_dnode);
25662576
EXPORT_SYMBOL(dmu_free_range);
25672577
EXPORT_SYMBOL(dmu_free_long_range);
25682578
EXPORT_SYMBOL(dmu_free_long_object);

module/zfs/zap.c

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
133133
* set up block 1 - the first leaf
134134
*/
135135
dmu_buf_t *db;
136-
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
136+
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
137137
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
138138
dmu_buf_will_dirty(db, tx);
139139

@@ -182,7 +182,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
182182
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
183183
tbl->zt_nextblk = newblk;
184184
ASSERT0(tbl->zt_blks_copied);
185-
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
185+
dmu_prefetch_by_dnode(zap->zap_dnode, 0,
186186
tbl->zt_blk << bs, tbl->zt_numblks << bs,
187187
ZIO_PRIORITY_SYNC_READ);
188188
}
@@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
193193

194194
uint64_t b = tbl->zt_blks_copied;
195195
dmu_buf_t *db_old;
196-
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
196+
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
197197
(tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
198198
if (err != 0)
199199
return (err);
200200

201201
/* first half of entries in old[b] go to new[2*b+0] */
202202
dmu_buf_t *db_new;
203-
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
203+
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
204204
(newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
205205
dmu_buf_will_dirty(db_new, tx);
206206
transfer_func(db_old->db_data, db_new->db_data, hepb);
207207
dmu_buf_rele(db_new, FTAG);
208208

209209
/* second half of entries in old[b] go to new[2*b+1] */
210-
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
210+
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
211211
(newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
212212
dmu_buf_will_dirty(db_new, tx);
213213
transfer_func((uint64_t *)db_old->db_data + hepb,
@@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
255255
uint64_t off = idx & ((1<<(bs-3))-1);
256256

257257
dmu_buf_t *db;
258-
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
258+
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
259259
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
260260
if (err != 0)
261261
return (err);
@@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
267267
uint64_t off2 = idx2 & ((1<<(bs-3))-1);
268268
dmu_buf_t *db2;
269269

270-
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
270+
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
271271
(tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
272272
DMU_READ_NO_PREFETCH);
273273
if (err != 0) {
@@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
296296
uint64_t blk = idx >> (bs-3);
297297
uint64_t off = idx & ((1<<(bs-3))-1);
298298

299-
/*
300-
* Note: this is equivalent to dmu_buf_hold(), but we use
301-
* _dnode_enter / _by_dnode because it's faster because we don't
302-
* have to hold the dnode.
303-
*/
304-
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
305299
dmu_buf_t *db;
306-
int err = dmu_buf_hold_by_dnode(dn,
300+
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
307301
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
308-
dmu_buf_dnode_exit(zap->zap_dbuf);
309302
if (err != 0)
310303
return (err);
311304
*valp = ((uint64_t *)db->db_data)[off];
@@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
319312
*/
320313
blk = (idx*2) >> (bs-3);
321314

322-
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
323-
err = dmu_buf_hold_by_dnode(dn,
315+
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
324316
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
325317
DMU_READ_NO_PREFETCH);
326-
dmu_buf_dnode_exit(zap->zap_dbuf);
327318
if (err == 0)
328319
dmu_buf_rele(db, FTAG);
329320
}
@@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
368359

369360
uint64_t newblk = zap_allocate_blocks(zap, 1);
370361
dmu_buf_t *db_new;
371-
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
362+
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
372363
newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
373364
DMU_READ_NO_PREFETCH);
374365
if (err != 0)
@@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
433424
l->l_blkid = zap_allocate_blocks(zap, 1);
434425
l->l_dbuf = NULL;
435426

436-
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
427+
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
437428
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
438429
DMU_READ_NO_PREFETCH));
439430
dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
@@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
533524
return (SET_ERROR(ENOENT));
534525

535526
int bs = FZAP_BLOCK_SHIFT(zap);
536-
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
537-
int err = dmu_buf_hold_by_dnode(dn,
527+
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
538528
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
539-
dmu_buf_dnode_exit(zap->zap_dbuf);
540529
if (err != 0)
541530
return (err);
542531

@@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn)
985974
if (zap_idx_to_blk(zap, idx, &blk) != 0)
986975
return;
987976
int bs = FZAP_BLOCK_SHIFT(zap);
988-
dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
977+
dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
989978
ZIO_PRIORITY_SYNC_READ);
990979
}
991980

@@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
12281217
*/
12291218
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
12301219
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
1231-
dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
1220+
dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
12321221
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
12331222
ZIO_PRIORITY_ASYNC_READ);
12341223
}
@@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
13561345
zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
13571346
1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
13581347
} else {
1359-
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
1348+
dmu_prefetch_by_dnode(zap->zap_dnode, 0,
13601349
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
13611350
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
13621351
ZIO_PRIORITY_SYNC_READ);
@@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
13661355
dmu_buf_t *db;
13671356
int err;
13681357

1369-
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
1358+
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
13701359
(zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
13711360
FTAG, &db, DMU_READ_NO_PREFETCH);
13721361
if (err == 0) {

0 commit comments

Comments
 (0)