Skip to content

Commit b718c69

Browse files
amotin authored and snajpa committed
Optimize microzaps
Microzap on-disk format does not include a hash tree, expecting one to be built in RAM during mzap_open(). The built tree is linked to DMU user buffer, freed when original DMU buffer is dropped from cache. I've found that workloads accessing many large directories and having active eviction from DMU cache spend a significant amount of time building and then destroying the trees. I've also found that for each 64 byte mzap element an additional 64 byte tree element is allocated, which is a waste of memory and CPU caches. Improve memory efficiency of the hash tree by switching from AVL-tree to B-tree. It allows to save 24 bytes per element just on pointers. Save 32 bits on mze_hash by storing only upper 32 bits since lower 32 bits are always zero for microzaps. Save 16 bits on mze_chunkid, since microzap can never have so many elements. Respectively with the 16 bits there can be no more than 16 bits of collision differentiators. As a result, struct mzap_ent now drops from 48 (rounded to 64) to 8 bytes. Tune B-trees for small data. Reduce BTREE_CORE_ELEMS from 128 to 126 to allow struct zfs_btree_core in case of 8 byte elements to pack into 2KB instead of 4KB. Aside from the microzaps it should also help 32bit range trees. Allow custom B-tree leaf size to reduce memmove() time. Split zap_name_alloc() into zap_name_alloc() and zap_name_init_str(). It allows to not waste time allocating/freeing memory when processing multiple names in a loop during mzap_open(). Together on a pool with 10K directories of 1800 files each and DMU cache limited to 128MB this reduces time of `find . -name zzz` by 41% from 7.63s to 4.47s, and saves additional ~30% of CPU time on the DMU cache reclamation. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Matthew Ahrens <[email protected]> Reviewed-by: Ryan Moeller <[email protected]> Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc. Closes openzfs#14039
1 parent f292230 commit b718c69

File tree

5 files changed

+189
-134
lines changed

5 files changed

+189
-134
lines changed

include/sys/btree.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ extern "C" {
6565
* them, and increased memory overhead. Increasing these values results in
6666
* higher variance in operation time, and reduces memory overhead.
6767
*/
68-
#define BTREE_CORE_ELEMS 128
68+
#define BTREE_CORE_ELEMS 126
6969
#define BTREE_LEAF_SIZE 4096
7070

7171
extern kmem_cache_t *zfs_btree_leaf_cache;
@@ -95,9 +95,6 @@ typedef struct zfs_btree_leaf {
9595
uint8_t btl_elems[];
9696
} zfs_btree_leaf_t;
9797

98-
#define BTREE_LEAF_ESIZE (BTREE_LEAF_SIZE - \
99-
offsetof(zfs_btree_leaf_t, btl_elems))
100-
10198
typedef struct zfs_btree_index {
10299
zfs_btree_hdr_t *bti_node;
103100
uint32_t bti_offset;
@@ -109,14 +106,15 @@ typedef struct zfs_btree_index {
109106
} zfs_btree_index_t;
110107

111108
typedef struct btree {
112-
zfs_btree_hdr_t *bt_root;
113-
int64_t bt_height;
109+
int (*bt_compar) (const void *, const void *);
114110
size_t bt_elem_size;
111+
size_t bt_leaf_size;
115112
uint32_t bt_leaf_cap;
113+
int32_t bt_height;
116114
uint64_t bt_num_elems;
117115
uint64_t bt_num_nodes;
116+
zfs_btree_hdr_t *bt_root;
118117
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
119-
int (*bt_compar) (const void *, const void *);
120118
} zfs_btree_t;
121119

122120
/*
@@ -132,9 +130,12 @@ void zfs_btree_fini(void);
132130
* compar - function to compare two nodes, it must return exactly: -1, 0, or +1
133131
* -1 for <, 0 for ==, and +1 for >
134132
* size - the value of sizeof(struct my_type)
133+
* lsize - custom leaf size
135134
*/
136135
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
137136
size_t);
137+
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
138+
size_t, size_t);
138139

139140
/*
140141
* Find a node with a matching value in the tree. Returns the matching node

include/sys/zap_impl.h

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,9 @@ typedef struct mzap_phys {
6666
} mzap_phys_t;
6767

6868
typedef struct mzap_ent {
69-
avl_node_t mze_node;
70-
int mze_chunkid;
71-
uint64_t mze_hash;
72-
uint32_t mze_cd; /* copy from mze_phys->mze_cd */
69+
uint32_t mze_hash;
70+
uint16_t mze_cd; /* copy from mze_phys->mze_cd */
71+
uint16_t mze_chunkid;
7372
} mzap_ent_t;
7473

7574
#define MZE_PHYS(zap, mze) \
@@ -164,7 +163,7 @@ typedef struct zap {
164163
int16_t zap_num_entries;
165164
int16_t zap_num_chunks;
166165
int16_t zap_alloc_next;
167-
avl_tree_t zap_avl;
166+
zfs_btree_t zap_tree;
168167
} zap_micro;
169168
} zap_u;
170169
} zap_t;
@@ -202,7 +201,7 @@ int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
202201
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
203202
void zap_unlockdir(zap_t *zap, void *tag);
204203
void zap_evict_sync(void *dbu);
205-
zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
204+
zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt);
206205
void zap_name_free(zap_name_t *zn);
207206
int zap_hashbits(zap_t *zap);
208207
uint32_t zap_maxcd(zap_t *zap);

module/zfs/btree.c

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
102102
(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
103103
(void) memset(leaf->btl_elems +
104104
(hdr->bth_first + hdr->bth_count) * size, 0x0f,
105-
BTREE_LEAF_ESIZE -
105+
tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
106106
(hdr->bth_first + hdr->bth_count) * size);
107107
}
108108
#endif
@@ -173,16 +173,44 @@ zfs_btree_fini(void)
173173
kmem_cache_destroy(zfs_btree_leaf_cache);
174174
}
175175

176+
static void *
177+
zfs_btree_leaf_alloc(zfs_btree_t *tree)
178+
{
179+
if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
180+
return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
181+
else
182+
return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
183+
}
184+
185+
static void
186+
zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
187+
{
188+
if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
189+
return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
190+
else
191+
return (kmem_free(ptr, tree->bt_leaf_size));
192+
}
193+
176194
void
177195
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
178196
size_t size)
179197
{
180-
ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2);
198+
zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
199+
}
200+
201+
void
202+
zfs_btree_create_custom(zfs_btree_t *tree,
203+
int (*compar) (const void *, const void *),
204+
size_t size, size_t lsize)
205+
{
206+
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
181207

182-
bzero(tree, sizeof (*tree));
208+
ASSERT3U(size, <=, esize / 2);
209+
memset(tree, 0, sizeof (*tree));
183210
tree->bt_compar = compar;
184211
tree->bt_elem_size = size;
185-
tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2);
212+
tree->bt_leaf_size = lsize;
213+
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
186214
tree->bt_height = -1;
187215
tree->bt_bulk = NULL;
188216
}
@@ -290,7 +318,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
290318

291319
zfs_btree_core_t *node = NULL;
292320
uint32_t child = 0;
293-
uint64_t depth = 0;
321+
uint32_t depth = 0;
294322

295323
/*
296324
* Iterate down the tree, finding which child the value should be in
@@ -811,8 +839,7 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
811839
move_count++;
812840
}
813841
tree->bt_num_nodes++;
814-
zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
815-
KM_SLEEP);
842+
zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
816843
zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
817844
new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
818845
new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
@@ -1078,8 +1105,7 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
10781105
ASSERT0(where->bti_offset);
10791106

10801107
tree->bt_num_nodes++;
1081-
zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
1082-
KM_SLEEP);
1108+
zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
10831109
tree->bt_root = &leaf->btl_hdr;
10841110
tree->bt_height++;
10851111

@@ -1378,7 +1404,7 @@ zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
13781404
{
13791405
tree->bt_num_nodes--;
13801406
if (!zfs_btree_is_core(node)) {
1381-
kmem_cache_free(zfs_btree_leaf_cache, node);
1407+
zfs_btree_leaf_free(tree, node);
13821408
} else {
13831409
kmem_free(node, sizeof (zfs_btree_core_t) +
13841410
BTREE_CORE_ELEMS * tree->bt_elem_size);
@@ -1991,7 +2017,7 @@ zfs_btree_verify_counts(zfs_btree_t *tree)
19912017
*/
19922018
static uint64_t
19932019
zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
1994-
int64_t height)
2020+
int32_t height)
19952021
{
19962022
if (!zfs_btree_is_core(hdr)) {
19972023
VERIFY0(height);
@@ -2117,8 +2143,10 @@ zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
21172143
zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
21182144
for (size_t i = 0; i < hdr->bth_first * size; i++)
21192145
VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
2146+
size_t esize = tree->bt_leaf_size -
2147+
offsetof(zfs_btree_leaf_t, btl_elems);
21202148
for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
2121-
i < BTREE_LEAF_ESIZE; i++)
2149+
i < esize; i++)
21222150
VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
21232151
} else {
21242152
zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;

module/zfs/zap_leaf.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
647647
* form of the name. But all callers have one of these on hand anyway,
648648
* so might as well take advantage. A cleaner but slower interface
649649
* would accept neither argument, and compute the normalized name as
650-
* needed (using zap_name_alloc(zap_entry_read_name(zeh))).
650+
* needed (using zap_name_alloc_str(zap_entry_read_name(zeh))).
651651
*/
652652
boolean_t
653653
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
@@ -668,7 +668,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
668668
continue;
669669

670670
if (zn == NULL) {
671-
zn = zap_name_alloc(zap, name, MT_NORMALIZE);
671+
zn = zap_name_alloc_str(zap, name, MT_NORMALIZE);
672672
allocdzn = B_TRUE;
673673
}
674674
if (zap_leaf_array_match(zeh->zeh_leaf, zn,

0 commit comments

Comments
 (0)