From e24211e80806ef73e4a36cb775aed4498f302dfa Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 29 May 2019 12:54:57 -0700 Subject: [PATCH 01/18] Reduce loaded range tree memory usage Signed-off-by: Paul Dagnelie --- cmd/zdb/zdb.c | 17 +- include/sys/Makefile.am | 1 + include/sys/bitops.h | 90 ++ include/sys/btree.h | 231 ++++ include/sys/metaslab.h | 4 +- include/sys/metaslab_impl.h | 4 +- include/sys/range_tree.h | 240 +++- include/sys/spa.h | 46 +- include/sys/space_reftree.h | 2 +- include/sys/vdev.h | 4 +- include/sys/vdev_impl.h | 8 +- lib/libzpool/Makefile.am | 1 + module/zfs/Makefile.in | 1 + module/zfs/arc.c | 4 +- module/zfs/btree.c | 2110 ++++++++++++++++++++++++++++++++++ module/zfs/dnode.c | 5 +- module/zfs/dsl_scan.c | 45 +- module/zfs/metaslab.c | 591 +++++++--- module/zfs/range_tree.c | 566 +++++---- module/zfs/spa_misc.c | 9 +- module/zfs/space_map.c | 37 +- module/zfs/space_reftree.c | 11 +- module/zfs/vdev.c | 25 +- module/zfs/vdev_initialize.c | 31 +- module/zfs/vdev_raidz.c | 6 +- module/zfs/vdev_removal.c | 65 +- module/zfs/vdev_trim.c | 43 +- 27 files changed, 3629 insertions(+), 568 deletions(-) create mode 100644 include/sys/bitops.h create mode 100644 include/sys/btree.h create mode 100644 module/zfs/btree.c diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index a6368c67d4d7..9b2e4fdfd0bf 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -103,6 +103,7 @@ extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern int zfs_reconstruct_indirect_combinations_max; +extern int btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; @@ -949,7 +950,7 @@ dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ @@ -958,7 +959,7 @@ dump_metaslab_stats(metaslab_t *msp) zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", - "segments", avl_numnodes(t), "maxsize", maxbuf, + "segments", btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -4063,7 +4064,7 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) ASSERT0(range_tree_space(svr->svr_allocd_segs)); - range_tree_t *allocs = range_tree_create(NULL, NULL); + range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; @@ -6088,7 +6089,8 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; - mos_refd_objs = range_tree_create(NULL, NULL); + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { @@ -6643,6 +6645,13 @@ main(int argc, char **argv) if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; + /* + * For performance reasons, we set this tunable down. We do so before + * the arg parsing section so that the user can override this value if + * they choose. 
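+	 * (In btree.c, btree_verify_intensity defaults to 5 in ZFS_DEBUG
+	 * builds and to 0 otherwise.)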
+ */ + btree_verify_intensity = 3; + while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) { switch (c) { diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 734d5c161d20..deb2c0109639 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -11,6 +11,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/bplist.h \ $(top_srcdir)/include/sys/bpobj.h \ $(top_srcdir)/include/sys/bptree.h \ + $(top_srcdir)/include/sys/btree.h \ $(top_srcdir)/include/sys/bqueue.h \ $(top_srcdir)/include/sys/cityhash.h \ $(top_srcdir)/include/sys/dataset_kstats.h \ diff --git a/include/sys/bitops.h b/include/sys/bitops.h new file mode 100644 index 000000000000..56d52073bcc8 --- /dev/null +++ b/include/sys/bitops.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + */ + +#ifndef _SYS_BITOPS_H +#define _SYS_BITOPS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +_NOTE(CONSTCOND) } while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +_NOTE(CONSTCOND) } while (0) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +/* + * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in + * the case where val is a constant. We can't fix ASSERT because it's used as + * an expression in several places in the kernel; as a result, changing it to + * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option. 
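+ *
+ * As an illustrative (hypothetical) use of the shift/bias variants below:
+ * BF64_SET_SB(word, 0, 24, 9, 0, asize) stores asize in 512-byte units in
+ * the low 24 bits of word, and BF64_GET_SB(word, 0, 24, 9, 0) recovers it
+ * (asize must be 512-byte aligned; "word" and "asize" are placeholder names).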
+ */ +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITOPS_H */ diff --git a/include/sys/btree.h b/include/sys/btree.h new file mode 100644 index 000000000000..a8b9da931ddb --- /dev/null +++ b/include/sys/btree.h @@ -0,0 +1,231 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#ifndef _BTREE_H +#define _BTREE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * This file defines the interface for a B-Tree implementation for ZFS. The + * tree can be used to store arbitrary sortable data types with low overhead + * and good operation performance. In addition the tree intelligently + * optimizes bulk in-order insertions to improve memory use and performance. + * + * Note that for all B-Tree functions, the values returned are pointers to the + * internal copies of the data in the tree. The internal data can only be + * safely mutated if the changes cannot change the ordering of the element + * with respect to any other elements in the tree. + * + * The major drawback of the B-Tree is that any returned elements or indexes + * are only valid until a side-effectful operation occurs, since these can + * result in reallocation or relocation of data. Side effectful operations are + * defined as insertion, removal, and btree_destroy_nodes. + * + * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core + * nodes have an array of children pointing to other nodes, and an array of + * elements that act as separators between the elements of the subtrees rooted + * at its children. Leaf nodes only contain data elements, and form the bottom + * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the + * elements in the core nodes are not copies of or references to leaf node + * elements. Each element occcurs only once in the tree, no matter what kind + * of node it is in. + * + * The tree's height is the same throughout, unlike many other forms of search + * tree. Each node (except for the root) must be between 50% and 100% full of + * elements (and children) at all times. Any operation that would put the node + * outside of that range results in a rebalancing operation (stealing, + * merging, or splitting). + * + * This tree was implemented using descriptions from Wikipedia's articles on + * B-Trees and B+ Trees. + */ + +/* + * Decreasing these values results in smaller memmove operations, but more of + * them, and increased memory overhead. Increasing these values results in + * higher variance in operation time, and reduces memory overhead. 
+ */
+#define	BTREE_CORE_ELEMS	128
+#define	BTREE_LEAF_SIZE		4096
+
+typedef struct btree_hdr {
+	struct btree_core	*bth_parent;
+	boolean_t		bth_core;
+	/*
+	 * For both leaf and core nodes, represents the number of elements in
+	 * the node. For core nodes, they will have bth_count + 1 children.
+	 */
+	uint32_t		bth_count;
+} btree_hdr_t;
+
+typedef struct btree_core {
+	btree_hdr_t	btc_hdr;
+	btree_hdr_t	*btc_children[BTREE_CORE_ELEMS + 1];
+	uint8_t		btc_elems[];
+} btree_core_t;
+
+typedef struct btree_leaf {
+	btree_hdr_t	btl_hdr;
+	uint8_t		btl_elems[];
+} btree_leaf_t;
+
+typedef struct btree_index {
+	btree_hdr_t	*bti_node;
+	uint64_t	bti_offset;
+	/*
+	 * True if the location is before the listed offset, false if it's at
+	 * the listed offset.
+	 */
+	boolean_t	bti_before;
+} btree_index_t;
+
+typedef struct btree {
+	btree_hdr_t	*bt_root;
+	int64_t		bt_height;
+	size_t		bt_elem_size;
+	uint64_t	bt_num_elems;
+	uint64_t	bt_num_nodes;
+	btree_leaf_t	*bt_bulk; /* non-null if bulk loading */
+	int (*bt_compar) (const void *, const void *);
+} btree_t;
+
+/*
+ * Allocate and deallocate caches for btree nodes.
+ */
+void btree_init(void);
+void btree_fini(void);
+
+/*
+ * Initialize a B-Tree. Arguments are:
+ *
+ * tree   - the tree to be initialized
+ * compar - function to compare two nodes; it must return exactly -1, 0, or +1:
+ *          -1 for <, 0 for ==, and +1 for >
+ * size   - the value of sizeof(struct my_type)
+ */
+void btree_create(btree_t *, int (*) (const void *, const void *), size_t);
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * found. If not found, it returns NULL; if "where" is not NULL, it sets
+ * "where" for use with btree_insert() or btree_nearest().
+ *
+ * node  - node that has the value being looked for
+ * where - position for use with btree_nearest() or btree_insert(), may be NULL
+ */
+void *btree_find(btree_t *, const void *, btree_index_t *);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node  - the node to insert
+ * where - position as returned from btree_find()
+ */
+void btree_insert(btree_t *, const void *, const btree_index_t *);
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL if the
+ * tree is empty.
+ */
+void *btree_first(btree_t *, btree_index_t *);
+void *btree_last(btree_t *, btree_index_t *);
+
+/*
+ * Return the next or previous valued node in the tree.
+ */
+void *btree_next(btree_t *, const btree_index_t *, btree_index_t *);
+void *btree_prev(btree_t *, const btree_index_t *, btree_index_t *);
+
+/*
+ * Get a value from a tree and an index.
+ */
+void *btree_get(btree_t *, btree_index_t *);
+
+/*
+ * Add a single value to the tree. The value must not compare equal to any
+ * other node already in the tree.
+ */
+void btree_add(btree_t *, const void *);
+
+/*
+ * Remove a single value from the tree. The value must be in the tree. The
+ * pointer passed in may be a pointer into a tree-controlled buffer, but it
+ * need not be.
+ */
+void btree_remove(btree_t *, const void *);
+
+/*
+ * Remove the value at the given location from the tree.
+ */
+void btree_remove_from(btree_t *, btree_index_t *);
+
+/*
+ * Return the number of nodes in the tree.
+ */
+ulong_t btree_numnodes(btree_t *);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ * + * Once you call btree_destroy_nodes(), you can only continuing calling it and + * finally btree_destroy(). No other B-Tree routines will be valid. + * + * cookie - an index used to save state between calls to btree_destroy_nodes() + * + * EXAMPLE: + * btree_t *tree; + * struct my_data *node; + * btree_index_t *cookie; + * + * cookie = NULL; + * while ((node = btree_destroy_nodes(tree, &cookie)) != NULL) + * data_destroy(node); + * btree_destroy(tree); + */ +void *btree_destroy_nodes(btree_t *, btree_index_t **); + +/* + * Destroys all nodes in the tree quickly. This doesn't give the caller an + * opportunity to iterate over each node and do its own cleanup; for that, use + * btree_destroy_nodes(). + */ +void btree_clear(btree_t *); + +/* + * Final destroy of an B-Tree. Arguments are: + * + * tree - the empty tree to destroy + */ +void btree_destroy(btree_t *tree); + +/* Runs a variety of self-checks on the btree to verify integrity. */ +void btree_verify(btree_t *tree); + +#ifdef __cplusplus +} +#endif + +#endif /* _BTREE_H */ diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 00b8b4758110..f8d9c6a82e2b 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -95,8 +95,8 @@ void metaslab_check_free(spa_t *, const blkptr_t *); void metaslab_fastwrite_mark(spa_t *, const blkptr_t *); void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *); -void metaslab_alloc_trace_init(void); -void metaslab_alloc_trace_fini(void); +void metaslab_stat_init(void); +void metaslab_stat_fini(void); void metaslab_trace_init(zio_alloc_list_t *); void metaslab_trace_fini(zio_alloc_list_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 3ce39183eca4..f6a7df004092 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -509,8 +509,8 @@ struct metaslab { * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ - avl_tree_t ms_allocatable_by_size; - avl_tree_t ms_unflushed_frees_by_size; + btree_t ms_allocatable_by_size; + btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index e299613b86d3..ef8df8ec8e30 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -30,7 +30,7 @@ #ifndef _SYS_RANGE_TREE_H #define _SYS_RANGE_TREE_H -#include +#include #include #ifdef __cplusplus @@ -41,20 +41,35 @@ extern "C" { typedef struct range_tree_ops range_tree_ops_t; +typedef enum range_seg_type { + RANGE_SEG32, + RANGE_SEG64, + RANGE_SEG_GAP, + RANGE_SEG_NUM_TYPES, +} range_seg_type_t; + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ typedef struct range_tree { - avl_tree_t rt_root; /* offset-ordered segment AVL tree */ + btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ - uint64_t rt_gap; /* allowable inter-segment gap */ + range_seg_type_t rt_type; /* type of range_seg_t in use */ + /* + * All data that is stored in the range tree must have a start higher + * than or equal to rt_start, and all sizes and offsets must be + * multiples of 1 << rt_shift. 
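+	 * For example (illustrative values only): with rt_start = 0x400000 and
+	 * rt_shift = 9, the segment [0x401000, 0x403000) is stored internally
+	 * as [0x8, 0x18), which is what allows the compact 32-bit
+	 * range_seg32_t representation to cover large offsets.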
+ */ + uint8_t rt_shift; + uint64_t rt_start; range_tree_ops_t *rt_ops; - /* rt_avl_compare should only be set if rt_arg is an AVL tree */ + /* rt_btree_compare should only be set if rt_arg is a b-tree */ void *rt_arg; - int (*rt_avl_compare)(const void *, const void *); + int (*rt_btree_compare)(const void *, const void *); + uint64_t rt_gap; /* allowable inter-segment gap */ /* * The rt_histogram maintains a histogram of ranges. Each bucket, @@ -64,36 +79,215 @@ typedef struct range_tree { uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; } range_tree_t; -typedef struct range_seg { - avl_node_t rs_node; /* AVL node */ - avl_node_t rs_pp_node; /* AVL picker-private node */ +typedef struct range_seg32 { + uint32_t rs_start; /* starting offset of this segment */ + uint32_t rs_end; /* ending offset (non-inclusive) */ +} range_seg32_t; + +/* + * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may + * require 64-bit integers for ranges. + */ +typedef struct range_seg64 { + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ +} range_seg64_t; + +typedef struct range_seg_gap { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_t; +} range_seg_gap_t; + +/* + * This type needs to be the largest of the range segs, since it will be stack + * allocated and then cast the actual type to do tree operations. + */ +typedef range_seg_gap_t range_seg_max_t; + +/* + * This is just for clarity of code purposes, so we can make it clear that a + * pointer is to a range seg of some type; when we need to do the actual math, + * we'll figure out the real type. + */ +typedef void range_seg_t; struct range_tree_ops { void (*rtop_create)(range_tree_t *rt, void *arg); void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg); + void (*rtop_add)(range_tree_t *rt, void *rs, void *arg); + void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg); void (*rtop_vacate)(range_tree_t *rt, void *arg); }; +static inline uint64_t +rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((range_seg32_t *)rs)->rs_start); + case RANGE_SEG64: + return (((range_seg64_t *)rs)->rs_start); + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_start); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((range_seg32_t *)rs)->rs_end); + case RANGE_SEG64: + return (((range_seg64_t *)rs)->rs_end); + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_end); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: { + const range_seg32_t *r32 = rs; + return (r32->rs_end - r32->rs_start); + } + case RANGE_SEG64: { + const range_seg64_t *r64 = rs; + return (r64->rs_end - r64->rs_start); + } + case RANGE_SEG_GAP: + return (((range_seg_gap_t *)rs)->rs_fill); + default: + VERIFY(0); + return (0); + } + +} + +static inline uint64_t 
+rs_get_start(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_end(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_fill(const range_seg_t *rs, const range_tree_t *rt) +{ + return (rs_get_fill_raw(rs, rt) << rt->rt_shift); +} + +static inline void +rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(start, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_start = start; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_start = start; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(end, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_end = end; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_end = end; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + case RANGE_SEG64: + ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, + rt)); + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_fill = fill; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(start, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); + rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(end, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); + rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); + rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); +} + typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); -void range_tree_init(void); -void range_tree_fini(void); -range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap); -range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, + range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + int (*btree_compare) (const void *, const void *), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_resize_segment(range_tree_t *rt, range_seg_t 
*rs, uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); @@ -120,12 +314,12 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto); -void rt_avl_create(range_tree_t *rt, void *arg); -void rt_avl_destroy(range_tree_t *rt, void *arg); -void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); -void rt_avl_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_avl_ops; +void rt_btree_create(range_tree_t *rt, void *arg); +void rt_btree_destroy(range_tree_t *rt, void *arg); +void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_vacate(range_tree_t *rt, void *arg); +extern struct range_tree_ops rt_btree_ops; #ifdef __cplusplus } diff --git a/include/sys/spa.h b/include/sys/spa.h index caa2157b91ab..18933e0d4205 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -69,51 +70,6 @@ struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; -/* - * General-purpose 32-bit and 64-bit bitfield encodings. - */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -/* - * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in - * the case where val is a constant. We can't fix ASSERT because it's used as - * an expression in several places in the kernel; as a result, changing it to - * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option. - */ -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - /* * We currently support block sizes from 512 bytes to 16MB. 
* The benefits of larger blocks, and thus larger IO, need to be weighed diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h index 249b15be6729..ca9d41dc1388 100644 --- a/include/sys/space_reftree.h +++ b/include/sys/space_reftree.h @@ -31,7 +31,7 @@ #define _SYS_SPACE_REFTREE_H #include - +#include #ifdef __cplusplus extern "C" { #endif diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 0d0d0234e151..c2fbcc549b28 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -96,8 +96,8 @@ extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); -extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, - range_seg_t *physical_rs); +extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index c179191e39db..ae82b75c0413 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -86,8 +86,8 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, - range_seg_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, + range_seg64_t *res); typedef const struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -526,8 +526,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, - range_seg_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, + range_seg64_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 97f9abcf43d5..ca3038f7d308 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -45,6 +45,7 @@ KERNEL_C = \ bplist.c \ bpobj.c \ bptree.c \ + btree.c \ bqueue.c \ cityhash.c \ dbuf.c \ diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index b60b799b57c8..c00c8e42458f 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -22,6 +22,7 @@ $(MODULE)-objs += blkptr.o $(MODULE)-objs += bplist.o $(MODULE)-objs += bpobj.o $(MODULE)-objs += bptree.o +$(MODULE)-objs += btree.o $(MODULE)-objs += bqueue.o $(MODULE)-objs += cityhash.o $(MODULE)-objs += dataset_kstats.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9b90690b6c28..9e3c012c8893 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4759,7 +4759,7 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - extern kmem_cache_t *range_seg_cache; + extern kmem_cache_t *btree_leaf_cache; #ifdef _KERNEL if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) && @@ -4796,7 +4796,7 @@ arc_kmem_reap_soon(void) kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); - kmem_cache_reap_now(range_seg_cache); + kmem_cache_reap_now(btree_leaf_cache); if (zio_arena != NULL) { /* diff --git a/module/zfs/btree.c b/module/zfs/btree.c new file mode 100644 index 
000000000000..5e9056a44d03 --- /dev/null +++ b/module/zfs/btree.c @@ -0,0 +1,2110 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include + +kmem_cache_t *btree_leaf_cache; + +/* + * Control the extent of the verification that occurs when btree_verify is + * called. Primarily used for debugging when extending the btree logic and + * functionality. + */ +#ifdef ZFS_DEBUG +int btree_verify_intensity = 5; +#else +int btree_verify_intensity = 0; +#endif + +/* + * Purely a convenience function, for simpler argument ordering and silencing + * compiler warnings about return values. We would use bcopy, but the kernel + * version doesn't handle overlapping src and dest. + */ +static inline void +bmov(const void *src, void *dest, size_t size) +{ + (void) memmove(dest, src, size); +} + +static void +btree_poison_node(btree_t *tree, btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + btree_leaf_t *leaf = (btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, + BTREE_LEAF_SIZE - sizeof (btree_hdr_t) - hdr->bth_count * + size); + } else { + btree_core_t *node = (btree_core_t *)hdr; + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + node->btc_children[i] = + (btree_hdr_t *)0xabadb10cdeadbeef; + } + (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, + (BTREE_CORE_ELEMS - hdr->bth_count) * size); + } +} + +static inline void +btree_poison_node_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) +{ + size_t size = tree->bt_elem_size; + ASSERT3U(offset, >=, hdr->bth_count); + if (!hdr->bth_core) { + btree_leaf_t *leaf = (btree_leaf_t *)hdr; + (void) memset(leaf->btl_elems + offset * size, 0x0f, size); + } else { + btree_core_t *node = (btree_core_t *)hdr; + node->btc_children[offset + 1] = + (btree_hdr_t *)0xabadb10cdeadbeef; + (void) memset(node->btc_elems + offset * size, 0x0f, size); + } +} + +static inline void +btree_verify_poison_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) +{ +#ifdef ZFS_DEBUG + size_t size = tree->bt_elem_size; + uint8_t eval = 0x0f; + if (hdr->bth_core) { + btree_core_t *node = (btree_core_t *)hdr; + btree_hdr_t *cval = (btree_hdr_t *)0xabadb10cdeadbeef; + VERIFY3P(node->btc_children[offset + 1], ==, cval); + for (int i = 0; i < size; i++) + VERIFY3U(node->btc_elems[offset * size + i], ==, eval); + } else { + btree_leaf_t *leaf = (btree_leaf_t *)hdr; + for (int i = 0; i < size; i++) + VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); + } +#endif +} + +void +btree_init(void) +{ + btree_leaf_cache = kmem_cache_create("btree_leaf_cache", + BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, + NULL, 0); +} + +void +btree_fini(void) +{ + kmem_cache_destroy(btree_leaf_cache); +} + +void +btree_create(btree_t *tree, int (*compar) (const void *, const void *), + size_t size) +{ + /* + * We need a minimmum of 4 elements so that when we split a we always + * have at least two elements in each node. 
This simplifies the logic + * in btree_bulk_finish, since it means the last leaf will always have + * a left sibling to share with (unless it's the root). + */ + ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / 4); + + bzero(tree, sizeof (*tree)); + tree->bt_compar = compar; + tree->bt_elem_size = size; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +/* + * Find value in the array of elements provided. Uses a simple binary search. + */ +static void * +btree_find_in_buf(btree_t *tree, uint8_t *buf, uint64_t nelems, + const void *value, btree_index_t *where) +{ + uint64_t max = nelems; + uint64_t min = 0; + while (max > min) { + uint64_t idx = (min + max) / 2; + uint8_t *cur = buf + idx * tree->bt_elem_size; + int comp = tree->bt_compar(cur, value); + if (comp == -1) { + min = idx + 1; + } else if (comp == 1) { + max = idx; + } else { + ASSERT0(comp); + where->bti_offset = idx; + where->bti_before = B_FALSE; + return (cur); + } + } + + where->bti_offset = max; + where->bti_before = B_TRUE; + return (NULL); +} + +/* + * Find the given value in the tree. where may be passed as null to use as a + * membership test or if the btree is being used as a map. + */ +void * +btree_find(btree_t *tree, const void *value, btree_index_t *where) +{ + if (tree->bt_height == -1) { + if (where != NULL) { + where->bti_node = NULL; + where->bti_offset = 0; + } + ASSERT0(tree->bt_num_elems); + return (NULL); + } + + /* + * If we're in bulk-insert mode, we check the last spot in the tree + * and the last leaf in the tree before doing the normal search, + * because for most workloads the vast majority of finds in + * bulk-insert mode are to insert new elements. + */ + btree_index_t idx; + if (tree->bt_bulk != NULL) { + btree_leaf_t *last_leaf = tree->bt_bulk; + int compar = tree->bt_compar(last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), + value); + if (compar < 0) { + /* + * If what they're looking for is after the last + * element, it's not in the tree. + */ + if (where != NULL) { + where->bti_node = (btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count; + where->bti_before = B_TRUE; + } + return (NULL); + } else if (compar == 0) { + if (where != NULL) { + where->bti_node = (btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count - 1; + where->bti_before = B_FALSE; + } + return (last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * + tree->bt_elem_size)); + } + if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + /* + * If what they're looking for is after the first + * element in the last leaf, it's in the last leaf or + * it's not in the tree. + */ + void *d = btree_find_in_buf(tree, last_leaf->btl_elems, + last_leaf->btl_hdr.bth_count, value, &idx); + + if (where != NULL) { + idx.bti_node = (btree_hdr_t *)last_leaf; + *where = idx; + } + return (d); + } + } + + btree_core_t *node = NULL; + uint64_t child = 0; + uint64_t depth = 0; + + /* + * Iterate down the tree, finding which child the value should be in + * by comparing with the separators. 
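+	 * At each core node, btree_find_in_buf() either matches one of the
+	 * separators (in which case we are done) or reports, via
+	 * idx.bti_offset, which child subtree the value would fall into.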
+ */ + for (node = (btree_core_t *)tree->bt_root; depth < tree->bt_height; + node = (btree_core_t *)node->btc_children[child], depth++) { + ASSERT3P(node, !=, NULL); + void *d = btree_find_in_buf(tree, node->btc_elems, + node->btc_hdr.bth_count, value, &idx); + EQUIV(d != NULL, !idx.bti_before); + if (d != NULL) { + if (where != NULL) { + idx.bti_node = (btree_hdr_t *)node; + *where = idx; + } + return (d); + } + ASSERT(idx.bti_before); + child = idx.bti_offset; + } + + /* + * The value is in this leaf, or it would be if it were in the + * tree. Find its proper location and return it. + */ + btree_leaf_t *leaf = (depth == 0 ? (btree_leaf_t *)tree->bt_root : + (btree_leaf_t *)node); + void *d = btree_find_in_buf(tree, leaf->btl_elems, + leaf->btl_hdr.bth_count, value, &idx); + + if (where != NULL) { + idx.bti_node = (btree_hdr_t *)leaf; + *where = idx; + } + + return (d); +} + +/* + * Find the first element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. + */ +static void * +btree_first_helper(btree_hdr_t *hdr, btree_index_t *where) +{ + btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((btree_core_t *)node)->btc_children[0]) + ; + + ASSERT(!node->bth_core); + btree_leaf_t *leaf = (btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = 0; + where->bti_before = B_FALSE; + } + return (&leaf->btl_elems[0]); +} + +/* + * Insert new_node into the parent of old_node directly after old_node, with + * buf as the dividing element between the two. + */ +static void +btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, + btree_hdr_t *new_node, void *buf) +{ + ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); + uint64_t size = tree->bt_elem_size; + btree_core_t *parent = old_node->bth_parent; + btree_hdr_t *par_hdr = &parent->btc_hdr; + + /* + * If this is the root node we were splitting, we create a new root + * and increase the height of the tree. + */ + if (parent == NULL) { + ASSERT3P(old_node, ==, tree->bt_root); + tree->bt_num_nodes++; + btree_core_t *new_root = kmem_alloc(sizeof (btree_core_t) + + BTREE_CORE_ELEMS * size, KM_SLEEP); + btree_hdr_t *new_root_hdr = &new_root->btc_hdr; + new_root_hdr->bth_parent = NULL; + new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_count = 1; + + old_node->bth_parent = new_node->bth_parent = new_root; + new_root->btc_children[0] = old_node; + new_root->btc_children[1] = new_node; + bmov(buf, new_root->btc_elems, size); + + tree->bt_height++; + tree->bt_root = new_root_hdr; +#ifdef ZFS_DEBUG + btree_poison_node(tree, new_root_hdr); +#endif + return; + } + + /* + * Since we have the new separator, binary search for where to put + * new_node. + */ + btree_index_t idx; + ASSERT(par_hdr->bth_core); + VERIFY3P(btree_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, + buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + uint64_t offset = idx.bti_offset; + ASSERT3U(offset, <=, par_hdr->bth_count); + ASSERT3P(parent->btc_children[offset], ==, old_node); + + /* + * If the parent isn't full, shift things to accomodate our insertions + * and return. + */ + if (par_hdr->bth_count != BTREE_CORE_ELEMS) { + if (btree_verify_intensity >= 5) { + btree_verify_poison_at(tree, par_hdr, + par_hdr->bth_count); + } + /* Move the child pointers back one. 
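+		 * Children at offset + 1 and beyond shift one slot toward the
+		 * end of the array, opening a hole at offset + 1 for new_node.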
*/ + btree_hdr_t **c_start = parent->btc_children + offset + 1; + uint64_t count = par_hdr->bth_count - offset; + bmov(c_start, c_start + 1, sizeof (*c_start) * count); + *c_start = new_node; + + /* Move the elements back one. */ + uint8_t *e_start = parent->btc_elems + offset * size; + bmov(e_start, e_start + size, count * size); + bmov(buf, e_start, size); + + par_hdr->bth_count++; + return; + } + + /* + * We need to split this core node into two. Currently there are + * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for + * BTREE_CORE_ELEMS + 2. Some of the children will be part of the + * current node, and the others will be moved to the new core node. + * There are BTREE_CORE_ELEMS + 1 elements including the new one. One + * will be used as the new separator in our parent, and the others + * will be split among the two core nodes. + * + * Usually we will split the node in half evenly, with + * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we + * instead move only about a quarter of the elements (and children) to + * the new node. Since the average state after a long time is a 3/4 + * full node, shortcutting directly to that state improves efficiency. + */ + uint64_t move_count = MAX(BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + 2 : 4), 2); + uint64_t keep_count = BTREE_CORE_ELEMS - move_count; + ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); + tree->bt_num_nodes++; + btree_core_t *new_parent = kmem_alloc(sizeof (btree_core_t) + + BTREE_CORE_ELEMS * size, KM_SLEEP); + btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; + new_par_hdr->bth_parent = par_hdr->bth_parent; + new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_count = move_count; +#ifdef ZFS_DEBUG + btree_poison_node(tree, new_par_hdr); +#endif + par_hdr->bth_count = keep_count; + + /* + * The three cases to consider are that the element in buf should be + * in the existing node (with lower values), the new node (with higher + * values), or that it should separate the two nodes. + */ + if (offset < keep_count) { + /* + * Copy the back part of the elements and children to the new + * leaf. + */ + uint64_t e_count = move_count; + uint8_t *e_start = parent->btc_elems + keep_count * size; + bmov(e_start, new_parent->btc_elems, e_count * size); + + uint64_t c_count = move_count + 1; + btree_hdr_t **c_start = parent->btc_children + keep_count; + bmov(c_start, new_parent->btc_children, c_count * + sizeof (*c_start)); + + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + bmov(parent->btc_elems + (keep_count - 1) * size, tmp_buf, + size); + + /* + * Shift the remaining elements and children in the front half + * to handle the new value. + */ + e_start = parent->btc_elems + offset * size; + e_count = keep_count - 1 - offset; + bmov(e_start, e_start + size, e_count * size); + bmov(buf, e_start, size); + + c_start = parent->btc_children + (offset + 1); + c_count = keep_count - 1 - offset; + bmov(c_start, c_start + 1, c_count * sizeof (*c_start)); + *c_start = new_node; + ASSERT3P(*(c_start - 1), ==, old_node); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + kmem_free(tmp_buf, size); + } else if (offset > keep_count) { + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + uint8_t *e_start = parent->btc_elems + keep_count * size; + bmov(e_start, tmp_buf, size); + + /* + * Of the elements and children in the back half, move those + * before offset to the new leaf. 
+ */ + e_start += size; + uint8_t *e_out = new_parent->btc_elems; + uint64_t e_count = offset - keep_count - 1; + bmov(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = parent->btc_children + (keep_count + 1); + uint64_t c_count = offset - keep_count; + btree_hdr_t **c_out = new_parent->btc_children; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + /* Add the new value to the new leaf. */ + e_out += e_count * size; + bmov(buf, e_out, size); + + c_out += c_count; + *c_out = new_node; + ASSERT3P(*(c_out - 1), ==, old_node); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + kmem_free(tmp_buf, size); + + /* Move the rest of the back half to the new leaf. */ + e_out += size; + e_start += e_count * size; + e_count = BTREE_CORE_ELEMS - offset; + bmov(e_start, e_out, e_count * size); + + c_out++; + c_start += c_count; + c_count = BTREE_CORE_ELEMS - offset; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + } else { + /* + * The new value is the new separator, no change. + * + * Copy the back part of the elements and children to the new + * leaf. + */ + uint64_t e_count = move_count; + uint8_t *e_start = parent->btc_elems + keep_count * size; + bmov(e_start, new_parent->btc_elems, e_count * size); + + uint64_t c_count = move_count; + btree_hdr_t **c_start = parent->btc_children + keep_count + 1; + bmov(c_start, new_parent->btc_children + 1, c_count * + sizeof (*c_start)); + new_parent->btc_children[0] = new_node; + } +#ifdef ZFS_DEBUG + btree_poison_node(tree, par_hdr); +#endif + + for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + new_parent->btc_children[i]->bth_parent = new_parent; + + for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting. + */ + btree_insert_into_parent(tree, &parent->btc_hdr, &new_parent->btc_hdr, + buf); + +} + +/* Helper function for inserting a new value into leaf at the given index. */ +static void +btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, + uint64_t idx) +{ + uint64_t size = tree->bt_elem_size; + uint8_t *start = leaf->btl_elems + (idx * size); + uint64_t count = leaf->btl_hdr.bth_count - idx; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / + size, 2); + + /* + * If the leaf isn't full, shift the elements after idx and insert + * value. + */ + if (leaf->btl_hdr.bth_count != capacity) { + if (btree_verify_intensity >= 5) { + btree_verify_poison_at(tree, &leaf->btl_hdr, + leaf->btl_hdr.bth_count); + } + leaf->btl_hdr.bth_count++; + bmov(start, start + size, count * size); + bmov(value, start, size); + return; + } + + /* + * Otherwise, we split the leaf node into two nodes. If we're not bulk + * inserting, each is of size (capacity / 2). If we are bulk + * inserting, we move a quarter of the elements to the new node so + * inserts into the old node don't cause immediate splitting but the + * tree stays relatively dense. Since the average state after a long + * time is a 3/4 full node, shortcutting directly to that state + * improves efficiency. At the end of the bulk insertion process + * we'll need to go through and fix up any nodes (the last leaf and + * its ancestors, potentially) that are below the minimum. + * + * In either case, we're left with one extra element. The leftover + * element will become the new dividing element between the two nodes. 
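+	 *
+	 * As an illustrative calculation (the 16-byte header and 24-byte
+	 * element size are assumptions, not guarantees): with 4096-byte
+	 * leaves, capacity is P2ALIGN((4096 - 16) / 24, 2) = 170 elements.
+	 * A normal split then keeps 85 and moves 85; a bulk-mode split keeps
+	 * 128 and moves 42, so the leaf left behind is about 3/4 full.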
+ */ + uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4), + 2); + uint64_t keep_count = capacity - move_count; + ASSERT3U(capacity - move_count, >=, 2); + tree->bt_num_nodes++; + btree_leaf_t *new_leaf = kmem_cache_alloc(btree_leaf_cache, KM_SLEEP); + btree_hdr_t *new_hdr = &new_leaf->btl_hdr; + new_hdr->bth_parent = leaf->btl_hdr.bth_parent; + new_hdr->bth_core = B_FALSE; + new_hdr->bth_count = move_count; +#ifdef ZFS_DEBUG + btree_poison_node(tree, new_hdr); +#endif + leaf->btl_hdr.bth_count = keep_count; + + if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) + tree->bt_bulk = new_leaf; + + /* We store the new separator in a buffer we control for simplicity. */ + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + + /* + * The three cases to consider are that value should be in the new + * first node, the new second node, or that it should separate the two + * nodes. + */ + if (idx < keep_count) { + /* Copy the back part to the new leaf. */ + start = leaf->btl_elems + keep_count * size; + count = move_count; + bmov(start, new_leaf->btl_elems, count * size); + + /* Store the new separator in a buffer. */ + bmov(leaf->btl_elems + (keep_count - 1) * size, buf, + size); + + /* + * Shift the remaining elements in the front part to handle + * the new value. + */ + start = leaf->btl_elems + idx * size; + count = (keep_count) - 1 - idx; + bmov(start, start + size, count * size); + bmov(value, start, size); + } else if (idx > keep_count) { + /* Store the new separator in a buffer. */ + start = leaf->btl_elems + (keep_count) * size; + bmov(start, buf, size); + + /* Move the back part before idx to the new leaf. */ + start += size; + uint8_t *out = new_leaf->btl_elems; + count = idx - keep_count - 1; + bmov(start, out, count * size); + + /* Add the new value to the new leaf. */ + out += count * size; + bmov(value, out, size); + + /* Move the rest of the back part to the new leaf. */ + out += size; + start += count * size; + count = capacity - idx; + bmov(start, out, count * size); + } else { + /* The new value is the new separator. */ + bmov(value, buf, size); + + /* Copy the back part to the new leaf. */ + start = leaf->btl_elems + keep_count * size; + count = move_count; + bmov(start, new_leaf->btl_elems, count * size); + } + +#ifdef ZFS_DEBUG + btree_poison_node(tree, &leaf->btl_hdr); +#endif + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting, bur only of core nodes. + */ + btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, buf); + kmem_free(buf, size); +} + +static uint64_t +btree_find_parent_idx(btree_t *tree, btree_hdr_t *hdr) +{ + void *buf; + if (hdr->bth_core) { + buf = ((btree_core_t *)hdr)->btc_elems; + } else { + buf = ((btree_leaf_t *)hdr)->btl_elems; + } + btree_index_t idx; + btree_core_t *parent = hdr->bth_parent; + VERIFY3P(btree_find_in_buf(tree, parent->btc_elems, + parent->btc_hdr.bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); + ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr); + return (idx.bti_offset); +} + +/* + * Take the b-tree out of bulk insert mode. During bulk-insert mode, some + * nodes may violate the invariant that non-root nodes must be at least half + * full. All nodes violating this invariant should be the last node in their + * particular level. To correct the invariant, we steal values from their left + * neighbor until they are half full. 
They must have a left neighbor at their + * level because the last node at a level is not the first node unless it's + * the root. + */ +static void +btree_bulk_finish(btree_t *tree) +{ + ASSERT3P(tree->bt_bulk, !=, NULL); + ASSERT3P(tree->bt_root, !=, NULL); + btree_leaf_t *leaf = tree->bt_bulk; + btree_hdr_t *hdr = &leaf->btl_hdr; + btree_core_t *parent = hdr->bth_parent; + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / + size, 2); + + /* + * The invariant doesn't apply to the root node, if that's the only + * node in the tree we're done. + */ + if (parent == NULL) { + tree->bt_bulk = NULL; + return; + } + + /* First, steal elements to rebalance the leaf node. */ + if (hdr->bth_count < capacity / 2) { + /* + * First, find the left neighbor. The simplest way to do this + * is to call btree_prev twice; the first time finds some + * ancestor of this node, and the second time finds the left + * neighbor. The ancestor found is the lowest common ancestor + * of leaf and the neighbor. + */ + btree_index_t idx = { + .bti_node = hdr, + .bti_offset = 0 + }; + VERIFY3P(btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(idx.bti_node->bth_core); + btree_core_t *common = (btree_core_t *)idx.bti_node; + uint64_t common_idx = idx.bti_offset; + + VERIFY3P(btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(!idx.bti_node->bth_core); + btree_leaf_t *l_neighbor = (btree_leaf_t *)idx.bti_node; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, + capacity / 2); + + if (btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + btree_verify_poison_at(tree, hdr, + leaf->btl_hdr.bth_count + i); + } + } + + /* First, shift elements in leaf back. */ + uint8_t *start = leaf->btl_elems; + uint8_t *out = leaf->btl_elems + (move_count * size); + uint64_t count = hdr->bth_count; + bmov(start, out, count * size); + + /* Next, move the separator from the common ancestor to leaf. */ + uint8_t *separator = common->btc_elems + (common_idx * size); + out -= size; + bmov(separator, out, size); + move_count--; + + /* + * Now we move elements from the tail of the left neighbor to + * fill the remaining spots in leaf. + */ + start = l_neighbor->btl_elems + (l_neighbor->btl_hdr.bth_count - + move_count) * size; + out = leaf->btl_elems; + bmov(start, out, move_count * size); + + /* + * Finally, move the new last element in the left neighbor to + * the separator. + */ + start -= size; + bmov(start, separator, size); + + /* Adjust the node's counts, and we're done. */ + l_neighbor->btl_hdr.bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_neighbor->btl_hdr.bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); +#ifdef ZFS_DEBUG + btree_poison_node(tree, &l_neighbor->btl_hdr); +#endif + } + + /* + * Now we have to rebalance any ancestors of leaf that may also + * violate the invariant. + */ + capacity = BTREE_CORE_ELEMS; + while (parent->btc_hdr.bth_parent != NULL) { + btree_core_t *cur = parent; + btree_hdr_t *hdr = &cur->btc_hdr; + parent = hdr->bth_parent; + /* + * If the invariant isn't violated, move on to the next + * ancestor. + */ + if (hdr->bth_count >= capacity / 2) + continue; + + /* + * Because the smallest number of nodes we can move when + * splitting is 2, we never need to worry about not having a + * left sibling. 
+ */ + uint64_t parent_idx = btree_find_parent_idx(tree, hdr); + ASSERT3U(parent_idx, >, 0); + btree_core_t *l_neighbor = (btree_core_t *)parent->btc_children[ + parent_idx - 1]; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, + capacity / 2); + + if (btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + btree_verify_poison_at(tree, hdr, + leaf->btl_hdr.bth_count + i); + } + } + /* First, shift things in the right node back. */ + uint8_t *e_start = cur->btc_elems; + uint8_t *e_out = cur->btc_elems + (move_count * size); + uint64_t e_count = hdr->bth_count; + bmov(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = cur->btc_children; + btree_hdr_t **c_out = cur->btc_children + move_count; + uint64_t c_count = hdr->bth_count + 1; + bmov(c_start, c_out, c_count * sizeof (btree_hdr_t *)); + + /* Next, move the separator to the right node. */ + uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * + size); + e_out -= size; + bmov(separator, e_out, size); + + /* + * Now, move elements and children from the left node to the + * right. We move one more child than elements. + */ + move_count--; + e_start = l_neighbor->btc_elems + + (l_neighbor->btc_hdr.bth_count - move_count) * size; + e_out = cur->btc_elems; + e_count = move_count; + bmov(e_start, e_out, e_count * size); + + c_start = l_neighbor->btc_children + + (l_neighbor->btc_hdr.bth_count - move_count); + c_out = cur->btc_children; + c_count = move_count + 1; + bmov(c_start, c_out, c_count * sizeof (btree_hdr_t *)); + + /* + * Finally, move the last element in the left node to the + * separator's position. + */ + e_start -= size; + bmov(e_start, separator, size); + + l_neighbor->btc_hdr.bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + +#ifdef ZFS_DEBUG + btree_poison_node(tree, &l_neighbor->btc_hdr); +#endif + for (int i = 0; i <= hdr->bth_count; i++) + cur->btc_children[i]->bth_parent = cur; + } + + tree->bt_bulk = NULL; +} + +/* + * Insert value into tree at the location specified by where. + */ +void +btree_insert(btree_t *tree, const void *value, const btree_index_t *where) +{ + btree_index_t idx = {0}; + + /* If we're not inserting in the last leaf, end bulk insert mode. */ + if (tree->bt_bulk != NULL) { + if (where->bti_node != &tree->bt_bulk->btl_hdr) { + btree_bulk_finish(tree); + VERIFY3P(btree_find(tree, value, &idx), ==, NULL); + where = &idx; + } + } + + tree->bt_num_elems++; + /* + * If this is the first element in the tree, create a leaf root node + * and add the value to it. + */ + if (where->bti_node == NULL) { + ASSERT3U(tree->bt_num_elems, ==, 1); + ASSERT3S(tree->bt_height, ==, -1); + ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0(where->bti_offset); + + tree->bt_num_nodes++; + btree_leaf_t *leaf = kmem_cache_alloc(btree_leaf_cache, + KM_SLEEP); + tree->bt_root = &leaf->btl_hdr; + tree->bt_height++; + + btree_hdr_t *hdr = &leaf->btl_hdr; + hdr->bth_parent = NULL; + hdr->bth_core = B_FALSE; + hdr->bth_count = 0; +#ifdef ZFS_DEBUG + btree_poison_node(tree, hdr); +#endif + btree_insert_into_leaf(tree, leaf, value, 0); + tree->bt_bulk = leaf; + } else if (!where->bti_node->bth_core) { + /* + * If we're inserting into a leaf, go directly to the helper + * function. 
+ */ + btree_insert_into_leaf(tree, (btree_leaf_t *)where->bti_node, + value, where->bti_offset); + } else { + /* + * If we're inserting into a core node, we can't just shift + * the existing element in that slot in the same node without + * breaking our ordering invariants. Instead we place the new + * value in the node at that spot and then insert the old + * separator into the first slot in the subtree to the right. + */ + ASSERT(where->bti_node->bth_core); + btree_core_t *node = (btree_core_t *)where->bti_node; + + /* + * We can ignore bti_before, because either way the value + * should end up in bti_offset. + */ + uint64_t off = where->bti_offset; + btree_hdr_t *subtree = node->btc_children[off + 1]; + size_t size = tree->bt_elem_size; + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(node->btc_elems + off * size, buf, size); + bmov(value, node->btc_elems + off * size, size); + + /* + * Find the first slot in the subtree to the right, insert + * there. + */ + btree_index_t new_idx; + VERIFY3P(btree_first_helper(subtree, &new_idx), !=, NULL); + ASSERT0(new_idx.bti_offset); + ASSERT(!new_idx.bti_node->bth_core); + btree_insert_into_leaf(tree, (btree_leaf_t *)new_idx.bti_node, + buf, 0); + kmem_free(buf, size); + } + btree_verify(tree); +} + +/* + * Return the first element in the tree, and put its location in where if + * non-null. + */ +void * +btree_first(btree_t *tree, btree_index_t *where) +{ + if (tree->bt_height == -1) { + ASSERT0(tree->bt_num_elems); + return (NULL); + } + return (btree_first_helper(tree->bt_root, where)); + +} + +/* + * Find the last element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. + */ +static void * +btree_last_helper(btree_t *btree, btree_hdr_t *hdr, btree_index_t *where) +{ + btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((btree_core_t *)node)->btc_children[node->bth_count]) + ; + + btree_leaf_t *leaf = (btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = node->bth_count - 1; + where->bti_before = B_FALSE; + } + return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size); +} + +/* + * Return the last element in the tree, and put its location in where if + * non-null. + */ +void * +btree_last(btree_t *tree, btree_index_t *where) +{ + if (tree->bt_height == -1) { + ASSERT0(tree->bt_num_elems); + return (NULL); + } + return (btree_last_helper(tree, tree->bt_root, where)); +} + +/* + * This function contains the logic to find the next node in the tree. A + * helper function is used because there are multiple internal consumemrs of + * this logic. The done_func is used by btree_destroy_nodes to clean up each + * node after we've finished with it. + */ +static void * +btree_next_helper(btree_t *tree, const btree_index_t *idx, + btree_index_t *out_idx, + void (*done_func)(btree_t *, btree_hdr_t *)) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the next element of an element in a leaf, + * there are two cases. If the element isn't the last one in + * the leaf, in which case we just return the next element in + * the leaf. Otherwise, we need to traverse up our parents + * until we find one where our ancestor isn't the last child + * of its parent. Once we do, the next element is the + * separator after our ancestor in its parent. 
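The walk described here is the classic parent-pointer successor step. A toy version with simplified structs looks roughly like this; the real code maps a child back to its slot with btree_find_parent_idx() instead of a linear scan.

#include <stdint.h>
#include <stddef.h>

struct toy_node {
	struct toy_node *parent;
	struct toy_node **children;	/* count + 1 entries, NULL for leaves */
	uint64_t *elems;		/* count entries */
	int count;
};

static int
child_slot(const struct toy_node *parent, const struct toy_node *child)
{
	for (int i = 0; i <= parent->count; i++) {
		if (parent->children[i] == child)
			return (i);
	}
	return (-1);	/* not reached for a well-formed tree */
}

/* Successor of the last element of a leaf, or NULL if it was the maximum. */
uint64_t *
leaf_tail_successor(struct toy_node *leaf)
{
	struct toy_node *prev = leaf;

	for (struct toy_node *up = leaf->parent; up != NULL; up = up->parent) {
		int i = child_slot(up, prev);
		if (i < up->count)
			return (&up->elems[i]);	/* separator after prev */
		prev = up;
	}
	return (NULL);	/* prev was the rightmost child all the way up */
}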
+ */ + btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; + uint64_t new_off = offset + (idx->bti_before ? 0 : 1); + if (leaf->btl_hdr.bth_count > new_off) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = new_off; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + new_off * tree->bt_elem_size); + } + + btree_hdr_t *prev = &leaf->btl_hdr; + for (btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = btree_find_parent_idx(tree, prev); + if (done_func != NULL) + done_func(tree, prev); + if (i == hdr->bth_count) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + i * tree->bt_elem_size); + } + if (done_func != NULL) + done_func(tree, prev); + /* + * We've traversed all the way up and been at the end of the + * node every time, so this was the last element in the tree. + */ + return (NULL); + } + + /* If we were before an element in a core node, return that element. */ + ASSERT(idx->bti_node->bth_core); + btree_core_t *node = (btree_core_t *)idx->bti_node; + if (idx->bti_before) { + out_idx->bti_before = B_FALSE; + return (node->btc_elems + offset * tree->bt_elem_size); + } + + /* + * The next element from one in a core node is the first element in + * the subtree just to the right of the separator. + */ + btree_hdr_t *child = node->btc_children[offset + 1]; + return (btree_first_helper(child, out_idx)); +} + +/* + * Return the next valued node in the tree. The same address can be safely + * passed for idx and out_idx. + */ +void * +btree_next(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) +{ + return (btree_next_helper(tree, idx, out_idx, NULL)); +} + +/* + * Return the previous valued node in the tree. The same value can be safely + * passed for idx and out_idx. + */ +void * +btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) +{ + if (idx->bti_node == NULL) { + ASSERT3S(tree->bt_height, ==, -1); + return (NULL); + } + + uint64_t offset = idx->bti_offset; + if (!idx->bti_node->bth_core) { + /* + * When finding the previous element of an element in a leaf, + * there are two cases. If the element isn't the first one in + * the leaf, in which case we just return the next element in + * the leaf. Otherwise, we need to traverse up our parents + * until we find one where our previous ancestor isn't the + * first child. Once we do, the next element is the separator + * before our previous ancestor. + */ + btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; + if (offset != 0) { + out_idx->bti_node = &leaf->btl_hdr; + out_idx->bti_offset = offset - 1; + out_idx->bti_before = B_FALSE; + return (leaf->btl_elems + (offset - 1) * + tree->bt_elem_size); + } + btree_hdr_t *prev = &leaf->btl_hdr; + for (btree_core_t *node = leaf->btl_hdr.bth_parent; + node != NULL; node = node->btc_hdr.bth_parent) { + btree_hdr_t *hdr = &node->btc_hdr; + ASSERT(hdr->bth_core); + uint64_t i = btree_find_parent_idx(tree, prev); + if (i == 0) { + prev = hdr; + continue; + } + out_idx->bti_node = hdr; + out_idx->bti_offset = i - 1; + out_idx->bti_before = B_FALSE; + return (node->btc_elems + (i - 1) * tree->bt_elem_size); + } + /* + * We've traversed all the way up and been at the start of the + * node every time, so this was the first node in the tree. 
+ */ + return (NULL); + } + + /* + * The previous element from one in a core node is the last element in + * the subtree just to the lefet of the separator. + */ + ASSERT(idx->bti_node->bth_core); + btree_core_t *node = (btree_core_t *)idx->bti_node; + btree_hdr_t *child = node->btc_children[offset]; + return (btree_last_helper(tree, child, out_idx)); +} + +/* + * Get the value at the provided index in the tree. + * + * Note that the value returned from this function can be mutataed, but only + * if it will not change the ordering of the element with respect to any other + * elements that could be in the tree. + */ +void * +btree_get(btree_t *tree, btree_index_t *idx) +{ + ASSERT(!idx->bti_before); + if (!idx->bti_node->bth_core) { + btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; + return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); + } + ASSERT(idx->bti_node->bth_core); + btree_core_t *node = (btree_core_t *)idx->bti_node; + return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); +} + +/* Add the given value to the tree. Must not already be in the tree. */ +void +btree_add(btree_t *tree, const void *node) +{ + btree_index_t where = {0}; + VERIFY3P(btree_find(tree, node, &where), ==, NULL); + btree_insert(tree, node, &where); +} + +/* Helper function to free a tree node. */ +static void +btree_node_destroy(btree_t *tree, btree_hdr_t *node) +{ + tree->bt_num_nodes--; + if (!node->bth_core) { + kmem_cache_free(btree_leaf_cache, node); + } else { + kmem_free(node, sizeof (btree_core_t) + + BTREE_CORE_ELEMS * tree->bt_elem_size); + } +} + +/* + * Remove the rm_hdr and the separator to its left from the parent node. The + * buffer that rm_hdr was stored in may already be freed, so its contents + * cannot be accessed. + */ +static void +btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) +{ + size_t size = tree->bt_elem_size; + uint64_t min_count = BTREE_CORE_ELEMS / 2; + btree_hdr_t *hdr = &node->btc_hdr; + /* + * If the node is the root node and rm_hdr is one of two children, + * promote the other child to the root. + */ + if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { + ASSERT3U(hdr->bth_count, ==, 1); + ASSERT3P(tree->bt_root, ==, node); + ASSERT3P(node->btc_children[1], ==, rm_hdr); + tree->bt_root = node->btc_children[0]; + node->btc_children[0]->bth_parent = NULL; + btree_node_destroy(tree, hdr); + tree->bt_height--; + return; + } + + uint64_t idx; + for (idx = 0; idx <= hdr->bth_count; idx++) { + if (node->btc_children[idx] == rm_hdr) + break; + } + ASSERT3U(idx, <=, hdr->bth_count); + hdr->bth_count--; + + /* + * If the node is the root or it has more than the minimum number of + * children, just remove the child and separator, and return. + */ + if (hdr->bth_parent == NULL || + hdr->bth_count >= min_count) { + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. + */ + uint8_t *e_start = node->btc_elems + idx * size; + uint8_t *e_out = e_start - size; + uint64_t e_count = hdr->bth_count - idx + 1; + bmov(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = node->btc_children + idx + 1; + btree_hdr_t **c_out = c_start - 1; + uint64_t c_count = hdr->bth_count - idx + 1; + bmov(c_start, c_out, c_count * sizeof (*c_start)); +#ifdef ZFS_DEBUG + btree_poison_node_at(tree, hdr, hdr->bth_count); +#endif + return; + } + + ASSERT3U(hdr->bth_count, ==, min_count - 1); + + /* + * Now we try to steal a node from a neighbor. We check left, then + * right. 
If the neighbor exists and has more than the minimum number + * of elements, we move the separator betweeen us and them to our + * node, move their closest element (last for left, first for right) + * to the separator, and move their closest child to our node. Along + * the way we need to collapse the gap made by idx, and (for our right + * neighbor) the gap made by removing their first element and child. + * + * Note: this logic currently doesn't support stealing from a neighbor + * that isn't a sibling. This isn't critical functionality, but may be + * worth implementing in the future for completeness' sake. + */ + btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = btree_find_parent_idx(tree, hdr); + + btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can steal a node from the left neighbor. */ + ASSERT(l_hdr->bth_core); + btree_core_t *neighbor = (btree_core_t *)l_hdr; + + /* + * Start by shifting the elements and children in the current + * node to the right by one spot. + */ + uint8_t *e_start = node->btc_elems; + uint8_t *e_out = node->btc_elems + size; + uint64_t e_count = idx - 1; + bmov(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = node->btc_children; + btree_hdr_t **c_out = c_start + 1; + uint64_t c_count = idx; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + /* + * Move the separator between node and neighbor to the first + * element slot in the current node. + */ + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, node->btc_elems, size); + + /* Move the last child of neighbor to our first child slot. */ + btree_hdr_t **steal_child = neighbor->btc_children + + l_hdr->bth_count; + bmov(steal_child, node->btc_children, sizeof (*steal_child)); + node->btc_children[0]->bth_parent = node; + + /* Move the last element of neighbor to the separator spot. */ + uint8_t *steal_elem = neighbor->btc_elems + + (l_hdr->bth_count - 1) * size; + bmov(steal_elem, separator, size); + l_hdr->bth_count--; + hdr->bth_count++; +#ifdef ZFS_DEBUG + btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); +#endif + return; + } + + btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can steal a node from the right neighbor. */ + ASSERT(r_hdr->bth_core); + btree_core_t *neighbor = (btree_core_t *)r_hdr; + + /* + * Shift elements in node left by one spot to overwrite rm_hdr + * and the separator before it. + */ + uint8_t *e_start = node->btc_elems + idx * size; + uint8_t *e_out = e_start - size; + uint64_t e_count = hdr->bth_count - idx + 1; + bmov(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = node->btc_children + idx + 1; + btree_hdr_t **c_out = c_start - 1; + uint64_t c_count = hdr->bth_count - idx + 1; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + /* + * Move the separator between node and neighbor to the last + * element spot in node. + */ + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, node->btc_elems + hdr->bth_count * size, size); + + /* + * Move the first child of neighbor to the last child spot in + * node. 
+ */ + btree_hdr_t **steal_child = neighbor->btc_children; + bmov(steal_child, node->btc_children + (hdr->bth_count + 1), + sizeof (*steal_child)); + node->btc_children[hdr->bth_count + 1]->bth_parent = node; + + /* Move the first element of neighbor to the separator spot. */ + uint8_t *steal_elem = neighbor->btc_elems; + bmov(steal_elem, separator, size); + r_hdr->bth_count--; + hdr->bth_count++; + + /* + * Shift the elements and children of neighbor to cover the + * stolen elements. + */ + bmov(neighbor->btc_elems + size, neighbor->btc_elems, + r_hdr->bth_count * size); + bmov(neighbor->btc_children + 1, neighbor->btc_children, + (r_hdr->bth_count + 1) * sizeof (steal_child)); +#ifdef ZFS_DEBUG + btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); +#endif + return; + } + + /* + * In this case, neither of our neighbors can spare an element, so we + * need to merge with one of them. We prefer the left one, + * arabitrarily. Move the separator into the leftmost merging node + * (which may be us or the left neighbor), and then move the right + * merging node's elements (skipping or overwriting idx, which we're + * deleting). Once that's done, go into the parent and delete the + * right merging node and the separator. This may cause further + * merging. + */ + btree_hdr_t *new_rm_hdr; + + if (l_hdr != NULL) { + ASSERT(l_hdr->bth_core); + btree_core_t *left = (btree_core_t *)l_hdr; + + if (btree_verify_intensity >= 5) { + for (int i = 0; i < hdr->bth_count + 1; i++) { + btree_verify_poison_at(tree, l_hdr, + l_hdr->bth_count + i); + } + } + /* Move the separator into the left node. */ + uint8_t *e_out = left->btc_elems + l_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, e_out, size); + + /* Move all our elements into the left node. */ + e_out += size; + uint8_t *e_start = node->btc_elems; + uint64_t e_count = idx - 1; + bmov(e_start, e_out, e_count * size); + + e_out += e_count * size; + e_start += (e_count + 1) * size; + e_count = hdr->bth_count - idx + 1; + bmov(e_start, e_out, e_count * size); + + /* Move all our children into the left node. */ + btree_hdr_t **c_start = node->btc_children; + btree_hdr_t **c_out = left->btc_children + + l_hdr->bth_count + 1; + uint64_t c_count = idx; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + c_out += c_count; + c_start += c_count + 1; + c_count = hdr->bth_count - idx + 1; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + /* Reparent all our children to point to the left node. */ + btree_hdr_t **new_start = left->btc_children + + l_hdr->bth_count + 1; + for (int i = 0; i < hdr->bth_count + 1; i++) + new_start[i]->bth_parent = left; + + /* Update bookkeeping */ + l_hdr->bth_count += hdr->bth_count + 1; + for (int i = 0; i <= l_hdr->bth_count; i++) + ASSERT3P(left->btc_children[i]->bth_parent, ==, left); + ASSERT3U(l_hdr->bth_count, ==, BTREE_CORE_ELEMS); + new_rm_hdr = hdr; + } else { + ASSERT3P(r_hdr, !=, NULL); + ASSERT(r_hdr->bth_core); + btree_core_t *right = (btree_core_t *)r_hdr; + + if (btree_verify_intensity >= 5) { + for (int i = 0; i < r_hdr->bth_count; i++) { + btree_verify_poison_at(tree, hdr, + hdr->bth_count + i + 1); + } + } + /* + * Overwrite rm_hdr and its separator by moving node's + * elements and children forward. 
+ */ + uint8_t *e_start = node->btc_elems + idx * size; + uint8_t *e_out = e_start - size; + uint64_t e_count = hdr->bth_count - idx + 1; + bmov(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = node->btc_children + idx + 1; + btree_hdr_t **c_out = c_start - 1; + uint64_t c_count = hdr->bth_count - idx + 1; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + /* + * Move the separator to the first open spot in node's + * elements. + */ + e_out += e_count * size; + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, e_out, size); + + /* Move the right node's elements and children to node. */ + e_out += size; + e_start = right->btc_elems; + e_count = r_hdr->bth_count; + bmov(e_start, e_out, e_count * size); + + c_out += c_count; + c_start = right->btc_children; + c_count = r_hdr->bth_count + 1; + bmov(c_start, c_out, c_count * sizeof (*c_out)); + + /* Reparent the right node's children to point to node. */ + for (int i = 0; i < c_count; i++) + c_out[i]->bth_parent = node; + + /* Update bookkeeping. */ + hdr->bth_count += r_hdr->bth_count + 1; + for (int i = 0; i <= hdr->bth_count; i++) + ASSERT3P(node->btc_children[i]->bth_parent, ==, node); + + ASSERT3U(hdr->bth_count, ==, BTREE_CORE_ELEMS); + new_rm_hdr = r_hdr; + } + + new_rm_hdr->bth_count = 0; + btree_node_destroy(tree, new_rm_hdr); + btree_remove_from_node(tree, parent, new_rm_hdr); +} + +/* Remove the element at the specific location. */ +void +btree_remove_from(btree_t *tree, btree_index_t *where) +{ + size_t size = tree->bt_elem_size; + btree_hdr_t *hdr = where->bti_node; + uint64_t idx = where->bti_offset; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / + size, 2); + + ASSERT(!where->bti_before); + if (tree->bt_bulk != NULL) { + /* + * Leave bulk insert mode. Note that our index would be + * invalid after we correct the tree, so we copy the value + * we're planning to remove and find it again after + * bulk_finish. + */ + uint8_t *value = btree_get(tree, where); + uint8_t *tmp = kmem_alloc(size, KM_SLEEP); + bcopy(value, tmp, size); + btree_bulk_finish(tree); + VERIFY3P(btree_find(tree, tmp, where), !=, NULL); + kmem_free(tmp, size); + hdr = where->bti_node; + idx = where->bti_offset; + } + + tree->bt_num_elems--; + /* + * If the element happens to be in a core node, we move a leaf node's + * element into its place and then remove the leaf node element. This + * makes the rebalance logic not need to be recursive both upwards and + * downwards. + */ + if (hdr->bth_core) { + btree_core_t *node = (btree_core_t *)hdr; + btree_hdr_t *left_subtree = node->btc_children[idx]; + void *new_value = btree_last_helper(tree, left_subtree, where); + ASSERT3P(new_value, !=, NULL); + + bmov(new_value, node->btc_elems + idx * size, size); + + hdr = where->bti_node; + idx = where->bti_offset; + ASSERT(!where->bti_before); + } + + /* + * First, we'll update the leaf's metadata. Then, we shift any + * elements after the idx to the left. After that, we rebalance if + * needed. + */ + ASSERT(!hdr->bth_core); + btree_leaf_t *leaf = (btree_leaf_t *)hdr; + ASSERT3U(hdr->bth_count, >, 0); + hdr->bth_count--; + + uint64_t min_count = capacity / 2; + + /* + * If we're over the minimum size or this is the root, just overwrite + * the value and return. 
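For concreteness, the capacity and min_count arithmetic can be checked with a tiny standalone program. The sizes below are hypothetical stand-ins; the real BTREE_LEAF_SIZE and header size come from btree.h.

#include <stdio.h>
#include <stdint.h>

/* Round x down to a multiple of align (a power of two), like P2ALIGN. */
#define	P2ALIGN_DOWN(x, align)	((x) & ~((uint64_t)(align) - 1))

int
main(void)
{
	/* Hypothetical sizes, for illustration only. */
	uint64_t leaf_size = 4096;	/* stand-in for BTREE_LEAF_SIZE */
	uint64_t hdr_size = 16;		/* stand-in for sizeof (btree_hdr_t) */
	uint64_t elem_size = 32;	/* e.g. a 32-byte element */

	uint64_t capacity = P2ALIGN_DOWN((leaf_size - hdr_size) / elem_size, 2);
	uint64_t min_count = capacity / 2;

	/* With the values above this prints capacity=126 min_count=63. */
	printf("capacity=%llu min_count=%llu\n",
	    (unsigned long long)capacity, (unsigned long long)min_count);
	return (0);
}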
+ */ + if (hdr->bth_count >= min_count || hdr->bth_parent == NULL) { + bmov(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + + idx * size, (hdr->bth_count - idx) * size); + if (hdr->bth_parent == NULL) { + ASSERT0(tree->bt_height); + if (hdr->bth_count == 0) { + tree->bt_root = NULL; + tree->bt_height--; + btree_node_destroy(tree, &leaf->btl_hdr); + } + } +#ifdef ZFS_DEBUG + if (tree->bt_root != NULL) + btree_poison_node_at(tree, hdr, hdr->bth_count); +#endif + btree_verify(tree); + return; + } + ASSERT3U(hdr->bth_count, ==, min_count - 1); + + /* + * Now we try to steal a node from a sibling. We check left, then + * right. If they exist and have more than the minimum number of + * elements, we move the separator betweeen us and them to our node + * and move their closest element (last for left, first for right) to + * the separator. Along the way we need to collapse the gap made by + * idx, and (for our right neighbor) the gap made by removing their + * first element. + * + * Note: this logic currently doesn't support stealing from a neighbor + * that isn't a sibling. This isn't critical functionality, but may be + * worth implementing in the future for completeness' sake. + */ + btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = btree_find_parent_idx(tree, hdr); + + btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can steal a node from the left neighbor. */ + ASSERT(!l_hdr->bth_core); + + /* + * Move our elements back by one spot to make room for the + * stolen element and overwrite the element being removed. + */ + bmov(leaf->btl_elems, leaf->btl_elems + size, idx * size); + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + uint8_t *steal_elem = ((btree_leaf_t *)l_hdr)->btl_elems + + (l_hdr->bth_count - 1) * size; + /* Move the separator to our first spot. */ + bmov(separator, leaf->btl_elems, size); + + /* Move our neighbor's last element to the separator. */ + bmov(steal_elem, separator, size); + + /* Update the bookkeeping. */ + l_hdr->bth_count--; + hdr->bth_count++; +#ifdef ZFS_DEBUG + btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); +#endif + btree_verify(tree); + return; + } + + btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can steal a node from the right neighbor. */ + ASSERT(!r_hdr->bth_core); + btree_leaf_t *neighbor = (btree_leaf_t *)r_hdr; + + /* + * Move our elements after the element being removed forwards + * by one spot to make room for the stolen element and + * overwrite the element being removed. + */ + bmov(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + + idx * size, (hdr->bth_count - idx) * size); + + uint8_t *separator = parent->btc_elems + parent_idx * size; + uint8_t *steal_elem = ((btree_leaf_t *)r_hdr)->btl_elems; + /* Move the separator between us to our last spot. */ + bmov(separator, leaf->btl_elems + hdr->bth_count * size, size); + + /* Move our neighbor's first element to the separator. */ + bmov(steal_elem, separator, size); + + /* Update the bookkeeping. */ + r_hdr->bth_count--; + hdr->bth_count++; + + /* + * Move our neighbors elements forwards to overwrite the + * stolen element. 
+ */ + bmov(neighbor->btl_elems + size, neighbor->btl_elems, + r_hdr->bth_count * size); +#ifdef ZFS_DEBUG + btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); +#endif + btree_verify(tree); + return; + } + + /* + * In this case, neither of our neighbors can spare an element, so we + * need to merge with one of them. We prefer the left one, + * arabitrarily. Move the separator into the leftmost merging node + * (which may be us or the left neighbor), and then move the right + * merging node's elements (skipping or overwriting idx, which we're + * deleting). Once that's done, go into the parent and delete the + * right merging node and the separator. This may cause further + * merging. + */ + btree_hdr_t *rm_hdr; + + if (l_hdr != NULL) { + ASSERT(!l_hdr->bth_core); + btree_leaf_t *left = (btree_leaf_t *)l_hdr; + + if (btree_verify_intensity >= 5) { + for (int i = 0; i < hdr->bth_count + 1; i++) { + btree_verify_poison_at(tree, l_hdr, + l_hdr->bth_count + i); + } + } + /* + * Move the separator into the first open spot in the left + * neighbor. + */ + uint8_t *out = left->btl_elems + l_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, out, size); + + /* Move our elements to the left neighbor. */ + out += size; + uint8_t *start = leaf->btl_elems; + uint64_t count = idx; + bmov(start, out, count * size); + + out += count * size; + start += (count + 1) * size; + count = hdr->bth_count - idx; + bmov(start, out, count * size); + + /* Update the bookkeeping. */ + l_hdr->bth_count += hdr->bth_count + 1; + ASSERT3U(l_hdr->bth_count, ==, min_count * 2); + rm_hdr = hdr; + } else { + ASSERT3P(r_hdr, !=, NULL); + ASSERT(!r_hdr->bth_core); + if (btree_verify_intensity >= 5) { + for (int i = 0; i < r_hdr->bth_count; i++) { + btree_verify_poison_at(tree, hdr, + hdr->bth_count + i + 1); + } + } + btree_leaf_t *right = (btree_leaf_t *)r_hdr; + + /* + * Move our elements left to overwrite the element being + * removed. + */ + uint8_t *start = leaf->btl_elems + (idx + 1) * size; + uint8_t *out = start - size; + uint64_t count = hdr->bth_count - idx; + bmov(start, out, count * size); + + /* Move the separator to node's first open spot. */ + out += count * size; + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, out, size); + + /* Move the right neighbor's elements to node. */ + out += size; + start = right->btl_elems; + count = r_hdr->bth_count; + bmov(start, out, count * size); + + /* Update the bookkeeping. */ + hdr->bth_count += r_hdr->bth_count + 1; + ASSERT3U(hdr->bth_count, ==, min_count * 2); + rm_hdr = r_hdr; + } + rm_hdr->bth_count = 0; + btree_node_destroy(tree, rm_hdr); + /* Remove the emptied node from the parent. */ + btree_remove_from_node(tree, parent, rm_hdr); + btree_verify(tree); +} + +/* Remove the given value from the tree. */ +void +btree_remove(btree_t *tree, const void *value) +{ + btree_index_t where = {0}; + VERIFY3P(btree_find(tree, value, &where), !=, NULL); + btree_remove_from(tree, &where); +} + +/* Return the number of elements in the tree. */ +ulong_t +btree_numnodes(btree_t *tree) +{ + return (tree->bt_num_elems); +} + +/* + * This function is used to visit all the elements in the tree before + * destroying the tree. This allows the calling code to perform any cleanup it + * needs to do. This is more efficient than just removing the first element + * over and over, because it removes all rebalancing. 
Once the destroy_nodes() + * function has been called, no other btree operations are valid until it + * returns NULL, which point the only valid operation is btree_destroy(). + * + * example: + * + * btree_index_t *cookie = NULL; + * my_data_t *node; + * + * while ((node = btree_destroy_nodes(tree, &cookie)) != NULL) + * free(node->ptr); + * btree_destroy(tree); + * + */ +void * +btree_destroy_nodes(btree_t *tree, btree_index_t **cookie) +{ + if (*cookie == NULL) { + if (tree->bt_height == -1) + return (NULL); + *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); + return (btree_first(tree, *cookie)); + } + + void *rval = btree_next_helper(tree, *cookie, *cookie, + btree_node_destroy); + if (rval == NULL) { + tree->bt_root = NULL; + tree->bt_height = -1; + tree->bt_num_elems = 0; + kmem_free(*cookie, sizeof (**cookie)); + tree->bt_bulk = NULL; + } + return (rval); +} + +static void +btree_clear_helper(btree_t *tree, btree_hdr_t *hdr) +{ + if (hdr->bth_core) { + btree_core_t *btc = (btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + btree_clear_helper(tree, btc->btc_children[i]); + } + } + + btree_node_destroy(tree, hdr); +} + +void +btree_clear(btree_t *tree) +{ + if (tree->bt_root == NULL) { + ASSERT0(tree->bt_num_elems); + return; + } + + btree_clear_helper(tree, tree->bt_root); + tree->bt_num_elems = 0; + tree->bt_root = NULL; + tree->bt_num_nodes = 0; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +void +btree_destroy(btree_t *tree) +{ + ASSERT0(tree->bt_num_elems); + ASSERT3P(tree->bt_root, ==, NULL); +} + +/* Verify that every child of this node has the correct parent pointer. */ +static void +btree_verify_pointers_helper(btree_t *tree, btree_hdr_t *hdr) +{ + if (!hdr->bth_core) + return; + + btree_core_t *node = (btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); + btree_verify_pointers_helper(tree, node->btc_children[i]); + } +} + +/* Verify that every node has the correct parent pointer. */ +static void +btree_verify_pointers(btree_t *tree) +{ + if (tree->bt_height == -1) { + VERIFY3P(tree->bt_root, ==, NULL); + return; + } + VERIFY3P(tree->bt_root->bth_parent, ==, NULL); + btree_verify_pointers_helper(tree, tree->bt_root); +} + +/* + * Verify that all the current node and its children satisfy the count + * invariants, and return the total count in the subtree rooted in this node. + */ +static uint64_t +btree_verify_counts_helper(btree_t *tree, btree_hdr_t *hdr) +{ + if (!hdr->bth_core) { + if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (btree_hdr_t)) / tree->bt_elem_size, 2); + VERIFY3U(hdr->bth_count, >=, capacity / 2); + } + + return (hdr->bth_count); + } else { + + btree_core_t *node = (btree_core_t *)hdr; + uint64_t ret = hdr->bth_count; + if (tree->bt_root != hdr && tree->bt_bulk == NULL) + VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2); + for (int i = 0; i <= hdr->bth_count; i++) { + ret += btree_verify_counts_helper(tree, + node->btc_children[i]); + } + + return (ret); + } +} + +/* + * Verify that all nodes satisfy the invariants and that the total number of + * elements is correct. + */ +static void +btree_verify_counts(btree_t *tree) +{ + EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); + if (tree->bt_height == -1) { + return; + } + VERIFY3P(btree_verify_counts_helper(tree, tree->bt_root), ==, + tree->bt_num_elems); +} + +/* + * Check that the subtree rooted at this node has a uniform height. 
Returns + * the number of nodes under this node, to help verify bt_num_nodes. + */ +static uint64_t +btree_verify_height_helper(btree_t *tree, btree_hdr_t *hdr, int64_t height) +{ + if (!hdr->bth_core) { + VERIFY0(height); + return (1); + } + + VERIFY(hdr->bth_core); + btree_core_t *node = (btree_core_t *)hdr; + uint64_t ret = 1; + for (int i = 0; i <= hdr->bth_count; i++) { + ret += btree_verify_height_helper(tree, node->btc_children[i], + height - 1); + } + return (ret); +} + +/* + * Check that the tree rooted at this node has a uniform height, and that the + * bt_height in the tree is correct. + */ +static void +btree_verify_height(btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + VERIFY3U(btree_verify_height_helper(tree, tree->bt_root, + tree->bt_height), ==, tree->bt_num_nodes); +} + +/* + * Check that the elements in this node are sorted, and that if this is a core + * node, the separators are properly between the subtrees they separaate and + * that the children also satisfy this requirement. + */ +static void +btree_verify_order_helper(btree_t *tree, btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + btree_leaf_t *leaf = (btree_leaf_t *)hdr; + for (int i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * + size, leaf->btl_elems + i * size), ==, -1); + } + return; + } + + btree_core_t *node = (btree_core_t *)hdr; + for (int i = 1; i < hdr->bth_count; i++) { + VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, + node->btc_elems + i * size), ==, -1); + } + for (int i = 0; i < hdr->bth_count; i++) { + uint8_t *left_child_last = NULL; + btree_hdr_t *left_child_hdr = node->btc_children[i]; + if (left_child_hdr->bth_core) { + btree_core_t *left_child = + (btree_core_t *)left_child_hdr; + left_child_last = left_child->btc_elems + + (left_child_hdr->bth_count - 1) * size; + } else { + btree_leaf_t *left_child = + (btree_leaf_t *)left_child_hdr; + left_child_last = left_child->btl_elems + + (left_child_hdr->bth_count - 1) * size; + } + if (tree->bt_compar(node->btc_elems + i * size, + left_child_last) != 1) { + panic("btree: compar returned %d (expected 1) at " + "%p %d: compar(%p, %p)", tree->bt_compar( + node->btc_elems + i * size, left_child_last), + (void *)node, i, (void *)(node->btc_elems + i * + size), (void *)left_child_last); + } + + uint8_t *right_child_first = NULL; + btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; + if (right_child_hdr->bth_core) { + btree_core_t *right_child = + (btree_core_t *)right_child_hdr; + right_child_first = right_child->btc_elems; + } else { + btree_leaf_t *right_child = + (btree_leaf_t *)right_child_hdr; + right_child_first = right_child->btl_elems; + } + if (tree->bt_compar(node->btc_elems + i * size, + right_child_first) != -1) { + panic("btree: compar returned %d (expected -1) at " + "%p %d: compar(%p, %p)", tree->bt_compar( + node->btc_elems + i * size, right_child_first), + (void *)node, i, (void *)(node->btc_elems + i * + size), (void *)right_child_first); + } + } + for (int i = 0; i <= hdr->bth_count; i++) { + btree_verify_order_helper(tree, node->btc_children[i]); + } +} + +/* Check that all elements in the tree are in sorted order. 
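Note that these order checks verify that bt_compar returns exactly -1 and 1, so comparators must be written three-way rather than as a raw subtraction. A minimal conforming comparator (names invented for illustration), in the same style as the metaslab comparators later in this patch:

#include <stdint.h>

typedef struct example_seg {
	uint64_t es_start;
	uint64_t es_end;
} example_seg_t;

/* Must return exactly -1, 0, or 1, never an arbitrary difference. */
int
example_seg_compare(const void *x1, const void *x2)
{
	const example_seg_t *s1 = x1;
	const example_seg_t *s2 = x2;

	if (s1->es_start < s2->es_start)
		return (-1);
	if (s1->es_start > s2->es_start)
		return (1);
	return (0);
}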
*/ +static void +btree_verify_order(btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + btree_verify_order_helper(tree, tree->bt_root); +} + +/* Check that all unused memory is poisoned correctly. */ +static void +btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + btree_leaf_t *leaf = (btree_leaf_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - + sizeof (btree_hdr_t); i++) { + VERIFY3U(leaf->btl_elems[i], ==, val); + } + } else { + btree_core_t *node = (btree_core_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; + i++) { + VERIFY3U(node->btc_elems[i], ==, val); + } + + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + VERIFY3P(node->btc_children[i], ==, + (btree_hdr_t *)0xabadb10cdeadbeef); + } + + for (int i = 0; i <= hdr->bth_count; i++) { + btree_verify_poison_helper(tree, node->btc_children[i]); + } + } +} + +/* Check that unused memory in the tree is still poisoned. */ +static void +btree_verify_poison(btree_t *tree) +{ +#ifdef ZFS_DEBUG + if (tree->bt_height == -1) + return; + btree_verify_poison_helper(tree, tree->bt_root); +#endif +} + +void +btree_verify(btree_t *tree) +{ + if (btree_verify_intensity == 0) + return; + btree_verify_height(tree); + if (btree_verify_intensity == 1) + return; + btree_verify_pointers(tree); + if (btree_verify_intensity == 2) + return; + btree_verify_counts(tree); + if (btree_verify_intensity == 3) + return; + btree_verify_order(tree); + + if (btree_verify_intensity == 4) + return; + btree_verify_poison(tree); +} diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 95132344c2a9..85dbc0664857 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -2218,7 +2218,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f1621c9c9b66..0646af57652f 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -279,7 +279,7 @@ struct dsl_scan_io_queue { /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - avl_tree_t q_exts_by_size; + btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; @@ -646,7 +646,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(btree_first(&q->q_exts_by_size, NULL), ==, + NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } @@ -1242,7 +1243,7 @@ dsl_scan_should_clear(dsl_scan_t *scn) queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* # extents in exts_by_size = # in exts_by_addr */ - mused += avl_numnodes(&queue->q_exts_by_size) * + mused += btree_numnodes(&queue->q_exts_by_size) * sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); @@ -2847,7 +2848,7 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; - SIO_SET_OFFSET(srch_sio, rs->rs_start); + SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, @@ -2859,10 +2860,12 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && - SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) { - ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start); - ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end); + while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr) && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, + queue->q_exts_by_addr)); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, + queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); @@ -2880,16 +2883,19 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. 
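The rs_get_start()/rs_get_end() calls threaded through this hunk take the owning range tree because segment decoding is now type dependent. A rough, illustrative model of why; the real accessors live in range_tree.h, and every type and field name below is a stand-in:

#include <stdint.h>

typedef struct toy_seg32 { uint32_t rs_start; uint32_t rs_end; } toy_seg32_t;
typedef struct toy_seg64 { uint64_t rs_start; uint64_t rs_end; } toy_seg64_t;

typedef enum { TOY_SEG32, TOY_SEG64 } toy_seg_type_t;

typedef struct toy_range_tree {
	toy_seg_type_t rt_type;
	uint64_t rt_start;	/* bias, e.g. the metaslab's ms_start */
	uint64_t rt_shift;	/* scale, e.g. the vdev's ashift */
} toy_range_tree_t;

/* Decoding a 32-bit segment needs tree-wide state, hence the rt argument. */
uint64_t
toy_rs_get_start(const void *rs, const toy_range_tree_t *rt)
{
	if (rt->rt_type == TOY_SEG32) {
		const toy_seg32_t *s = rs;
		return (((uint64_t)s->rs_start << rt->rt_shift) +
		    rt->rt_start);
	}
	return (((const toy_seg64_t *)rs)->rs_start);
}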
*/ - if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) { + if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr)) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, - SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio)); + SIO_GET_OFFSET(sio), rs_get_end(rs, + queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { - range_tree_remove(queue->q_exts_by_addr, rs->rs_start, - rs->rs_end - rs->rs_start); + uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); + uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); + range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); return (B_FALSE); } } @@ -2918,7 +2924,7 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) if (zfs_scan_issue_strategy == 1) { return (range_tree_first(queue->q_exts_by_addr)); } else if (zfs_scan_issue_strategy == 2) { - return (avl_first(&queue->q_exts_by_size)); + return (btree_first(&queue->q_exts_by_size, NULL)); } } @@ -2934,7 +2940,7 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) if (scn->scn_checkpointing) { return (range_tree_first(queue->q_exts_by_addr)); } else if (scn->scn_clearing) { - return (avl_first(&queue->q_exts_by_size)); + return (btree_first(&queue->q_exts_by_size, NULL)); } else { return (NULL); } @@ -4038,9 +4044,10 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, static int ext_size_compare(const void *x, const void *y) { - const range_seg_t *rsa = x, *rsb = y; - uint64_t sa = rsa->rs_end - rsa->rs_start, - sb = rsb->rs_end - rsb->rs_start; + const range_seg_gap_t *rsa = x, *rsb = y; + + uint64_t sa = rsa->rs_end - rsa->rs_start; + uint64_t sb = rsb->rs_end - rsb->rs_start; uint64_t score_a, score_b; score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * @@ -4083,8 +4090,8 @@ scan_io_queue_create(vdev_t *vd) q->q_vd = vd; q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, - &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); + q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index d4dacbc2f7f0..5d27ef8982db 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -36,6 +36,7 @@ #include #include #include +#include #define WITH_DF_BLOCK_ALLOCATOR @@ -183,6 +184,13 @@ int metaslab_df_free_pct = 4; */ int metaslab_df_max_search = 16 * 1024 * 1024; +/* + * Forces the metaslab_block_picker function to search for at least this many + * segments forwards until giving up on finding a segment that the allocation + * will fit into. + */ +uint32_t metaslab_min_search_count = 100; + /* * If we are not searching forward (due to metaslab_df_max_search, * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable @@ -276,18 +284,33 @@ uint64_t metaslab_trace_max_entries = 5000; */ int max_disabled_ms = 3; +/* + * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. + * To avoid 64-bit overflow, don't set above UINT32_MAX. + */ +unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ + /* * Maximum percentage of memory to use on storing loaded metaslabs. 
If loading * a metaslab would take it over this percentage, the oldest selected metaslab * is automatically unloaded. */ -int zfs_metaslab_mem_limit = 25; +int zfs_metaslab_mem_limit = 75; /* - * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. - * To avoid 64-bit overflow, don't set above UINT32_MAX. + * Force the per-metaslab range trees to use 64-bit integers to store + * segments. Used for debugging purposes. */ -unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */ +boolean_t zfs_metaslab_force_large_segs = B_FALSE; + +/* + * By default we only store segments over a certain size in the size-sorted + * metaslab trees (ms_allocatable_by_size and + * ms_unflushed_frees_by_size). This dramatically reduces memory usage and + * improves load and unload times at the cost of causing us to use slightly + * larger segments than we would otherwise in some cases. + */ +uint32_t metaslab_by_size_min_shift = 14; static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); @@ -299,8 +322,80 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); +static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; +kstat_t *metaslab_trace_ksp; +kstat_named_t metaslab_trace_over_limit; +kstat_t *metaslab_df_floor_ksp; +kstat_named_t metaslab_df_find_under_floor; +kstat_t *metaslab_reload_ksp; +kstat_named_t metaslab_reload_tree; + +void +metaslab_stat_init(void) +{ + ASSERT(metaslab_alloc_trace_cache == NULL); + metaslab_alloc_trace_cache = kmem_cache_create( + "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", + "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); + if (metaslab_trace_ksp != NULL) { + metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; + kstat_named_init(&metaslab_trace_over_limit, + "metaslab_trace_over_limit", KSTAT_DATA_UINT64); + kstat_install(metaslab_trace_ksp); + } + metaslab_df_floor_ksp = kstat_create("zfs", 0, + "metaslab_df_floor_stats", "misc", KSTAT_TYPE_NAMED, 1, + KSTAT_FLAG_VIRTUAL); + if (metaslab_df_floor_ksp != NULL) { + metaslab_df_floor_ksp->ks_data = &metaslab_df_find_under_floor; + kstat_named_init(&metaslab_df_find_under_floor, + "metaslab_df_find_under_floor", KSTAT_DATA_UINT64); + kstat_install(metaslab_df_floor_ksp); + } + metaslab_reload_ksp = kstat_create("zfs", 0, + "metaslab_reload_stats", "misc", KSTAT_TYPE_NAMED, 1, + KSTAT_FLAG_VIRTUAL); + if (metaslab_reload_ksp != NULL) { + metaslab_reload_ksp->ks_data = &metaslab_reload_tree; + kstat_named_init(&metaslab_reload_tree, + "metaslab_reload_tree", KSTAT_DATA_UINT64); + kstat_install(metaslab_reload_ksp); + } +} + +void +metaslab_stat_fini(void) +{ + if (metaslab_reload_ksp != NULL) { + kstat_delete(metaslab_reload_ksp); + metaslab_reload_ksp = NULL; + } + if (metaslab_df_floor_ksp != NULL) { + kstat_delete(metaslab_df_floor_ksp); + metaslab_df_floor_ksp = NULL; + } + if (metaslab_trace_ksp != NULL) { + kstat_delete(metaslab_trace_ksp); + metaslab_trace_ksp = NULL; + } + kmem_cache_destroy(metaslab_alloc_trace_cache); + metaslab_alloc_trace_cache = NULL; +} +#else + +void +metaslab_stat_init(void) +{ +} + +void 
+metaslab_stat_fini(void) +{ +} #endif /* @@ -1212,14 +1307,35 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, */ /* - * Comparison function for the private size-ordered tree. Tree is sorted - * by size, larger sizes at the end of the tree. + * Comparison function for the private size-ordered tree using 32-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. + */ +static int +metaslab_rangesize32_compare(const void *x1, const void *x2) +{ + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + int cmp = AVL_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(r1->rs_start, r2->rs_start)); +} + +/* + * Comparison function for the private size-ordered tree using 64-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ static int -metaslab_rangesize_compare(const void *x1, const void *x2) +metaslab_rangesize64_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = x1; - const range_seg_t *r2 = x2; + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; @@ -1229,6 +1345,131 @@ metaslab_rangesize_compare(const void *x1, const void *x2) return (AVL_CMP(r1->rs_start, r2->rs_start)); } +typedef struct metaslab_rt_arg { + btree_t *mra_bt; + uint32_t mra_floor_shift; +} metaslab_rt_arg_t; + +struct mssa_arg { + range_tree_t *rt; + metaslab_rt_arg_t *mra; +}; + +static void +metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) +{ + struct mssa_arg *mssap = arg; + range_tree_t *rt = mssap->rt; + metaslab_rt_arg_t *mrap = mssap->mra; + range_seg_max_t seg = {0}; + rs_set_start(&seg, rt, start); + rs_set_end(&seg, rt, start + size); + metaslab_rt_add(rt, &seg, mrap); +} + +static void +metaslab_size_tree_full_load(range_tree_t *rt) +{ + metaslab_rt_arg_t *mrap = rt->rt_arg; + btree_t *t = mrap->mra_bt; +#ifdef _METASLAB_TRACING + atomic_inc_64(&metaslab_reload_tree.value.ui64); +#endif + ASSERT0(btree_numnodes(t)); + mrap->mra_floor_shift = 0; + struct mssa_arg arg = {0}; + arg.rt = rt; + arg.mra = mrap; + range_tree_walk(rt, metaslab_size_sorted_add, &arg); +} + +/* + * Create any block allocator specific components. The current allocators + * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
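A toy model of the callback pattern these hooks implement may help: the range tree stays the authoritative offset-sorted structure, and an ops vtable mirrors each mutation into a secondary index (here just a byte counter, where the patch mirrors into a size-sorted B-tree). All types and names below are invented for illustration:

#include <stdint.h>

typedef struct toy_rt toy_rt_t;

typedef struct toy_rt_ops {
	void (*tro_add)(toy_rt_t *rt, uint64_t start, uint64_t size,
	    void *arg);
	void (*tro_remove)(toy_rt_t *rt, uint64_t start, uint64_t size,
	    void *arg);
} toy_rt_ops_t;

struct toy_rt {
	const toy_rt_ops_t *rt_ops;
	void *rt_arg;
};

typedef struct mirror_stats {
	uint64_t ms_bytes;	/* bytes currently mirrored */
} mirror_stats_t;

static void
mirror_add(toy_rt_t *rt, uint64_t start, uint64_t size, void *arg)
{
	(void) rt; (void) start;
	((mirror_stats_t *)arg)->ms_bytes += size;
}

static void
mirror_remove(toy_rt_t *rt, uint64_t start, uint64_t size, void *arg)
{
	(void) rt; (void) start;
	((mirror_stats_t *)arg)->ms_bytes -= size;
}

static const toy_rt_ops_t mirror_ops = {
	.tro_add = mirror_add,
	.tro_remove = mirror_remove,
};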
+ */ +/* ARGSUSED */ +static void +metaslab_rt_create(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + btree_t *size_tree = mrap->mra_bt; + + size_t size; + int (*compare) (const void *, const void *); + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = metaslab_rangesize32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = metaslab_rangesize64_compare; + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + btree_create(size_tree, compare, size); + mrap->mra_floor_shift = metaslab_by_size_min_shift; +} + +/* ARGSUSED */ +static void +metaslab_rt_destroy(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + btree_t *size_tree = mrap->mra_bt; + + btree_destroy(size_tree); + kmem_free(mrap, sizeof (*mrap)); +} + +/* ARGSUSED */ +static void +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < + (1 << mrap->mra_floor_shift)) + return; + + btree_add(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + mrap->mra_floor_shift)) + return; + + btree_remove(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_vacate(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + btree_t *size_tree = mrap->mra_bt; + btree_clear(size_tree); + btree_destroy(size_tree); + + metaslab_rt_create(rt, arg); +} + +static range_tree_ops_t metaslab_rt_ops = { + metaslab_rt_create, + metaslab_rt_destroy, + metaslab_rt_add, + metaslab_rt_remove, + metaslab_rt_vacate +}; /* * ========================================================================== @@ -1242,16 +1483,20 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t metaslab_largest_allocatable(metaslab_t *msp) { - avl_tree_t *t = &msp->ms_allocatable_by_size; + btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL) return (0); - rs = avl_last(t); + if (btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + + rs = btree_last(t, NULL); if (rs == NULL) return (0); - return (rs->rs_end - rs->rs_start); + return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, + msp->ms_allocatable)); } /* @@ -1266,7 +1511,9 @@ metaslab_largest_unflushed_free(metaslab_t *msp) if (msp->ms_unflushed_frees == NULL) return (0); - range_seg_t *rs = avl_last(&msp->ms_unflushed_frees_by_size); + if (btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_unflushed_frees); + range_seg_t *rs = btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); @@ -1293,8 +1540,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp) * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. 
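A compressed model of the size-filtering scheme implemented by metaslab_rt_add()/metaslab_rt_remove() and the lazy rebuild done by metaslab_size_tree_full_load() before btree_last(): only segments of at least 1 << metaslab_by_size_min_shift bytes (16 KiB with the default shift of 14) are mirrored into the size-sorted tree, and the filter is dropped only when an exhaustive by-size view is actually needed. Names below are stand-ins:

#include <stdint.h>
#include <stdbool.h>

typedef struct toy_size_index {
	uint64_t tsi_nnodes;		/* segments currently mirrored */
	uint32_t tsi_floor_shift;	/* min-shift filter, or 0 */
} toy_size_index_t;

/* Mirrors the floor test in metaslab_rt_add()/metaslab_rt_remove(). */
bool
toy_size_index_accepts(const toy_size_index_t *idx, uint64_t seg_len)
{
	return (seg_len >= ((uint64_t)1 << idx->tsi_floor_shift));
}

/*
 * Mirrors the btree_numnodes() == 0 checks before btree_last(): if no
 * segment qualified for the filtered view, rebuild it with floor 0
 * (a full load) before searching by size.
 */
bool
toy_size_index_needs_full_load(const toy_size_index_t *idx)
{
	return (idx->tsi_nnodes == 0);
}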
*/ - uint64_t rstart = rs->rs_start; - uint64_t rsize = rs->rs_end - rstart; + uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); + uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; @@ -1318,17 +1565,18 @@ metaslab_largest_unflushed_free(metaslab_t *msp) } static range_seg_t * -metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) +metaslab_block_find(btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, + btree_index_t *where) { - range_seg_t *rs, rsearch; - avl_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; - rsearch.rs_start = start; - rsearch.rs_end = start + size; + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, start + size); - rs = avl_find(t, &rsearch, &where); + rs = btree_find(t, &rsearch, where); if (rs == NULL) { - rs = avl_nearest(t, where, AVL_AFTER); + rs = btree_next(t, where, where); } return (rs); @@ -1337,27 +1585,34 @@ metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) /* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. + * This is a helper function that can be used by the allocator to find a + * suitable block to allocate. This will search the specified B-tree looking + * for a block that matches the specified criteria. */ static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, +metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { - range_seg_t *rs = metaslab_block_find(t, *cursor, size); + if (*cursor == 0) + *cursor = rt->rt_start; + btree_t *bt = &rt->rt_root; + btree_index_t where; + range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; + int count_searched = 0; if (rs != NULL) - first_found = rs->rs_start; + first_found = rs_get_start(rs, rt); - while (rs != NULL && rs->rs_start - first_found <= max_search) { - uint64_t offset = rs->rs_start; - if (offset + size <= rs->rs_end) { + while (rs != NULL && (rs_get_start(rs, rt) - first_found <= + max_search || count_searched < metaslab_min_search_count)) { + uint64_t offset = rs_get_start(rs, rt); + if (offset + size <= rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } - rs = AVL_NEXT(t, rs); + rs = btree_next(bt, &where, &where); + count_searched++; } *cursor = 0; @@ -1403,8 +1658,6 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(&rt->rt_root), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); /* * If we're running low on space, find a segment based on size, @@ -1414,22 +1667,33 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) free_pct < metaslab_df_free_pct) { offset = -1; } else { - offset = metaslab_block_picker(&rt->rt_root, + offset = metaslab_block_picker(rt, cursor, size, metaslab_df_max_search); } if (offset == -1) { range_seg_t *rs; + if (btree_numnodes(&msp->ms_allocatable_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ - rs = avl_last(&msp->ms_allocatable_by_size); + rs = btree_last(&msp->ms_allocatable_by_size, NULL); } else { + btree_index_t where; /* use segment of this size, or next largest */ +#ifdef 
_METASLAB_TRACING + metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; + if (size < (1 << mrap->mra_floor_shift)) { + atomic_inc_64( + &metaslab_df_find_under_floor.value.ui64); + } +#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, - 0, size); + rt, msp->ms_start, size, &where); } - if (rs != NULL && rs->rs_start + size <= rs->rs_end) { - offset = rs->rs_start; + if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, + rt)) { + offset = rs_get_start(rs, rt); *cursor = offset + size; } } @@ -1458,25 +1722,27 @@ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; - avl_tree_t *t = &msp->ms_allocatable_by_size; + btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + if (btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + rs = btree_last(t, NULL); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < + size) return (-1ULL); - *cursor = rs->rs_start; - *cursor_end = rs->rs_end; + *cursor = rs_get_start(rs, rt); + *cursor_end = rs_get_end(rs, rt); } offset = *cursor; @@ -1511,39 +1777,40 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &msp->ms_allocatable->rt_root; - avl_index_t where; - range_seg_t *rs, rsearch; + btree_t *t = &msp->ms_allocatable->rt_root; + range_tree_t *rt = msp->ms_allocatable; + btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); ASSERT(MUTEX_HELD(&msp->ms_lock)); - ASSERT3U(avl_numnodes(t), ==, - avl_numnodes(&msp->ms_allocatable_by_size)); if (max_size < size) return (-1ULL); - rsearch.rs_start = *cursor; - rsearch.rs_end = *cursor + size; + rs_set_start(&rsearch, rt, *cursor); + rs_set_end(&rsearch, rt, *cursor + size); - rs = avl_find(t, &rsearch, &where); - if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + rs = btree_find(t, &rsearch, &where); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; - rsearch.rs_start = 0; - rsearch.rs_end = MIN(max_size, - 1ULL << (hbit + metaslab_ndf_clump_shift)); - rs = avl_find(t, &rsearch, &where); + rs_set_start(&rsearch, rt, 0); + rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + metaslab_ndf_clump_shift))); + + rs = btree_find(t, &rsearch, &where); if (rs == NULL) - rs = avl_nearest(t, where, AVL_AFTER); + rs = btree_next(t, &where, &where); ASSERT(rs != NULL); } - if ((rs->rs_end - rs->rs_start) >= size) { - *cursor = rs->rs_start + size; - return (rs->rs_start); + if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { + *cursor = rs_get_start(rs, rt) + size; + return (rs_get_start(rs, rt)); } return (-1ULL); } @@ -1889,9 +2156,9 @@ metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); - extern kmem_cache_t *range_seg_cache; - uint64_t inuse = range_seg_cache->skc_obj_total; - uint64_t size = range_seg_cache->skc_obj_size; + extern kmem_cache_t *btree_leaf_cache; + uint64_t inuse = 
btree_leaf_cache->skc_obj_total; + uint64_t size = btree_leaf_cache->skc_obj_size; int tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; @@ -1928,7 +2195,7 @@ metaslab_potentially_evict(metaslab_class_t *mc) */ if (msp->ms_loading) { msp = next_msp; - inuse = range_seg_cache->skc_obj_total; + inuse = btree_leaf_cache->skc_obj_total; continue; } /* @@ -1950,7 +2217,7 @@ metaslab_potentially_evict(metaslab_class_t *mc) } mutex_exit(&msp->ms_lock); msp = next_msp; - inuse = range_seg_cache->skc_obj_total; + inuse = btree_leaf_cache->skc_obj_total; } } #endif @@ -1993,10 +2260,39 @@ metaslab_load_impl(metaslab_t *msp) mutex_exit(&msp->ms_lock); hrtime_t load_start = gethrtime(); + metaslab_rt_arg_t *mrap; + if (msp->ms_allocatable->rt_arg == NULL) { + mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + } else { + mrap = msp->ms_allocatable->rt_arg; + msp->ms_allocatable->rt_ops = NULL; + msp->ms_allocatable->rt_arg = NULL; + } + mrap->mra_bt = &msp->ms_allocatable_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + if (msp->ms_sm != NULL) { error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, SM_FREE, length); + + /* Now, populate the size-sorted tree. */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; + + struct mssa_arg arg = {0}; + arg.rt = msp->ms_allocatable; + arg.mra = mrap; + range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, + &arg); } else { + /* + * Add the size-sorted tree first, since we don't need to load + * the metaslab from the spacemap. + */ + metaslab_rt_create(msp->ms_allocatable, mrap); + msp->ms_allocatable->rt_ops = &metaslab_rt_ops; + msp->ms_allocatable->rt_arg = mrap; /* * The space map has not been allocated yet, so treat * all the space in the metaslab as free and add it to the @@ -2252,6 +2548,29 @@ metaslab_unload(metaslab_t *msp) metaslab_recalculate_weight_and_sort(msp); } +/* + * We want to optimize the memory use of the per-metaslab range + * trees. To do this, we store the segments in the range trees in + * units of sectors, zero-indexing from the start of the metaslab. If + * the vdev_ms_shift - the vdev_ashift is less than 32, we can store + * the ranges using two uint32_ts, rather than two uint64_ts. + */ +static range_seg_type_t +metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, + uint64_t *start, uint64_t *shift) +{ + if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && + !zfs_metaslab_force_large_segs) { + *shift = vdev->vdev_ashift; + *start = msp->ms_start; + return (RANGE_SEG32); + } else { + *shift = 0; + *start = 0; + return (RANGE_SEG64); + } +} + void metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg) { @@ -2327,6 +2646,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); + /* * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves @@ -2335,10 +2658,9 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, * we'd data fault on any attempt to use this metaslab before * it's ready. 
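The RANGE_SEG32 choice made by metaslab_calculate_range_tree_type() above works because every offset a metaslab-level range tree stores can be rebased against the metaslab's start and expressed in units of the vdev's sector size. A standalone sketch of the arithmetic the new rs_get_*()/rs_set_*() accessors perform for a 32-bit tree (the helper names below are illustrative, not part of the patch):

#include <stdint.h>
#include <assert.h>

/*
 * Illustrative only: how a 64-bit byte offset maps to the 32-bit raw
 * value held in a RANGE_SEG32 segment.  rt_start is the metaslab's
 * starting offset and rt_shift is the vdev's ashift, as chosen by
 * metaslab_calculate_range_tree_type().
 */
static uint32_t
seg32_to_raw(uint64_t off, uint64_t rt_start, uint64_t rt_shift)
{
	uint64_t raw = (off - rt_start) >> rt_shift;

	/* Guaranteed to fit when vdev_ms_shift - vdev_ashift < 32. */
	assert(raw <= UINT32_MAX);
	return ((uint32_t)raw);
}

static uint64_t
seg32_from_raw(uint32_t raw, uint64_t rt_start, uint64_t rt_shift)
{
	return (((uint64_t)raw << rt_shift) + rt_start);
}

With that encoding a segment's start and end fit in two uint32_ts instead of two uint64_ts, and, together with packing segments into B-tree leaves instead of individual AVL nodes, that is where most of the memory savings in this patch comes from.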
 	 */
-	ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
-	    &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
+	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);

-	ms->ms_trim = range_tree_create(NULL, NULL);
+	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);

 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
@@ -2393,7 +2715,7 @@ metaslab_unflushed_changes_memused(metaslab_t *ms)
 {
 	return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
 	    range_tree_numsegs(ms->ms_unflushed_frees)) *
-	    sizeof (range_seg_t));
+	    ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 }

 void
@@ -3187,7 +3509,7 @@ metaslab_should_condense(metaslab_t *msp)
 	 * We always condense metaslabs that are empty and metaslabs for
 	 * which a condense request has been made.
 	 */
-	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+	if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
 	    msp->ms_condense_wanted)
 		return (B_TRUE);

@@ -3233,28 +3555,29 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	 * So to truncate the space map to represent all the entries of
 	 * previous TXGs we do the following:
 	 *
-	 * 1] We create a range tree (condense tree) that is 100% allocated.
-	 * 2] We remove from it all segments found in the ms_defer trees
+	 * 1] We create a range tree (condense tree) that is 100% empty.
+	 * 2] We add to it all segments found in the ms_defer trees
 	 *    as those segments are marked as free in the original space
 	 *    map. We do the same with the ms_allocating trees for the same
-	 *    reason. Removing these segments should be a relatively
+	 *    reason. Adding these segments should be a relatively
 	 *    inexpensive operation since we expect these trees to have a
 	 *    small number of nodes.
-	 * 3] We vacate any unflushed allocs as they should already exist
-	 *    in the condense tree. Then we vacate any unflushed frees as
-	 *    they should already be part of ms_allocatable.
-	 * 4] At this point, we would ideally like to remove all segments
+	 * 3] We vacate any unflushed allocs, since they are not frees we
+	 *    need to add to the condense tree. Then we vacate any
+	 *    unflushed frees as they should already be part of ms_allocatable.
+	 * 4] At this point, we would ideally like to add all segments
 	 *    in the ms_allocatable tree from the condense tree. This way
 	 *    we would write all the entries of the condense tree as the
-	 *    condensed space map, which would only contain allocated
-	 *    segments with everything else assumed to be freed.
+	 *    condensed space map, which would only contain freed
+	 *    segments with everything else assumed to be allocated.
 	 *
 	 *    Doing so can be prohibitively expensive as ms_allocatable can
-	 *    be large, and therefore computationally expensive to subtract
-	 *    from the condense_tree. Instead we first sync out the
-	 *    condense_tree and then the ms_allocatable, in the condensed
-	 *    space map. While this is not optimal, it is typically close to
-	 *    optimal and more importantly much cheaper to compute.
+	 *    be large, and therefore computationally expensive to add to
+	 *    the condense_tree. Instead we first sync out an entry marking
+	 *    everything as allocated, then the condense_tree and then the
+	 *    ms_allocatable, in the condensed space map. While this is not
+	 *    optimal, it is typically close to optimal and more importantly
+	 *    much cheaper to compute.
* * 5] Finally, as both of the unflushed trees were written to our * new and condensed metaslab space map, we basically flushed @@ -3268,22 +3591,26 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, spa->spa_name, space_map_length(msp->ms_sm), - avl_numnodes(&msp->ms_allocatable->rt_root), + range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - condense_tree = range_tree_create(NULL, NULL); - range_tree_add(condense_tree, msp->ms_start, msp->ms_size); + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, + &start, &shift); + + condense_tree = range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { range_tree_walk(msp->ms_defer[t], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_remove, condense_tree); + range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, @@ -3331,11 +3658,17 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ - space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, + shift); + range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); + space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); + space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); + range_tree_vacate(tmp_tree, NULL, NULL); + range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; @@ -3578,7 +3911,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) return; - VERIFY(txg <= spa_final_dirty_txg(spa)); + VERIFY3U(txg, <=, spa_final_dirty_txg(spa)); /* * The only state that can actually be changing concurrently @@ -3867,32 +4200,46 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * range trees and add its capacity to the vdev. 
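Stepping back to the condense rewrite above: a reader replaying the condensed space map now sees, in order, an ALLOC record covering the whole metaslab, a FREE record for everything in ms_allocatable, and a FREE record for the condense tree (which now holds the ms_defer and open-txg ms_allocating segments). The reconstructed state is therefore

    free = ms_allocatable ∪ ms_defer[*] ∪ ms_allocating[*]

which is the same set the old allocated-based condense tree described; only the representation written to disk changes, not the resulting space map contents.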
*/ if (msp->ms_freed == NULL) { + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, msp, &start, + &shift); + for (int t = 0; t < TXG_SIZE; t++) { ASSERT(msp->ms_allocating[t] == NULL); - msp->ms_allocating[t] = range_tree_create(NULL, NULL); + msp->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); } ASSERT3P(msp->ms_freeing, ==, NULL); - msp->ms_freeing = range_tree_create(NULL, NULL); + msp->ms_freeing = range_tree_create(NULL, type, NULL, start, + shift); ASSERT3P(msp->ms_freed, ==, NULL); - msp->ms_freed = range_tree_create(NULL, NULL); + msp->ms_freed = range_tree_create(NULL, type, NULL, start, + shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { ASSERT3P(msp->ms_defer[t], ==, NULL); - msp->ms_defer[t] = range_tree_create(NULL, NULL); + msp->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); } ASSERT3P(msp->ms_checkpointing, ==, NULL); - msp->ms_checkpointing = range_tree_create(NULL, NULL); + msp->ms_checkpointing = range_tree_create(NULL, type, NULL, + start, shift); ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); - msp->ms_unflushed_allocs = range_tree_create(NULL, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, + start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &msp->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; ASSERT3P(msp->ms_unflushed_frees, ==, NULL); - msp->ms_unflushed_frees = range_tree_create_impl(&rt_avl_ops, - &msp->ms_unflushed_frees_by_size, - metaslab_rangesize_compare, 0); + msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); } @@ -4052,36 +4399,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * ========================================================================== */ #ifdef _METASLAB_TRACING -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; - -void -metaslab_alloc_trace_init(void) -{ - ASSERT(metaslab_alloc_trace_cache == NULL); - metaslab_alloc_trace_cache = kmem_cache_create( - "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } -} - -void -metaslab_alloc_trace_fini(void) -{ - if (metaslab_trace_ksp != NULL) { - kstat_delete(metaslab_trace_ksp); - metaslab_trace_ksp = NULL; - } - kmem_cache_destroy(metaslab_alloc_trace_cache); - metaslab_alloc_trace_cache = NULL; -} /* * Add an allocation trace element to the allocation tracing list. @@ -4160,16 +4477,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) #define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) -void -metaslab_alloc_trace_init(void) -{ -} - -void -metaslab_alloc_trace_fini(void) -{ -} - void metaslab_trace_init(zio_alloc_list_t *zal) { diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 0e12972148bf..254629dfe3b8 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -74,42 +74,38 @@ * support removing complete segments. 
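Note the mra_floor_shift field wired up just above for ms_unflushed_frees (and earlier for ms_allocatable): the size-sorted B-tree only exists to answer "largest free segment" style queries, so segments smaller than 2^metaslab_by_size_min_shift need not be indexed at all and can be filled in later by metaslab_size_tree_full_load() if the full view is ever required. A sketch of what such an add callback could look like under that scheme (an approximation for illustration, not the actual metaslab_rt_ops callback from this patch):

static void
example_metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_rt_arg_t *mrap = arg;
	uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);

	/* Segments below the floor are not worth tracking by size. */
	if (size < (1ULL << mrap->mra_floor_shift))
		return;

	btree_add(mrap->mra_bt, rs);
}

Skipping the small segments can keep the secondary tree a fraction of the size of the primary one on fragmented metaslabs, at the cost of the occasional full reload seen in metaslab_df_alloc() and metaslab_cf_alloc() above.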
*/ -kmem_cache_t *range_seg_cache; - -/* Generic ops for managing an AVL tree alongside a range tree */ -struct range_tree_ops rt_avl_ops = { - .rtop_create = rt_avl_create, - .rtop_destroy = rt_avl_destroy, - .rtop_add = rt_avl_add, - .rtop_remove = rt_avl_remove, - .rtop_vacate = rt_avl_vacate, -}; - -void -range_tree_init(void) -{ - ASSERT(range_seg_cache == NULL); - range_seg_cache = kmem_cache_create("range_seg_cache", - sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -range_tree_fini(void) -{ - kmem_cache_destroy(range_seg_cache); - range_seg_cache = NULL; +static inline void +rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + size_t size = 0; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + VERIFY(0); + } + bcopy(src, dest, size); } void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; + btree_index_t where; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; - for (rs = avl_first(&rt->rt_root); rs != NULL; - rs = AVL_NEXT(&rt->rt_root, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (rs = btree_first(&rt->rt_root, &where); rs != NULL; + rs = btree_next(&rt->rt_root, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; hist[idx]++; @@ -128,7 +124,7 @@ range_tree_stat_verify(range_tree_t *rt) static void range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -142,7 +138,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) static void range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) { - uint64_t size = rs->rs_end - rs->rs_start; + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -153,14 +149,35 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } -/* - * NOTE: caller is responsible for all locking. 
- */ static int -range_tree_seg_compare(const void *x1, const void *x2) +range_tree_seg32_compare(const void *x1, const void *x2) +{ + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg_gap_compare(const void *x1, const void *x2) { - const range_seg_t *r1 = (const range_seg_t *)x1; - const range_seg_t *r2 = (const range_seg_t *)x2; + const range_seg_gap_t *r1 = x1; + const range_seg_gap_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -169,18 +186,42 @@ range_tree_seg_compare(const void *x1, const void *x2) } range_tree_t * -range_tree_create_impl(range_tree_ops_t *ops, void *arg, - int (*avl_compare) (const void *, const void *), uint64_t gap) +range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, + uint64_t start, uint64_t shift, + int (*btree_compare) (const void *, const void *), + uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); - avl_create(&rt->rt_root, range_tree_seg_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); + ASSERT3U(shift, <, 64); + ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); + size_t size; + int (*compare) (const void *, const void *); + switch (type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = range_tree_seg32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = range_tree_seg64_compare; + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + compare = range_tree_seg_gap_compare; + break; + default: + panic("Invalid range seg type %d", type); + } + btree_create(&rt->rt_root, compare, size); rt->rt_ops = ops; rt->rt_gap = gap; rt->rt_arg = arg; - rt->rt_avl_compare = avl_compare; + rt->rt_type = type; + rt->rt_start = start; + rt->rt_shift = shift; + rt->rt_btree_compare = btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -189,9 +230,10 @@ range_tree_create_impl(range_tree_ops_t *ops, void *arg, } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) +range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_impl(ops, arg, NULL, 0)); + return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); } void @@ -202,19 +244,20 @@ range_tree_destroy(range_tree_t *rt) if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - avl_destroy(&rt->rt_root); + btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) { - ASSERT3U(rs->rs_fill + delta, !=, 0); - ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + ASSERT3U(rs_get_fill(rs, rt) + delta, !=, 0); + ASSERT3U(rs_get_fill(rs, rt) + delta, <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_fill += delta; + rs_set_fill(rs, rt, 
rs_get_fill(rs, rt) + delta); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } @@ -223,28 +266,20 @@ static void range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; - avl_index_t where; - range_seg_t rsearch, *rs_before, *rs_after, *rs; + btree_index_t where; + range_seg_t *rs_before, *rs_after, *rs; + range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; ASSERT3U(size, !=, 0); ASSERT3U(fill, <=, size); + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); - - if (gap == 0 && rs != NULL && - rs->rs_start <= start && rs->rs_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); - return; - } + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = btree_find(&rt->rt_root, &rsearch, &where); /* * If this is a gap-supporting range tree, it is possible that we @@ -255,27 +290,28 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * the normal code paths. */ if (rs != NULL) { + ASSERT3U(rt->rt_gap, !=, 0); + uint64_t rstart = rs_get_start(rs, rt); + uint64_t rend = rs_get_end(rs, rt); ASSERT3U(gap, !=, 0); - if (rs->rs_start <= start && rs->rs_end >= end) { + if (rstart <= start && rend >= end) { range_tree_adjust_fill(rt, rs, fill); return; } - avl_remove(&rt->rt_root, rs); + btree_remove(&rt->rt_root, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); range_tree_stat_decr(rt, rs); - rt->rt_space -= rs->rs_end - rs->rs_start; + rt->rt_space -= rend - rstart; - fill += rs->rs_fill; - start = MIN(start, rs->rs_start); - end = MAX(end, rs->rs_end); + fill += rs_get_fill(rs, rt); + start = MIN(start, rstart); + end = MAX(end, rend); size = end - start; range_tree_add_impl(rt, start, size, fill); - - kmem_cache_free(range_seg_cache, rs); return; } @@ -286,19 +322,21 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. 
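A hand-written usage sketch of the merging rules this function implements (not test code from the patch); with a gap of zero, touching segments coalesce into one:

static void
example_range_tree_merging(void)
{
	range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);

	range_tree_add(rt, 0x0000, 0x1000);
	range_tree_add(rt, 0x1000, 0x1000);	/* touches the first segment */

	/* The two adds above collapse into a single [0, 0x2000) segment. */
	ASSERT3U(range_tree_numsegs(rt), ==, 1);
	ASSERT3U(range_tree_space(rt), ==, 0x2000);

	range_tree_add(rt, 0x3000, 0x1000);	/* not adjacent: new segment */
	ASSERT3U(range_tree_numsegs(rt), ==, 2);

	range_tree_vacate(rt, NULL, NULL);
	range_tree_destroy(rt);
}

For a gap-supporting tree (rt_gap != 0) the same bridging happens even across a small hole, and the bridged bytes are counted in the segment's span but not in its fill, which is exactly what the rs_fill bookkeeping in this function maintains.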
*/ - rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); - rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); + btree_index_t where_before, where_after; + rs_before = btree_prev(&rt->rt_root, &where, &where_before); + rs_after = btree_next(&rt->rt_root, &where, &where_after); - merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); - merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= + start - gap); + merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + + gap); if (merge_before && gap != 0) - bridge_size += start - rs_before->rs_end; + bridge_size += start - rs_get_end(rs_before, rt); if (merge_after && gap != 0) - bridge_size += rs_after->rs_start - end; + bridge_size += rs_get_start(rs_after, rt) - end; if (merge_before && merge_after) { - avl_remove(&rt->rt_root, rs_before); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); @@ -307,9 +345,19 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += rs_before->rs_fill + fill; - rs_after->rs_start = rs_before->rs_start; - kmem_cache_free(range_seg_cache, rs_before); + rs_copy(rs_after, &tmp, rt); + uint64_t before_start = rs_get_start_raw(rs_before, rt); + uint64_t before_fill = rs_get_fill(rs_before, rt); + uint64_t after_fill = rs_get_fill(rs_after, rt); + btree_remove_from(&rt->rt_root, &where_before); + + /* + * We have to re-find the node because our old reference is + * invalid as soon as we do any mutating btree operations. + */ + rs_after = btree_find(&rt->rt_root, &tmp, &where_after); + rs_set_start_raw(rs_after, rt, before_start); + rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -317,8 +365,9 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_before); - rs_before->rs_fill += fill; - rs_before->rs_end = end; + uint64_t before_fill = rs_get_fill(rs_before, rt); + rs_set_end(rs_before, rt, end); + rs_set_fill(rs_before, rt, before_fill + fill); rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) @@ -326,22 +375,26 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) range_tree_stat_decr(rt, rs_after); - rs_after->rs_fill += fill; - rs_after->rs_start = start; + uint64_t after_fill = rs_get_fill(rs_after, rt); + rs_set_start(rs_after, rt, start); + rs_set_fill(rs_after, rt, after_fill + fill); rs = rs_after; } else { - rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + rs = &tmp; - rs->rs_fill = fill; - rs->rs_start = start; - rs->rs_end = end; - avl_insert(&rt->rt_root, rs, where); + rs_set_start(rs, rt, start); + rs_set_end(rs, rt, end); + rs_set_fill(rs, rt, fill); + btree_insert(&rt->rt_root, rs, &where); } - if (gap != 0) - ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); - else - ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + if (gap != 0) { + ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } else { + ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, 
rt->rt_arg); @@ -360,22 +413,25 @@ static void range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { - avl_index_t where; - range_seg_t rsearch, *rs, *newseg; + btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); - rsearch.rs_start = start; - rsearch.rs_end = end; - rs = avl_find(&rt->rt_root, &rsearch, &where); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", + zfs_panic_recover("zfs: removing nonexistent segment from " + "range tree (offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size); return; } @@ -388,30 +444,32 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, */ if (rt->rt_gap != 0) { if (do_fill) { - if (rs->rs_fill == size) { - start = rs->rs_start; - end = rs->rs_end; + if (rs_get_fill(rs, rt) == size) { + start = rs_get_start(rs, rt); + end = rs_get_end(rs, rt); size = end - start; } else { range_tree_adjust_fill(rt, rs, -size); return; } - } else if (rs->rs_start != start || rs->rs_end != end) { + } else if (rs_get_start(rs, rt) != start || + rs_get_end(rs, rt) != end) { zfs_panic_recover("zfs: freeing partial segment of " "gap tree (offset=%llu size=%llu) of " "(offset=%llu size=%llu)", (longlong_t)start, (longlong_t)size, - (longlong_t)rs->rs_start, - (longlong_t)rs->rs_end - rs->rs_start); + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, + rt)); return; } } - VERIFY3U(rs->rs_start, <=, start); - VERIFY3U(rs->rs_end, >=, end); + VERIFY3U(rs_get_start(rs, rt), <=, start); + VERIFY3U(rs_get_end(rs, rt), >=, end); - left_over = (rs->rs_start != start); - right_over = (rs->rs_end != end); + left_over = (rs_get_start(rs, rt) != start); + right_over = (rs_get_end(rs, rt) != end); range_tree_stat_decr(rt, rs); @@ -419,24 +477,33 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { - newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); - newseg->rs_start = end; - newseg->rs_end = rs->rs_end; - newseg->rs_fill = newseg->rs_end - newseg->rs_start; - range_tree_stat_incr(rt, newseg); + range_seg_max_t newseg; + rs_set_start(&newseg, rt, end); + rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); + rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); + range_tree_stat_incr(rt, &newseg); - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + + rs_copy(rs, &rs_tmp, rt); + if (btree_next(&rt->rt_root, &where, &where) != NULL) + btree_insert(&rt->rt_root, &newseg, &where); + else + btree_add(&rt->rt_root, &newseg); - avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); } else if (left_over) { - rs->rs_end = start; + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + rs_copy(rs, &rs_tmp, rt); } else if (right_over) { - rs->rs_start = end; + // This modifies the buffer already inside the range tree + rs_set_start(rs, 
rt, end); + rs_copy(rs, &rs_tmp, rt); } else { - avl_remove(&rt->rt_root, rs); - kmem_cache_free(range_seg_cache, rs); + btree_remove_from(&rt->rt_root, &where); rs = NULL; } @@ -446,11 +513,12 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. */ - rs->rs_fill = rs->rs_end - rs->rs_start; - range_tree_stat_incr(rt, rs); + rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - + rs_get_start_raw(rs, rt)); + range_tree_stat_incr(rt, &rs_tmp); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) - rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); } rt->rt_space -= size; @@ -472,14 +540,14 @@ void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, uint64_t newstart, uint64_t newsize) { - int64_t delta = newsize - (rs->rs_end - rs->rs_start); + int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs->rs_start = newstart; - rs->rs_end = newstart + newsize; + rs_set_start(rs, rt, newstart); + rs_set_end(rs, rt, newstart + newsize); range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) @@ -491,22 +559,27 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_t rsearch; + range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); - rsearch.rs_start = start; - rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, NULL)); + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + return (btree_find(&rt->rt_root, &rsearch, NULL)); } -range_seg_t * +static range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + range_seg_t *rs = range_tree_find_impl(rt, start, size); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) + if (rs != NULL && rs_get_start(rs, rt) <= start && + rs_get_end(rs, rt) >= start + size) { return (rs); + } return (NULL); } @@ -533,24 +606,28 @@ boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize) { - range_seg_t rsearch; - rsearch.rs_start = start; - rsearch.rs_end = start + 1; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + + range_seg_max_t rsearch; + rs_set_start(&rsearch, rt, start); + rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); - avl_index_t where; - range_seg_t *rs = avl_find(&rt->rt_root, &rsearch, &where); + btree_index_t where; + range_seg_t *rs = btree_find(&rt->rt_root, &rsearch, &where); if (rs != NULL) { *ostart = start; - *osize = MIN(size, rs->rs_end - start); + *osize = MIN(size, rs_get_end(rs, rt) - start); return (B_TRUE); } - rs = avl_nearest(&rt->rt_root, where, AVL_AFTER); - if (rs == NULL || rs->rs_start > start + size) + rs = btree_next(&rt->rt_root, &where, &where); + if (rs == NULL || rs_get_start(rs, rt) > start + size) return (B_FALSE); - *ostart = rs->rs_start; - *osize = MIN(start + size, rs->rs_end) - rs->rs_start; + *ostart = rs_get_start(rs, rt); + *osize = MIN(start + size, rs_get_end(rs, rt)) - + rs_get_start(rs, rt); return (B_TRUE); } @@ -566,9 +643,12 @@ range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) if (size == 0) 
return; + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs->rs_start, start); - uint64_t free_end = MIN(rs->rs_end, start + size); + uint64_t free_start = MAX(rs_get_start(rs, rt), start); + uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); range_tree_remove(rt, free_start, free_end - free_start); } } @@ -579,7 +659,7 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); - ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); + ASSERT0(btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; @@ -589,16 +669,20 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) { - range_seg_t *rs; - void *cookie = NULL; - if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); - while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - if (func != NULL) - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); - kmem_cache_free(range_seg_cache, rs); + if (func != NULL) { + range_seg_t *rs; + btree_index_t *cookie = NULL; + + while ((rs = btree_destroy_nodes(&rt->rt_root, &cookie)) != + NULL) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } + } else { + btree_clear(&rt->rt_root); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); @@ -608,16 +692,18 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - for (range_seg_t *rs = avl_first(&rt->rt_root); rs; - rs = AVL_NEXT(&rt->rt_root, rs)) { - func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + btree_index_t where; + for (range_seg_t *rs = btree_first(&rt->rt_root, &where); rs != NULL; + rs = btree_next(&rt->rt_root, &where, &where)) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); } } range_seg_t * range_tree_first(range_tree_t *rt) { - return (avl_first(&rt->rt_root)); + return (btree_first(&rt->rt_root, NULL)); } uint64_t @@ -629,7 +715,7 @@ range_tree_space(range_tree_t *rt) uint64_t range_tree_numsegs(range_tree_t *rt) { - return ((rt == NULL) ? 0 : avl_numnodes(&rt->rt_root)); + return ((rt == NULL) ? 0 : btree_numnodes(&rt->rt_root)); } boolean_t @@ -639,71 +725,76 @@ range_tree_is_empty(range_tree_t *rt) return (range_tree_space(rt) == 0); } -/* Generic range tree functions for maintaining segments in an AVL tree. 
*/ +/* ARGSUSED */ void -rt_avl_create(range_tree_t *rt, void *arg) -{ - avl_tree_t *tree = arg; - - avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), - offsetof(range_seg_t, rs_pp_node)); +rt_btree_create(range_tree_t *rt, void *arg) +{ + btree_t *size_tree = arg; + + size_t size; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + btree_create(size_tree, rt->rt_btree_compare, size); } +/* ARGSUSED */ void -rt_avl_destroy(range_tree_t *rt, void *arg) +rt_btree_destroy(range_tree_t *rt, void *arg) { - avl_tree_t *tree = arg; + btree_t *size_tree = arg; + ASSERT0(btree_numnodes(size_tree)); - ASSERT0(avl_numnodes(tree)); - avl_destroy(tree); + btree_destroy(size_tree); } +/* ARGSUSED */ void -rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) { - avl_tree_t *tree = arg; - avl_add(tree, rs); -} + btree_t *size_tree = arg; -void -rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - avl_tree_t *tree = arg; - avl_remove(tree, rs); + btree_add(size_tree, rs); } +/* ARGSUSED */ void -rt_avl_vacate(range_tree_t *rt, void *arg) +rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - rt_avl_create(rt, arg); -} + btree_t *size_tree = arg; -uint64_t -range_tree_min(range_tree_t *rt) -{ - range_seg_t *rs = avl_first(&rt->rt_root); - return (rs != NULL ? rs->rs_start : 0); + btree_remove(size_tree, rs); } -uint64_t -range_tree_max(range_tree_t *rt) +/* ARGSUSED */ +void +rt_btree_vacate(range_tree_t *rt, void *arg) { - range_seg_t *rs = avl_last(&rt->rt_root); - return (rs != NULL ? rs->rs_end : 0); -} + btree_t *size_tree = arg; + btree_clear(size_tree); + btree_destroy(size_tree); -uint64_t -range_tree_span(range_tree_t *rt) -{ - return (range_tree_max(rt) - range_tree_min(rt)); + rt_btree_create(rt, arg); } +range_tree_ops_t rt_btree_ops = { + rt_btree_create, + rt_btree_destroy, + rt_btree_add, + rt_btree_remove, + rt_btree_vacate +}; + /* * Remove any overlapping ranges between the given segment [start, end) * from removefrom. Add non-overlapping leftovers to addto. 
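The rt_btree_ops table above is the generic replacement for the old rt_avl_ops: it keeps a second B-tree, sorted by a caller-supplied comparator, in lockstep with the range tree. A sketch of how a caller could wire it up through range_tree_create_impl() to get a size-ordered view of the same segments (the comparator and names here are illustrative; the metaslab and scan code in this patch define their own):

static int
example_size_compare(const void *x1, const void *x2)
{
	const range_seg64_t *r1 = x1;
	const range_seg64_t *r2 = x2;
	uint64_t s1 = r1->rs_end - r1->rs_start;
	uint64_t s2 = r2->rs_end - r2->rs_start;

	/* Order by length, breaking ties by start so the order is total. */
	if (s1 != s2)
		return (s1 < s2 ? -1 : 1);
	if (r1->rs_start != r2->rs_start)
		return (r1->rs_start < r2->rs_start ? -1 : 1);
	return (0);
}

static range_tree_t *
example_create_size_indexed_tree(void)
{
	btree_t *size_tree = kmem_zalloc(sizeof (*size_tree), KM_SLEEP);

	/*
	 * size_tree is passed through as rt_arg; rt_btree_ops keeps it in
	 * sync on every add, remove and vacate of the returned range tree.
	 */
	return (range_tree_create_impl(&rt_btree_ops, RANGE_SEG64, size_tree,
	    0, 0, example_size_compare, 0));
}

A btree_last() on the secondary tree then hands back the largest segment without walking the primary tree, which is the query the allocators above ultimately care about.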
@@ -712,42 +803,62 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_t *removefrom, range_tree_t *addto) { - avl_index_t where; - range_seg_t starting_rs = { - .rs_start = start, - .rs_end = start + 1 - }; + btree_index_t where; + range_seg_max_t starting_rs; + rs_set_start(&starting_rs, removefrom, start); + rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, + removefrom) + 1); - range_seg_t *curr = avl_find(&removefrom->rt_root, + range_seg_t *curr = btree_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) - curr = avl_nearest(&removefrom->rt_root, where, AVL_AFTER); + curr = btree_next(&removefrom->rt_root, &where, &where); range_seg_t *next; for (; curr != NULL; curr = next) { - next = AVL_NEXT(&removefrom->rt_root, curr); - if (start == end) return; VERIFY3U(start, <, end); /* there is no overlap */ - if (end <= curr->rs_start) { + if (end <= rs_get_start(curr, removefrom)) { range_tree_add(addto, start, end - start); return; } - uint64_t overlap_start = MAX(curr->rs_start, start); - uint64_t overlap_end = MIN(curr->rs_end, end); + uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), + start); + uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), + end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); + range_seg_max_t rs; + rs_copy(curr, &rs, removefrom); + range_tree_remove(removefrom, overlap_start, overlap_size); if (start < overlap_start) range_tree_add(addto, start, overlap_start - start); start = overlap_end; + next = btree_find(&removefrom->rt_root, &rs, &where); + /* + * If we find something here, we only removed part of the + * curr segment. Either there's some left at the end + * because we're reached the end of the range we're removing, + * or there's some left at the start because we started + * partway through the range. Either way, we continue with + * the loop. If it's the former, we'll return at the start of + * the loop, and if it's the latter we'll see if there is more + * area to process. + */ + if (next != NULL) { + ASSERT(start == end || start == rs_get_end(&rs, + removefrom)); + } + + next = btree_next(&removefrom->rt_root, &where, &where); } VERIFY3P(curr, ==, NULL); @@ -767,9 +878,30 @@ void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto) { - for (range_seg_t *rs = avl_first(&rt->rt_root); rs; - rs = AVL_NEXT(&rt->rt_root, rs)) { - range_tree_remove_xor_add_segment(rs->rs_start, rs->rs_end, - removefrom, addto); + btree_index_t where; + for (range_seg_t *rs = btree_first(&rt->rt_root, &where); rs; + rs = btree_next(&rt->rt_root, &where, &where)) { + range_tree_remove_xor_add_segment(rs_get_start(rs, rt), + rs_get_end(rs, rt), removefrom, addto); } } + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = btree_first(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_start(rs, rt) : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = btree_last(&rt->rt_root, NULL); + return (rs != NULL ? 
rs_get_end(rs, rt) : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +} diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ac0dd5ba949e..5bd89faeb8e8 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -58,6 +58,7 @@ #include #include #include "zfs_prop.h" +#include #include #include @@ -2318,8 +2319,8 @@ spa_init(int mode) fm_init(); zfs_refcount_init(); unique_init(); - range_tree_init(); - metaslab_alloc_trace_init(); + btree_init(); + metaslab_stat_init(); ddt_init(); zio_init(); dmu_init(); @@ -2353,8 +2354,8 @@ spa_fini(void) dmu_fini(); zio_fini(); ddt_fini(); - metaslab_alloc_trace_fini(); - range_tree_fini(); + metaslab_stat_fini(); + btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index b9467b26ba09..99bac9d4bd9c 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -523,8 +523,9 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) * dbuf must be dirty for the changes in sm_phys to take effect. */ static void -space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, - uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, + maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, + void *tag, dmu_tx_t *tx) { ASSERT3U(words, !=, 0); ASSERT3U(words, <=, 2); @@ -548,14 +549,14 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, ASSERT3P(block_cursor, <=, block_end); - uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t size = (rend - rstart) >> sm->sm_shift; + uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift; uint64_t run_max = (words == 2) ? 
SM2_RUN_MAX : SM_RUN_MAX; - ASSERT3U(rs->rs_start, >=, sm->sm_start); - ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); - ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); - ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + ASSERT3U(rstart, >=, sm->sm_start); + ASSERT3U(rstart, <, sm->sm_start + sm->sm_size); + ASSERT3U(rend - rstart, <=, sm->sm_size); + ASSERT3U(rend, <=, sm->sm_start + sm->sm_size); while (size != 0) { ASSERT3P(block_cursor, <=, block_end); @@ -673,10 +674,14 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_buf_will_dirty(db, tx); - avl_tree_t *t = &rt->rt_root; - for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + btree_t *t = &rt->rt_root; + btree_index_t where; + for (range_seg_t *rs = btree_first(t, &where); rs != NULL; + rs = btree_next(t, &where, &where)) { + uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> + sm->sm_shift; + uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> + sm->sm_shift; uint8_t words = 1; /* @@ -701,8 +706,8 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, spa_get_random(100) == 0))) words = 2; - space_map_write_seg(sm, rs, maptype, vdev_id, words, - &db, FTAG, tx); + space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, + rt), maptype, vdev_id, words, &db, FTAG, tx); } dmu_buf_rele(db, FTAG); @@ -749,7 +754,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t nodes = btree_numnodes(&rt->rt_root); uint64_t rt_space = range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); @@ -758,7 +763,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. */ - VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); + VERIFY3U(nodes, ==, btree_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); } diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index aa289ba1061d..5dc604faa8c4 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. 
*/ #include @@ -109,10 +109,13 @@ space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { - range_seg_t *rs; + btree_index_t where; - for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) - space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); + for (range_seg_t *rs = btree_first(&rt->rt_root, &where); rs; rs = + btree_next(&rt->rt_root, &where, &where)) { + space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, + rt), refcnt); + } } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index af2d1a25a15b..3a120b001ce9 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -216,7 +216,7 @@ vdev_getops(const char *type) /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) { res->rs_start = in->rs_start; res->rs_end = in->rs_end; @@ -528,7 +528,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -561,7 +562,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, NULL); + vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); @@ -2539,14 +2541,11 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) static uint64_t vdev_dtl_min(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_start - 1); + return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* @@ -2555,14 +2554,11 @@ vdev_dtl_min(vdev_t *vd) static uint64_t vdev_dtl_max(vdev_t *vd) { - range_seg_t *rs; - ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); - return (rs->rs_end); + return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* @@ -2883,7 +2879,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = range_tree_create(NULL, NULL); + rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); @@ -4730,7 +4726,8 @@ vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) * translation function to do the real conversion. */ void -vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs) { /* * Walk up the vdev tree @@ -4757,7 +4754,7 @@ vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) * range into its physical components by calling the * vdev specific translate function. 
*/ - range_seg_t intermediate = { { { 0, 0 } } }; + range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); physical_rs->rs_start = intermediate.rs_start; diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 169b9282def8..fcfb4effe6de 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -291,11 +291,13 @@ vdev_initialize_block_free(abd_t *data) static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { - avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + range_tree_t *rt = vd->vdev_initialize_tree; + btree_t *bt = &rt->rt_root; + btree_index_t where; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = btree_first(bt, &where); rs != NULL; + rs = btree_next(bt, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ uint64_t writes_required = @@ -305,7 +307,7 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) int error; error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs->rs_start + + VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); @@ -341,7 +343,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); @@ -365,10 +367,14 @@ vdev_initialize_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; + btree_index_t where; + range_tree_t *rt = msp->ms_allocatable; + for (range_seg_t *rs = + btree_first(&rt->rt_root, &where); rs; + rs = btree_next(&rt->rt_root, &where, + &where)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - @@ -422,7 +428,7 @@ void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -481,7 +487,8 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index f63ccaa94cb8..3408ddf72852 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. 
*/ @@ -1638,7 +1638,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = zio->io_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, zio->io_size); @@ -2372,7 +2372,7 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 549087163a58..a30f521cd8d6 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -198,11 +198,12 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, NULL); + svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -967,18 +968,14 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * the allocation at the end of a segment, thus avoiding * additional split blocks. */ - range_seg_t search; - avl_index_t where; - search.rs_start = start + maxalloc; - search.rs_end = search.rs_start; - range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); - if (rs == NULL) { - rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); - } else { - rs = AVL_PREV(&segs->rt_root, rs); - } + range_seg_max_t search; + btree_index_t where; + rs_set_start(&search, segs, start + maxalloc); + rs_set_end(&search, segs, start + maxalloc); + (void) btree_find(&segs->rt_root, &search, &where); + range_seg_t *rs = btree_prev(&segs->rt_root, &where, &where); if (rs != NULL) { - size = rs->rs_end - start; + size = rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. @@ -1011,20 +1008,22 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). 
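As a worked example of that convention: if the range being copied starts at start and segs holds [start, start + 0x1000) and [start + 0x3000, start + 0x5000), the loop below records the hole with range_tree_add(obsolete_segs, 0x1000, 0x2000), i.e. an offset and length relative to start rather than absolute device offsets.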
*/ - range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); - - range_seg_t *rs = avl_first(&segs->rt_root); - ASSERT3U(rs->rs_start, ==, start); - uint64_t prev_seg_end = rs->rs_end; - while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { - if (rs->rs_start >= start + size) { + range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); + + btree_index_t where; + range_seg_t *rs = btree_first(&segs->rt_root, &where); + ASSERT3U(rs_get_start(rs, segs), ==, start); + uint64_t prev_seg_end = rs_get_end(rs, segs); + while ((rs = btree_next(&segs->rt_root, &where, &where)) != NULL) { + if (rs_get_start(rs, segs) >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, - rs->rs_start - prev_seg_end); + rs_get_start(rs, segs) - prev_seg_end); } - prev_seg_end = rs->rs_end; + prev_seg_end = rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); @@ -1268,9 +1267,10 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ - range_tree_t *segs = range_tree_create(NULL, NULL); + range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (;;) { - range_seg_t *rs = range_tree_first(svr->svr_allocd_segs); + range_tree_t *rt = svr->svr_allocd_segs; + range_seg_t *rs = range_tree_first(rt); if (rs == NULL) break; @@ -1279,17 +1279,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ - seg_length = - MIN(rs->rs_end - rs->rs_start, *max_alloc); + seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, + rt), *max_alloc); } else { - if (rs->rs_start - range_tree_max(segs) > + if (rs_get_start(rs, rt) - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. 
*/ break; - } else if (rs->rs_end - range_tree_min(segs) > + } else if (rs_get_end(rs, rt) - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past @@ -1298,13 +1298,14 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, */ break; } else { - seg_length = rs->rs_end - rs->rs_start; + seg_length = rs_get_end(rs, rt) - + rs_get_start(rs, rt); } } - range_tree_add(segs, rs->rs_start, seg_length); + range_tree_add(segs, rs_get_start(rs, rt), seg_length); range_tree_remove(svr->svr_allocd_segs, - rs->rs_start, seg_length); + rs_get_start(rs, rt), seg_length); } if (range_tree_is_empty(segs)) { @@ -1483,7 +1484,7 @@ spa_vdev_remove_thread(void *arg) vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", - avl_numnodes(&svr->svr_allocd_segs->rt_root), + btree_numnodes(&svr->svr_allocd_segs->rt_root), msp->ms_id); while (!svr->svr_thread_exit && diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 771e7a159268..21ba593c6678 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -523,7 +523,8 @@ static int vdev_trim_ranges(trim_args_t *ta) { vdev_t *vd = ta->trim_vdev; - avl_tree_t *rt = &ta->trim_tree->rt_root; + btree_t *t = &ta->trim_tree->rt_root; + btree_index_t idx; uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; @@ -531,9 +532,10 @@ vdev_trim_ranges(trim_args_t *ta) ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; - for (range_seg_t *rs = avl_first(rt); rs != NULL; - rs = AVL_NEXT(rt, rs)) { - uint64_t size = rs->rs_end - rs->rs_start; + for (range_seg_t *rs = btree_first(t, &idx); rs != NULL; + rs = btree_next(t, &idx, &idx)) { + uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, + ta->trim_tree); if (extent_bytes_min && size < extent_bytes_min) { spa_iostats_trim_add(spa, ta->trim_type, @@ -548,9 +550,9 @@ vdev_trim_ranges(trim_args_t *ta) int error; error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + - rs->rs_start + (w * extent_bytes_max), - MIN(size - (w * extent_bytes_max), - extent_bytes_max)); + rs_get_start(rs, ta->trim_tree) + + (w *extent_bytes_max), MIN(size - + (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { return (error); } @@ -588,7 +590,7 @@ vdev_trim_calculate_progress(vdev_t *vd) * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
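To make the extent handling in vdev_trim_ranges() above concrete: with zfs_trim_extent_bytes_max at 4 MiB (a hypothetical value, chosen only for illustration), a 10 MiB free segment is issued as three TRIMs of 4 MiB, 4 MiB and 2 MiB at consecutive offsets, while a segment smaller than zfs_trim_extent_bytes_min is never issued to the device and is only counted by the spa_iostats_trim_add() call in the skip path.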
*/ - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); @@ -611,10 +613,13 @@ vdev_trim_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); - rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { - logical_rs.rs_start = rs->rs_start; - logical_rs.rs_end = rs->rs_end; + range_tree_t *rt = msp->ms_allocatable; + btree_t *bt = &rt->rt_root; + btree_index_t idx; + for (range_seg_t *rs = btree_first(bt, &idx); + rs != NULL; rs = btree_next(bt, &idx, &idx)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - @@ -706,7 +711,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -719,7 +724,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) metaslab_t *msp = ta->trim_msp; VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); - VERIFY(range_tree_find(msp->ms_allocatable, start, size)); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -798,7 +803,7 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = range_tree_create(NULL, NULL); + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -1080,7 +1085,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); - VERIFY(range_tree_find(msp->ms_allocatable, start, size) != NULL); + VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } /* @@ -1178,7 +1183,8 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. 
*/ - trim_tree = range_tree_create(NULL, NULL); + trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(range_tree_is_empty(msp->ms_trim)); @@ -1232,7 +1238,8 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = range_tree_create(NULL, NULL); + ta->trim_tree = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); range_tree_walk(trim_tree, vdev_trim_range_add, ta); } From 0674b711035c94b02ff0d482dcbc3012d16af46f Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 19 Aug 2019 09:21:51 -0700 Subject: [PATCH 02/18] include bitops.h in makefile Signed-off-by: Paul Dagnelie --- include/sys/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index deb2c0109639..5785bfd55a83 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -7,6 +7,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/avl.h \ $(top_srcdir)/include/sys/avl_impl.h \ + $(top_srcdir)/include/sys/bitops.h \ $(top_srcdir)/include/sys/blkptr.h \ $(top_srcdir)/include/sys/bplist.h \ $(top_srcdir)/include/sys/bpobj.h \ From 2c07366eb1d108bfd1402a40e3f53ac67ab04335 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 22 Aug 2019 21:36:59 -0700 Subject: [PATCH 03/18] feedback1 Signed-off-by: Paul Dagnelie --- include/sys/btree.h | 2 + include/sys/range_tree.h | 1 + module/zfs/arc.c | 1 - module/zfs/btree.c | 229 ++++++++++++++++++++++----------------- module/zfs/metaslab.c | 6 +- module/zfs/range_tree.c | 2 +- 6 files changed, 133 insertions(+), 108 deletions(-) diff --git a/include/sys/btree.h b/include/sys/btree.h index a8b9da931ddb..e9461ddefdbc 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -68,6 +68,8 @@ extern "C" { #define BTREE_CORE_ELEMS 128 #define BTREE_LEAF_SIZE 4096 +extern kmem_cache_t *btree_leaf_cache; + typedef struct btree_hdr { struct btree_core *bth_parent; boolean_t bth_core; diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index ef8df8ec8e30..6bc9690e3020 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -240,6 +240,7 @@ rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case RANGE_SEG32: + /* fall through */ case RANGE_SEG64: ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, rt)); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9e3c012c8893..8ba9eafbc8e1 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4759,7 +4759,6 @@ arc_kmem_reap_soon(void) kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; - extern kmem_cache_t *btree_leaf_cache; #ifdef _KERNEL if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) && diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 5e9056a44d03..c1eb5b7f3751 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -25,7 +25,33 @@ kmem_cache_t *btree_leaf_cache; /* * Control the extent of the verification that occurs when btree_verify is * called. Primarily used for debugging when extending the btree logic and - * functionality. + * functionality. As the intensity is increased, new verification steps are + * added. These steps are cumulative; intensity = 3 includes the intensity = 1 + * and intensity = 2 steps as well. + * + * Intensity 1: Verify that the tree's height is consistent throughout. 
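+ * (That is, every leaf in the tree is the same distance from the root.)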
+ * Intensity 2: Verify that a core node's children's parent pointers point + * to the core node. + * Intensity 3: Verify that the total number of elements in the tree matches the + * sum of the number of elements in each node. Also verifies that each node's + * count obeys the invariants (less than or equal to maximum value, greater than + * or equal to half the maximum). + * Intensity 4: Verify that each element compares less than the element + * immediately after it and greater than the one immediately before it using the + * comparator function. For core nodes, also checks that each element is greater + * than the last element in the first of the two nodes it separates, and less + * than the first element in the second of the two nodes. + * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside + * of each node is poisoned appropriately. Note that poisoning always occurs if + * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal + * operation. + * + * Intensity 4 and 5 are particularly expensive to perform; the previous levels + * are a few memory operations per node, while these levels require multiple + * operations per element. In addition, when creating large btrees, these + * operations are called at every step, resulting in extremely sloww operation + * (while the asymptotic complexity of the other steps is the same, the + * importance of the constant factors cannot be denied). */ #ifdef ZFS_DEBUG int btree_verify_intensity = 5; @@ -33,17 +59,13 @@ int btree_verify_intensity = 5; int btree_verify_intensity = 0; #endif -/* - * Purely a convenience function, for simpler argument ordering and silencing - * compiler warnings about return values. We would use bcopy, but the kernel - * version doesn't handle overlapping src and dest. - */ -static inline void -bmov(const void *src, void *dest, size_t size) -{ - (void) memmove(dest, src, size); -} +#ifdef _ILP32 +#define BTREE_POISON 0xabadb10c +#else +#define BTREE_POISON 0xabadb10cdeadbeef +#endif +#ifdef ZFS_DEBUG static void btree_poison_node(btree_t *tree, btree_hdr_t *hdr) { @@ -57,7 +79,7 @@ btree_poison_node(btree_t *tree, btree_hdr_t *hdr) btree_core_t *node = (btree_core_t *)hdr; for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { node->btc_children[i] = - (btree_hdr_t *)0xabadb10cdeadbeef; + (btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); @@ -75,10 +97,11 @@ btree_poison_node_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) } else { btree_core_t *node = (btree_core_t *)hdr; node->btc_children[offset + 1] = - (btree_hdr_t *)0xabadb10cdeadbeef; + (btree_hdr_t *)BTREE_POISON; (void) memset(node->btc_elems + offset * size, 0x0f, size); } } +#endif static inline void btree_verify_poison_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) @@ -88,7 +111,7 @@ btree_verify_poison_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) uint8_t eval = 0x0f; if (hdr->bth_core) { btree_core_t *node = (btree_core_t *)hdr; - btree_hdr_t *cval = (btree_hdr_t *)0xabadb10cdeadbeef; + btree_hdr_t *cval = (btree_hdr_t *)BTREE_POISON; VERIFY3P(node->btc_children[offset + 1], ==, cval); for (int i = 0; i < size; i++) VERIFY3U(node->btc_elems[offset * size + i], ==, eval); @@ -119,10 +142,10 @@ btree_create(btree_t *tree, int (*compar) (const void *, const void *), size_t size) { /* - * We need a minimmum of 4 elements so that when we split a we always - * have at least two elements in each node. 
This simplifies the logic - * in btree_bulk_finish, since it means the last leaf will always have - * a left sibling to share with (unless it's the root). + * We need a minimmum of 4 elements so that when we split a node we + * always have at least two elements in each node. This simplifies the + * logic in btree_bulk_finish, since it means the last leaf will + * always have a left sibling to share with (unless it's the root). */ ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / 4); @@ -326,7 +349,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, old_node->bth_parent = new_node->bth_parent = new_root; new_root->btc_children[0] = old_node; new_root->btc_children[1] = new_node; - bmov(buf, new_root->btc_elems, size); + bcopy(buf, new_root->btc_elems, size); tree->bt_height++; tree->bt_root = new_root_hdr; @@ -361,13 +384,13 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, /* Move the child pointers back one. */ btree_hdr_t **c_start = parent->btc_children + offset + 1; uint64_t count = par_hdr->bth_count - offset; - bmov(c_start, c_start + 1, sizeof (*c_start) * count); + bcopy(c_start, c_start + 1, sizeof (*c_start) * count); *c_start = new_node; /* Move the elements back one. */ uint8_t *e_start = parent->btc_elems + offset * size; - bmov(e_start, e_start + size, count * size); - bmov(buf, e_start, size); + bcopy(e_start, e_start + size, count * size); + bcopy(buf, e_start, size); par_hdr->bth_count++; return; @@ -416,16 +439,16 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, */ uint64_t e_count = move_count; uint8_t *e_start = parent->btc_elems + keep_count * size; - bmov(e_start, new_parent->btc_elems, e_count * size); + bcopy(e_start, new_parent->btc_elems, e_count * size); uint64_t c_count = move_count + 1; btree_hdr_t **c_start = parent->btc_children + keep_count; - bmov(c_start, new_parent->btc_children, c_count * + bcopy(c_start, new_parent->btc_children, c_count * sizeof (*c_start)); /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); - bmov(parent->btc_elems + (keep_count - 1) * size, tmp_buf, + bcopy(parent->btc_elems + (keep_count - 1) * size, tmp_buf, size); /* @@ -434,25 +457,25 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, */ e_start = parent->btc_elems + offset * size; e_count = keep_count - 1 - offset; - bmov(e_start, e_start + size, e_count * size); - bmov(buf, e_start, size); + bcopy(e_start, e_start + size, e_count * size); + bcopy(buf, e_start, size); c_start = parent->btc_children + (offset + 1); c_count = keep_count - 1 - offset; - bmov(c_start, c_start + 1, c_count * sizeof (*c_start)); + bcopy(c_start, c_start + 1, c_count * sizeof (*c_start)); *c_start = new_node; ASSERT3P(*(c_start - 1), ==, old_node); /* * Move the new separator to the existing buffer. */ - bmov(tmp_buf, buf, size); + bcopy(tmp_buf, buf, size); kmem_free(tmp_buf, size); } else if (offset > keep_count) { /* Store the new separator in a buffer. 
*/ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); uint8_t *e_start = parent->btc_elems + keep_count * size; - bmov(e_start, tmp_buf, size); + bcopy(e_start, tmp_buf, size); /* * Of the elements and children in the back half, move those @@ -461,16 +484,16 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, e_start += size; uint8_t *e_out = new_parent->btc_elems; uint64_t e_count = offset - keep_count - 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); btree_hdr_t **c_start = parent->btc_children + (keep_count + 1); uint64_t c_count = offset - keep_count; btree_hdr_t **c_out = new_parent->btc_children; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); /* Add the new value to the new leaf. */ e_out += e_count * size; - bmov(buf, e_out, size); + bcopy(buf, e_out, size); c_out += c_count; *c_out = new_node; @@ -479,19 +502,19 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bmov(tmp_buf, buf, size); + bcopy(tmp_buf, buf, size); kmem_free(tmp_buf, size); /* Move the rest of the back half to the new leaf. */ e_out += size; e_start += e_count * size; e_count = BTREE_CORE_ELEMS - offset; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); c_out++; c_start += c_count; c_count = BTREE_CORE_ELEMS - offset; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); } else { /* * The new value is the new separator, no change. @@ -501,11 +524,11 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, */ uint64_t e_count = move_count; uint8_t *e_start = parent->btc_elems + keep_count * size; - bmov(e_start, new_parent->btc_elems, e_count * size); + bcopy(e_start, new_parent->btc_elems, e_count * size); uint64_t c_count = move_count; btree_hdr_t **c_start = parent->btc_children + keep_count + 1; - bmov(c_start, new_parent->btc_children + 1, c_count * + bcopy(c_start, new_parent->btc_children + 1, c_count * sizeof (*c_start)); new_parent->btc_children[0] = new_node; } @@ -549,8 +572,8 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, leaf->btl_hdr.bth_count); } leaf->btl_hdr.bth_count++; - bmov(start, start + size, count * size); - bmov(value, start, size); + bcopy(start, start + size, count * size); + bcopy(value, start, size); return; } @@ -598,10 +621,10 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, /* Copy the back part to the new leaf. */ start = leaf->btl_elems + keep_count * size; count = move_count; - bmov(start, new_leaf->btl_elems, count * size); + bcopy(start, new_leaf->btl_elems, count * size); /* Store the new separator in a buffer. */ - bmov(leaf->btl_elems + (keep_count - 1) * size, buf, + bcopy(leaf->btl_elems + (keep_count - 1) * size, buf, size); /* @@ -610,36 +633,36 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, */ start = leaf->btl_elems + idx * size; count = (keep_count) - 1 - idx; - bmov(start, start + size, count * size); - bmov(value, start, size); + bcopy(start, start + size, count * size); + bcopy(value, start, size); } else if (idx > keep_count) { /* Store the new separator in a buffer. */ start = leaf->btl_elems + (keep_count) * size; - bmov(start, buf, size); + bcopy(start, buf, size); /* Move the back part before idx to the new leaf. 
*/ start += size; uint8_t *out = new_leaf->btl_elems; count = idx - keep_count - 1; - bmov(start, out, count * size); + bcopy(start, out, count * size); /* Add the new value to the new leaf. */ out += count * size; - bmov(value, out, size); + bcopy(value, out, size); /* Move the rest of the back part to the new leaf. */ out += size; start += count * size; count = capacity - idx; - bmov(start, out, count * size); + bcopy(start, out, count * size); } else { /* The new value is the new separator. */ - bmov(value, buf, size); + bcopy(value, buf, size); /* Copy the back part to the new leaf. */ start = leaf->btl_elems + keep_count * size; count = move_count; - bmov(start, new_leaf->btl_elems, count * size); + bcopy(start, new_leaf->btl_elems, count * size); } #ifdef ZFS_DEBUG @@ -738,12 +761,12 @@ btree_bulk_finish(btree_t *tree) uint8_t *start = leaf->btl_elems; uint8_t *out = leaf->btl_elems + (move_count * size); uint64_t count = hdr->bth_count; - bmov(start, out, count * size); + bcopy(start, out, count * size); /* Next, move the separator from the common ancestor to leaf. */ uint8_t *separator = common->btc_elems + (common_idx * size); out -= size; - bmov(separator, out, size); + bcopy(separator, out, size); move_count--; /* @@ -753,14 +776,14 @@ btree_bulk_finish(btree_t *tree) start = l_neighbor->btl_elems + (l_neighbor->btl_hdr.bth_count - move_count) * size; out = leaf->btl_elems; - bmov(start, out, move_count * size); + bcopy(start, out, move_count * size); /* * Finally, move the new last element in the left neighbor to * the separator. */ start -= size; - bmov(start, separator, size); + bcopy(start, separator, size); /* Adjust the node's counts, and we're done. */ l_neighbor->btl_hdr.bth_count -= move_count + 1; @@ -812,18 +835,18 @@ btree_bulk_finish(btree_t *tree) uint8_t *e_start = cur->btc_elems; uint8_t *e_out = cur->btc_elems + (move_count * size); uint64_t e_count = hdr->bth_count; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); btree_hdr_t **c_start = cur->btc_children; btree_hdr_t **c_out = cur->btc_children + move_count; uint64_t c_count = hdr->bth_count + 1; - bmov(c_start, c_out, c_count * sizeof (btree_hdr_t *)); + bcopy(c_start, c_out, c_count * sizeof (btree_hdr_t *)); /* Next, move the separator to the right node. */ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * size); e_out -= size; - bmov(separator, e_out, size); + bcopy(separator, e_out, size); /* * Now, move elements and children from the left node to the @@ -834,20 +857,20 @@ btree_bulk_finish(btree_t *tree) (l_neighbor->btc_hdr.bth_count - move_count) * size; e_out = cur->btc_elems; e_count = move_count; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); c_start = l_neighbor->btc_children + (l_neighbor->btc_hdr.bth_count - move_count); c_out = cur->btc_children; c_count = move_count + 1; - bmov(c_start, c_out, c_count * sizeof (btree_hdr_t *)); + bcopy(c_start, c_out, c_count * sizeof (btree_hdr_t *)); /* * Finally, move the last element in the left node to the * separator's position. 
*/ e_start -= size; - bmov(e_start, separator, size); + bcopy(e_start, separator, size); l_neighbor->btc_hdr.bth_count -= move_count + 1; hdr->bth_count += move_count + 1; @@ -934,8 +957,8 @@ btree_insert(btree_t *tree, const void *value, const btree_index_t *where) btree_hdr_t *subtree = node->btc_children[off + 1]; size_t size = tree->bt_elem_size; uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bmov(node->btc_elems + off * size, buf, size); - bmov(value, node->btc_elems + off * size, size); + bcopy(node->btc_elems + off * size, buf, size); + bcopy(value, node->btc_elems + off * size, size); /* * Find the first slot in the subtree to the right, insert @@ -1243,12 +1266,12 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) uint8_t *e_start = node->btc_elems + idx * size; uint8_t *e_out = e_start - size; uint64_t e_count = hdr->bth_count - idx + 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); btree_hdr_t **c_start = node->btc_children + idx + 1; btree_hdr_t **c_out = c_start - 1; uint64_t c_count = hdr->bth_count - idx + 1; - bmov(c_start, c_out, c_count * sizeof (*c_start)); + bcopy(c_start, c_out, c_count * sizeof (*c_start)); #ifdef ZFS_DEBUG btree_poison_node_at(tree, hdr, hdr->bth_count); #endif @@ -1287,12 +1310,12 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) uint8_t *e_start = node->btc_elems; uint8_t *e_out = node->btc_elems + size; uint64_t e_count = idx - 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); btree_hdr_t **c_start = node->btc_children; btree_hdr_t **c_out = c_start + 1; uint64_t c_count = idx; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); /* * Move the separator between node and neighbor to the first @@ -1300,18 +1323,18 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, node->btc_elems, size); + bcopy(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. */ btree_hdr_t **steal_child = neighbor->btc_children + l_hdr->bth_count; - bmov(steal_child, node->btc_children, sizeof (*steal_child)); + bcopy(steal_child, node->btc_children, sizeof (*steal_child)); node->btc_children[0]->bth_parent = node; /* Move the last element of neighbor to the separator spot. */ uint8_t *steal_elem = neighbor->btc_elems + (l_hdr->bth_count - 1) * size; - bmov(steal_elem, separator, size); + bcopy(steal_elem, separator, size); l_hdr->bth_count--; hdr->bth_count++; #ifdef ZFS_DEBUG @@ -1334,32 +1357,32 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) uint8_t *e_start = node->btc_elems + idx * size; uint8_t *e_out = e_start - size; uint64_t e_count = hdr->bth_count - idx + 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); btree_hdr_t **c_start = node->btc_children + idx + 1; btree_hdr_t **c_out = c_start - 1; uint64_t c_count = hdr->bth_count - idx + 1; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); /* * Move the separator between node and neighbor to the last * element spot in node. 
*/ uint8_t *separator = parent->btc_elems + parent_idx * size; - bmov(separator, node->btc_elems + hdr->bth_count * size, size); + bcopy(separator, node->btc_elems + hdr->bth_count * size, size); /* * Move the first child of neighbor to the last child spot in * node. */ btree_hdr_t **steal_child = neighbor->btc_children; - bmov(steal_child, node->btc_children + (hdr->bth_count + 1), + bcopy(steal_child, node->btc_children + (hdr->bth_count + 1), sizeof (*steal_child)); node->btc_children[hdr->bth_count + 1]->bth_parent = node; /* Move the first element of neighbor to the separator spot. */ uint8_t *steal_elem = neighbor->btc_elems; - bmov(steal_elem, separator, size); + bcopy(steal_elem, separator, size); r_hdr->bth_count--; hdr->bth_count++; @@ -1367,9 +1390,9 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift the elements and children of neighbor to cover the * stolen elements. */ - bmov(neighbor->btc_elems + size, neighbor->btc_elems, + bcopy(neighbor->btc_elems + size, neighbor->btc_elems, r_hdr->bth_count * size); - bmov(neighbor->btc_children + 1, neighbor->btc_children, + bcopy(neighbor->btc_children + 1, neighbor->btc_children, (r_hdr->bth_count + 1) * sizeof (steal_child)); #ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); @@ -1403,30 +1426,30 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) uint8_t *e_out = left->btc_elems + l_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, e_out, size); + bcopy(separator, e_out, size); /* Move all our elements into the left node. */ e_out += size; uint8_t *e_start = node->btc_elems; uint64_t e_count = idx - 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); e_out += e_count * size; e_start += (e_count + 1) * size; e_count = hdr->bth_count - idx + 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); /* Move all our children into the left node. */ btree_hdr_t **c_start = node->btc_children; btree_hdr_t **c_out = left->btc_children + l_hdr->bth_count + 1; uint64_t c_count = idx; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); c_out += c_count; c_start += c_count + 1; c_count = hdr->bth_count - idx + 1; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); /* Reparent all our children to point to the left node. */ btree_hdr_t **new_start = left->btc_children + @@ -1458,12 +1481,12 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) uint8_t *e_start = node->btc_elems + idx * size; uint8_t *e_out = e_start - size; uint64_t e_count = hdr->bth_count - idx + 1; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); btree_hdr_t **c_start = node->btc_children + idx + 1; btree_hdr_t **c_out = c_start - 1; uint64_t c_count = hdr->bth_count - idx + 1; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); /* * Move the separator to the first open spot in node's @@ -1471,18 +1494,18 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) */ e_out += e_count * size; uint8_t *separator = parent->btc_elems + parent_idx * size; - bmov(separator, e_out, size); + bcopy(separator, e_out, size); /* Move the right node's elements and children to node. 
*/ e_out += size; e_start = right->btc_elems; e_count = r_hdr->bth_count; - bmov(e_start, e_out, e_count * size); + bcopy(e_start, e_out, e_count * size); c_out += c_count; c_start = right->btc_children; c_count = r_hdr->bth_count + 1; - bmov(c_start, c_out, c_count * sizeof (*c_out)); + bcopy(c_start, c_out, c_count * sizeof (*c_out)); /* Reparent the right node's children to point to node. */ for (int i = 0; i < c_count; i++) @@ -1543,7 +1566,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) void *new_value = btree_last_helper(tree, left_subtree, where); ASSERT3P(new_value, !=, NULL); - bmov(new_value, node->btc_elems + idx * size, size); + bcopy(new_value, node->btc_elems + idx * size, size); hdr = where->bti_node; idx = where->bti_offset; @@ -1567,7 +1590,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * the value and return. */ if (hdr->bth_count >= min_count || hdr->bth_parent == NULL) { - bmov(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + + bcopy(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + idx * size, (hdr->bth_count - idx) * size); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); @@ -1612,16 +1635,16 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. */ - bmov(leaf->btl_elems, leaf->btl_elems + size, idx * size); + bcopy(leaf->btl_elems, leaf->btl_elems + size, idx * size); uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; uint8_t *steal_elem = ((btree_leaf_t *)l_hdr)->btl_elems + (l_hdr->bth_count - 1) * size; /* Move the separator to our first spot. */ - bmov(separator, leaf->btl_elems, size); + bcopy(separator, leaf->btl_elems, size); /* Move our neighbor's last element to the separator. */ - bmov(steal_elem, separator, size); + bcopy(steal_elem, separator, size); /* Update the bookkeeping. */ l_hdr->bth_count--; @@ -1645,16 +1668,16 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * by one spot to make room for the stolen element and * overwrite the element being removed. */ - bmov(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + + bcopy(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + idx * size, (hdr->bth_count - idx) * size); uint8_t *separator = parent->btc_elems + parent_idx * size; uint8_t *steal_elem = ((btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ - bmov(separator, leaf->btl_elems + hdr->bth_count * size, size); + bcopy(separator, leaf->btl_elems + hdr->bth_count * size, size); /* Move our neighbor's first element to the separator. */ - bmov(steal_elem, separator, size); + bcopy(steal_elem, separator, size); /* Update the bookkeeping. */ r_hdr->bth_count--; @@ -1664,7 +1687,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our neighbors elements forwards to overwrite the * stolen element. */ - bmov(neighbor->btl_elems + size, neighbor->btl_elems, + bcopy(neighbor->btl_elems + size, neighbor->btl_elems, r_hdr->bth_count * size); #ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); @@ -1702,18 +1725,18 @@ btree_remove_from(btree_t *tree, btree_index_t *where) uint8_t *out = left->btl_elems + l_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bmov(separator, out, size); + bcopy(separator, out, size); /* Move our elements to the left neighbor. 
*/ out += size; uint8_t *start = leaf->btl_elems; uint64_t count = idx; - bmov(start, out, count * size); + bcopy(start, out, count * size); out += count * size; start += (count + 1) * size; count = hdr->bth_count - idx; - bmov(start, out, count * size); + bcopy(start, out, count * size); /* Update the bookkeeping. */ l_hdr->bth_count += hdr->bth_count + 1; @@ -1737,18 +1760,18 @@ btree_remove_from(btree_t *tree, btree_index_t *where) uint8_t *start = leaf->btl_elems + (idx + 1) * size; uint8_t *out = start - size; uint64_t count = hdr->bth_count - idx; - bmov(start, out, count * size); + bcopy(start, out, count * size); /* Move the separator to node's first open spot. */ out += count * size; uint8_t *separator = parent->btc_elems + parent_idx * size; - bmov(separator, out, size); + bcopy(separator, out, size); /* Move the right neighbor's elements to node. */ out += size; start = right->btl_elems; count = r_hdr->bth_count; - bmov(start, out, count * size); + bcopy(start, out, count * size); /* Update the bookkeeping. */ hdr->bth_count += r_hdr->bth_count + 1; @@ -2046,6 +2069,7 @@ btree_verify_order(btree_t *tree) btree_verify_order_helper(tree, tree->bt_root); } +#ifdef ZFS_DEBUG /* Check that all unused memory is poisoned correctly. */ static void btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) @@ -2068,7 +2092,7 @@ btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { VERIFY3P(node->btc_children[i], ==, - (btree_hdr_t *)0xabadb10cdeadbeef); + (btree_hdr_t *)BTREE_POISON); } for (int i = 0; i <= hdr->bth_count; i++) { @@ -2076,6 +2100,7 @@ btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) } } } +#endif /* Check that unused memory in the tree is still poisoned. */ static void diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5d27ef8982db..9a0b147a6eb9 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1371,11 +1371,10 @@ static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; - btree_t *t = mrap->mra_bt; #ifdef _METASLAB_TRACING atomic_inc_64(&metaslab_reload_tree.value.ui64); #endif - ASSERT0(btree_numnodes(t)); + ASSERT0(btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; @@ -2156,7 +2155,6 @@ metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); - extern kmem_cache_t *btree_leaf_cache; uint64_t inuse = btree_leaf_cache->skc_obj_total; uint64_t size = btree_leaf_cache->skc_obj_size; int tries = 0; @@ -3563,7 +3561,7 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * inexpensive operation since we expect these trees to have a * small number of nodes. * 3] We vacate any unflushed allocs, since they are not frees we - * need to add to the condense tree.. Then we vacate any + * need to add to the condense tree. Then we vacate any * unflushed frees as they should already be part of ms_allocatable. * 4] At this point, we would ideally like to add all segments * in the ms_allocatable tree from the condense tree. This way diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 254629dfe3b8..a98c4528f66b 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -846,7 +846,7 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, /* * If we find something here, we only removed part of the * curr segment. 
Either there's some left at the end - * because we're reached the end of the range we're removing, + * because we've reached the end of the range we're removing, * or there's some left at the start because we started * partway through the range. Either way, we continue with * the loop. If it's the former, we'll return at the start of From 99b4df46bbde568063e36d60846c820b9f1aba22 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 22 Aug 2019 22:32:23 -0700 Subject: [PATCH 04/18] switch to one kstat --- module/zfs/metaslab.c | 78 ++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 9a0b147a6eb9..3f1e401585b2 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -325,12 +325,24 @@ static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); #ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; -kstat_t *metaslab_trace_ksp; -kstat_named_t metaslab_trace_over_limit; -kstat_t *metaslab_df_floor_ksp; -kstat_named_t metaslab_df_find_under_floor; -kstat_t *metaslab_reload_ksp; -kstat_named_t metaslab_reload_tree; + +typedef struct metaslab_stats { + kstat_named_t metaslabstat_trace_over_limit; + kstat_named_t metaslabstat_df_find_under_floor; + kstat_named_t metaslabstat_reload_tree; +} metaslab_stats_t; + +static metaslab_stats_t metaslab_stats = { + { "trace_over_limit", KSTAT_DATA_UINT64 }, + { "df_find_under_floor", KSTAT_DATA_UINT64 }, + { "reload_tree", KSTAT_DATA_UINT64 }, +}; + +#define METASLABSTAT_BUMP(stat) \ + atomic_inc_64(&metaslab_stats.stat.value.ui64); + + +kstat_t *metaslab_ksp; void metaslab_stat_init(void) @@ -339,49 +351,23 @@ metaslab_stat_init(void) metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", - "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); - if (metaslab_trace_ksp != NULL) { - metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; - kstat_named_init(&metaslab_trace_over_limit, - "metaslab_trace_over_limit", KSTAT_DATA_UINT64); - kstat_install(metaslab_trace_ksp); - } - metaslab_df_floor_ksp = kstat_create("zfs", 0, - "metaslab_df_floor_stats", "misc", KSTAT_TYPE_NAMED, 1, - KSTAT_FLAG_VIRTUAL); - if (metaslab_df_floor_ksp != NULL) { - metaslab_df_floor_ksp->ks_data = &metaslab_df_find_under_floor; - kstat_named_init(&metaslab_df_find_under_floor, - "metaslab_df_find_under_floor", KSTAT_DATA_UINT64); - kstat_install(metaslab_df_floor_ksp); - } - metaslab_reload_ksp = kstat_create("zfs", 0, - "metaslab_reload_stats", "misc", KSTAT_TYPE_NAMED, 1, - KSTAT_FLAG_VIRTUAL); - if (metaslab_reload_ksp != NULL) { - metaslab_reload_ksp->ks_data = &metaslab_reload_tree; - kstat_named_init(&metaslab_reload_tree, - "metaslab_reload_tree", KSTAT_DATA_UINT64); - kstat_install(metaslab_reload_ksp); + metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats", + "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (metaslab_ksp != NULL) { + metaslab_ksp->ks_data = &metaslab_stats; + kstat_install(metaslab_ksp); } } void metaslab_stat_fini(void) { - if (metaslab_reload_ksp != NULL) { - kstat_delete(metaslab_reload_ksp); - metaslab_reload_ksp = NULL; - } - if (metaslab_df_floor_ksp != NULL) { - kstat_delete(metaslab_df_floor_ksp); - metaslab_df_floor_ksp = NULL; - } - if 
(metaslab_trace_ksp != NULL) {
- kstat_delete(metaslab_trace_ksp);
- metaslab_trace_ksp = NULL;
+ if (metaslab_ksp != NULL) {
+ kstat_delete(metaslab_ksp);
+ metaslab_ksp = NULL;
 }
+
 kmem_cache_destroy(metaslab_alloc_trace_cache);
 metaslab_alloc_trace_cache = NULL;
 }
@@ -1372,7 +1358,7 @@ metaslab_size_tree_full_load(range_tree_t *rt)
 {
 metaslab_rt_arg_t *mrap = rt->rt_arg;
 #ifdef _METASLAB_TRACING
- atomic_inc_64(&metaslab_reload_tree.value.ui64);
+ METASLABSTAT_BUMP(metaslabstat_reload_tree);
 #endif
 ASSERT0(btree_numnodes(mrap->mra_bt));
 mrap->mra_floor_shift = 0;
@@ -1683,8 +1669,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 #ifdef _METASLAB_TRACING
 metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
 if (size < (1 << mrap->mra_floor_shift)) {
- atomic_inc_64(
- &metaslab_df_find_under_floor.value.ui64);
+ METASLABSTAT_BUMP(
+ metaslabstat_df_find_under_floor);
 }
 #endif
 rs = metaslab_block_find(&msp->ms_allocatable_by_size,
@@ -4423,7 +4409,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
 #ifdef DEBUG
 panic("too many entries in allocation list");
 #endif
- atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+ METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 zal->zal_size--;
 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
 list_remove(&zal->zal_list, mat_next);
From 758693ca34df0b39105041836a2e394fc6a23d73 Mon Sep 17 00:00:00 2001
From: Paul Dagnelie
Date: Mon, 26 Aug 2019 10:04:25 -0700
Subject: [PATCH 05/18] REFACTOR
---
 module/zfs/btree.c | 435 +++++++++++++++++++++++----------------------
 1 file changed, 227 insertions(+), 208 deletions(-)
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
index c1eb5b7f3751..010f6f904b12 100644
--- a/module/zfs/btree.c
+++ b/module/zfs/btree.c
@@ -296,6 +296,144 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where)
 return (d);
 }
+/*
+ * To explain the following functions, it is useful to understand the four
+ * kinds of shifts used in btree operation. A shift has two fundamental
+ * properties, each of which can be one of two values, making four shifts.
+ * There is the direction of the shift (left or right) and the shape of the
+ * shift (parallelogram or isosceles trapezoid (shortened to trapezoid
+ * hereafter)). The shape distinction only applies to shifts of core nodes.
+ *
+ * The names derive from the following imagining of the layout of a node:
+ *
+ * Elements: * * * * * * * ... * * *
+ *
+ * Children: * * * * * * * * ... * * *
+ *
+ * This layout follows from the fact that the elements act as separators
+ * between pairs of children, and that children root subtrees "below" the
+ * current node. The distinction between a left and right shift is clear. A
+ * parallelogram shift is a shift with the same number of elements and children
+ * being moved, while a trapezoid shift is a shift that moves one more child
+ * than elements.
+ *
+ * Note that a parallelogram shift is always shaped like a left-leaning
+ * parallelogram; the starting index of the children being moved is always one
+ * higher than the starting index of the elements being moved. No right-leaning
+ * parallelogram shifts are needed to achieve any btree operations, so we ignore
+ * them.
+ */
+
+/*
+ * Shift elements and children in the provided core node by off spots.
+ * The first element moved is idx, and count elements are moved. The
+ * shape of the shift is determined by trap; true if the shift is a trapezoid,
+ * false if it is a parallelogram. The direction is determined by left.
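+ * For example, under these definitions a call like
+ *     bt_shift_at_core(tree, node, 4, 3, 1, B_TRUE, B_FALSE)
+ * is a right trapezoid shift by one spot: elements 4 through 6 move to
+ * indexes 5 through 7, and children 4 through 7 move to indexes 5 through 8.
+ * With trap set to B_FALSE (a parallelogram shift), the same call would move
+ * the same elements but only children 5 through 7 (to indexes 6 through 8).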
+ */ +static inline void +bt_shift_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, + uint64_t count, uint64_t off, boolean_t trap, boolean_t left) +{ + size_t size = tree->bt_elem_size; + ASSERT(node->btc_hdr.bth_core); + + uint8_t *e_start = node->btc_elems + idx * size; + uint8_t *e_out = (left ? e_start - (off * size) : e_start + (off * + size)); + uint64_t e_count = count; + bcopy(e_start, e_out, e_count * size); + + btree_hdr_t **c_start = node->btc_children + idx + (trap ? 0 : 1); + btree_hdr_t **c_out = (left ? c_start - off : c_start + off); + uint64_t c_count = count + (trap ? 1 : 0); + bcopy(c_start, c_out, c_count * sizeof (*c_start)); +} + +/* + * Shift elements and children in the provided core node left by one spot. + * The first element moved is idx, and count elements are moved. The + * shape of the shift is determined by trap; true if the shift is a trapezoid, + * false if it is a parallelogram. + */ +static inline void +bt_shleft_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, + uint64_t count, boolean_t trap) +{ + bt_shift_at_core(tree, node, idx, count, 1, trap, B_TRUE); +} + +/* + * Shift elements and children in the provided core node right by one spot. + * Starts with elements[idx] and children[idx] and one more child than element. + */ +static inline void +bt_shright_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, + uint64_t count, boolean_t trap) +{ + bt_shift_at_core(tree, node, idx, count, 1, trap, B_FALSE); +} + +/* + * Shift elements and children in the provided leaf node by off spots. + * The first element moved is idx, and count elements are moved. The direction + * is determined by left. + */ +static inline void +bt_shift_at_leaf(btree_t *tree, btree_leaf_t *node, uint64_t idx, + uint64_t count, uint64_t off, boolean_t left) +{ + size_t size = tree->bt_elem_size; + ASSERT(!node->btl_hdr.bth_core); + + uint8_t *start = node->btl_elems + idx * size; + uint8_t *out = (left ? start - (off * size) : start + (off * + size)); + bcopy(start, out, count * size); +} + +static inline void +bt_shright_at_leaf(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_at_leaf(tree, leaf, idx, count, 1, B_FALSE); +} + +static inline void +bt_shleft_at_leaf(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, + uint64_t count) +{ + bt_shift_at_leaf(tree, leaf, idx, count, 1, B_TRUE); +} + +static inline void +bt_transfer_core(btree_t* tree, btree_core_t *source, uint64_t sidx, + uint64_t count, btree_core_t *dest, uint64_t didx, boolean_t trap) +{ + size_t size = tree->bt_elem_size; + ASSERT(source->btc_hdr.bth_core); + ASSERT(dest->btc_hdr.bth_core); + + bcopy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + count * size); + + uint64_t c_count = count + (trap ? 1 : 0); + bcopy(source->btc_children + sidx + (trap ? 0 : 1), + dest->btc_children + didx + (trap ? 0 : 1), + c_count * sizeof (*source->btc_children)); +} + +static inline void +bt_transfer_leaf(btree_t* tree, btree_leaf_t *source, uint64_t sidx, + uint64_t count, btree_leaf_t *dest, uint64_t didx) +{ + size_t size = tree->bt_elem_size; + ASSERT(!source->btl_hdr.bth_core); + ASSERT(!dest->btl_hdr.bth_core); + + bcopy(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + count * size); +} + /* * Find the first element in the subtree rooted at hdr, return its value and * put its location in where if non-null. 
@@ -381,17 +519,14 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, btree_verify_poison_at(tree, par_hdr, par_hdr->bth_count); } - /* Move the child pointers back one. */ - btree_hdr_t **c_start = parent->btc_children + offset + 1; - uint64_t count = par_hdr->bth_count - offset; - bcopy(c_start, c_start + 1, sizeof (*c_start) * count); - *c_start = new_node; - /* Move the elements back one. */ - uint8_t *e_start = parent->btc_elems + offset * size; - bcopy(e_start, e_start + size, count * size); - bcopy(buf, e_start, size); + /* Shift existing elements and children */ + uint64_t count = par_hdr->bth_count - offset; + bt_shright_at_core(tree, parent, offset, count, B_FALSE); + /* Insert new values */ + parent->btc_children[offset + 1] = new_node; + bcopy(buf, parent->btc_elems + offset * size, size); par_hdr->bth_count++; return; } @@ -437,14 +572,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Copy the back part of the elements and children to the new * leaf. */ - uint64_t e_count = move_count; - uint8_t *e_start = parent->btc_elems + keep_count * size; - bcopy(e_start, new_parent->btc_elems, e_count * size); - - uint64_t c_count = move_count + 1; - btree_hdr_t **c_start = parent->btc_children + keep_count; - bcopy(c_start, new_parent->btc_children, c_count * - sizeof (*c_start)); + bt_transfer_core(tree, parent, keep_count, move_count, new_parent, 0, B_TRUE); /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); @@ -455,14 +583,12 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Shift the remaining elements and children in the front half * to handle the new value. */ - e_start = parent->btc_elems + offset * size; - e_count = keep_count - 1 - offset; - bcopy(e_start, e_start + size, e_count * size); + bt_shright_at_core(tree, parent, offset, keep_count - 1 - offset, B_FALSE); + + uint8_t *e_start = parent->btc_elems + offset * size; bcopy(buf, e_start, size); - c_start = parent->btc_children + (offset + 1); - c_count = keep_count - 1 - offset; - bcopy(c_start, c_start + 1, c_count * sizeof (*c_start)); + btree_hdr_t **c_start = parent->btc_children + (offset + 1); *c_start = new_node; ASSERT3P(*(c_start - 1), ==, old_node); @@ -481,21 +607,15 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Of the elements and children in the back half, move those * before offset to the new leaf. */ - e_start += size; - uint8_t *e_out = new_parent->btc_elems; uint64_t e_count = offset - keep_count - 1; - bcopy(e_start, e_out, e_count * size); - - btree_hdr_t **c_start = parent->btc_children + (keep_count + 1); - uint64_t c_count = offset - keep_count; - btree_hdr_t **c_out = new_parent->btc_children; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + bt_transfer_core(tree, parent, keep_count + 1, + e_count, new_parent, 0, B_TRUE); /* Add the new value to the new leaf. */ - e_out += e_count * size; + uint8_t *e_out = new_parent->btc_elems + e_count * size; bcopy(buf, e_out, size); - c_out += c_count; + btree_hdr_t **c_out = new_parent->btc_children + e_count + 1; *c_out = new_node; ASSERT3P(*(c_out - 1), ==, old_node); @@ -506,15 +626,9 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, kmem_free(tmp_buf, size); /* Move the rest of the back half to the new leaf. 
*/ - e_out += size; - e_start += e_count * size; - e_count = BTREE_CORE_ELEMS - offset; - bcopy(e_start, e_out, e_count * size); - - c_out++; - c_start += c_count; - c_count = BTREE_CORE_ELEMS - offset; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + bt_transfer_core(tree, parent, keep_count + 2, + BTREE_CORE_ELEMS - offset, new_parent, e_count + 1, + B_FALSE); } else { /* * The new value is the new separator, no change. @@ -522,14 +636,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Copy the back part of the elements and children to the new * leaf. */ - uint64_t e_count = move_count; - uint8_t *e_start = parent->btc_elems + keep_count * size; - bcopy(e_start, new_parent->btc_elems, e_count * size); - - uint64_t c_count = move_count; - btree_hdr_t **c_start = parent->btc_children + keep_count + 1; - bcopy(c_start, new_parent->btc_children + 1, c_count * - sizeof (*c_start)); + bt_transfer_core(tree, parent, keep_count, move_count, + new_parent, 0, B_FALSE); new_parent->btc_children[0] = new_node; } #ifdef ZFS_DEBUG @@ -572,7 +680,7 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, leaf->btl_hdr.bth_count); } leaf->btl_hdr.bth_count++; - bcopy(start, start + size, count * size); + bt_shright_at_leaf(tree, leaf, idx, count); bcopy(value, start, size); return; } @@ -619,9 +727,8 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, */ if (idx < keep_count) { /* Copy the back part to the new leaf. */ - start = leaf->btl_elems + keep_count * size; - count = move_count; - bcopy(start, new_leaf->btl_elems, count * size); + bt_transfer_leaf(tree, leaf, keep_count, move_count, new_leaf, + 0); /* Store the new separator in a buffer. */ bcopy(leaf->btl_elems + (keep_count - 1) * size, buf, @@ -631,38 +738,32 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, * Shift the remaining elements in the front part to handle * the new value. */ - start = leaf->btl_elems + idx * size; - count = (keep_count) - 1 - idx; - bcopy(start, start + size, count * size); - bcopy(value, start, size); + bt_shright_at_leaf(tree, leaf, idx, keep_count - 1 - idx); + bcopy(value, leaf->btl_elems + idx * size, size); } else if (idx > keep_count) { /* Store the new separator in a buffer. */ start = leaf->btl_elems + (keep_count) * size; bcopy(start, buf, size); /* Move the back part before idx to the new leaf. */ - start += size; - uint8_t *out = new_leaf->btl_elems; - count = idx - keep_count - 1; - bcopy(start, out, count * size); + bt_transfer_leaf(tree, leaf, keep_count + 1, + idx - keep_count - 1, new_leaf, 0); /* Add the new value to the new leaf. */ - out += count * size; + uint8_t *out = new_leaf->btl_elems + (idx - keep_count - 1) * + size; bcopy(value, out, size); /* Move the rest of the back part to the new leaf. */ - out += size; - start += count * size; - count = capacity - idx; - bcopy(start, out, count * size); + bt_transfer_leaf(tree, leaf, idx, capacity - idx, new_leaf, + idx - keep_count); } else { /* The new value is the new separator. */ bcopy(value, buf, size); /* Copy the back part to the new leaf. 
*/ - start = leaf->btl_elems + keep_count * size; - count = move_count; - bcopy(start, new_leaf->btl_elems, count * size); + bt_transfer_leaf(tree, leaf, keep_count, move_count, new_leaf, + 0); } #ifdef ZFS_DEBUG @@ -746,6 +847,7 @@ btree_bulk_finish(btree_t *tree) VERIFY3P(btree_prev(tree, &idx, &idx), !=, NULL); ASSERT(!idx.bti_node->bth_core); btree_leaf_t *l_neighbor = (btree_leaf_t *)idx.bti_node; + btree_hdr_t *l_hdr = idx.bti_node; uint64_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, capacity / 2); @@ -758,14 +860,12 @@ btree_bulk_finish(btree_t *tree) } /* First, shift elements in leaf back. */ - uint8_t *start = leaf->btl_elems; - uint8_t *out = leaf->btl_elems + (move_count * size); - uint64_t count = hdr->bth_count; - bcopy(start, out, count * size); + bt_shift_at_leaf(tree, leaf, 0, hdr->bth_count, move_count, + B_FALSE); /* Next, move the separator from the common ancestor to leaf. */ uint8_t *separator = common->btc_elems + (common_idx * size); - out -= size; + uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); bcopy(separator, out, size); move_count--; @@ -773,26 +873,24 @@ btree_bulk_finish(btree_t *tree) * Now we move elements from the tail of the left neighbor to * fill the remaining spots in leaf. */ - start = l_neighbor->btl_elems + (l_neighbor->btl_hdr.bth_count - - move_count) * size; - out = leaf->btl_elems; - bcopy(start, out, move_count * size); + bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - + move_count, move_count, leaf, 0); /* * Finally, move the new last element in the left neighbor to * the separator. */ - start -= size; - bcopy(start, separator, size); + bcopy(l_neighbor->btl_elems + (l_hdr->bth_count - + move_count - 1) * size, separator, size); /* Adjust the node's counts, and we're done. */ - l_neighbor->btl_hdr.bth_count -= move_count + 1; + l_hdr->bth_count -= move_count + 1; hdr->bth_count += move_count + 1; - ASSERT3U(l_neighbor->btl_hdr.bth_count, >=, capacity / 2); + ASSERT3U(l_hdr->bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); #ifdef ZFS_DEBUG - btree_poison_node(tree, &l_neighbor->btl_hdr); + btree_poison_node(tree, l_hdr); #endif } @@ -832,20 +930,12 @@ btree_bulk_finish(btree_t *tree) } } /* First, shift things in the right node back. */ - uint8_t *e_start = cur->btc_elems; - uint8_t *e_out = cur->btc_elems + (move_count * size); - uint64_t e_count = hdr->bth_count; - bcopy(e_start, e_out, e_count * size); - - btree_hdr_t **c_start = cur->btc_children; - btree_hdr_t **c_out = cur->btc_children + move_count; - uint64_t c_count = hdr->bth_count + 1; - bcopy(c_start, c_out, c_count * sizeof (btree_hdr_t *)); + bt_shift_at_core(tree, cur, 0, hdr->bth_count, move_count, B_TRUE, B_FALSE); /* Next, move the separator to the right node. */ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * size); - e_out -= size; + uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); bcopy(separator, e_out, size); /* @@ -853,24 +943,15 @@ btree_bulk_finish(btree_t *tree) * right. We move one more child than elements. 
*/ move_count--; - e_start = l_neighbor->btc_elems + - (l_neighbor->btc_hdr.bth_count - move_count) * size; - e_out = cur->btc_elems; - e_count = move_count; - bcopy(e_start, e_out, e_count * size); - - c_start = l_neighbor->btc_children + - (l_neighbor->btc_hdr.bth_count - move_count); - c_out = cur->btc_children; - c_count = move_count + 1; - bcopy(c_start, c_out, c_count * sizeof (btree_hdr_t *)); + uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, B_TRUE); /* * Finally, move the last element in the left node to the * separator's position. */ - e_start -= size; - bcopy(e_start, separator, size); + move_idx--; + bcopy(l_neighbor->btc_elems + move_idx * size, separator, size); l_neighbor->btc_hdr.bth_count -= move_count + 1; hdr->bth_count += move_count + 1; @@ -1263,15 +1344,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift the element and children to the right of rm_hdr to * the left by one spot. */ - uint8_t *e_start = node->btc_elems + idx * size; - uint8_t *e_out = e_start - size; - uint64_t e_count = hdr->bth_count - idx + 1; - bcopy(e_start, e_out, e_count * size); - - btree_hdr_t **c_start = node->btc_children + idx + 1; - btree_hdr_t **c_out = c_start - 1; - uint64_t c_count = hdr->bth_count - idx + 1; - bcopy(c_start, c_out, c_count * sizeof (*c_start)); + bt_shleft_at_core(tree, node, idx, hdr->bth_count - idx + 1, + B_FALSE); #ifdef ZFS_DEBUG btree_poison_node_at(tree, hdr, hdr->bth_count); #endif @@ -1307,15 +1381,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Start by shifting the elements and children in the current * node to the right by one spot. */ - uint8_t *e_start = node->btc_elems; - uint8_t *e_out = node->btc_elems + size; - uint64_t e_count = idx - 1; - bcopy(e_start, e_out, e_count * size); - - btree_hdr_t **c_start = node->btc_children; - btree_hdr_t **c_out = c_start + 1; - uint64_t c_count = idx; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + bt_shright_at_core(tree, node, 0, idx - 1, B_TRUE); /* * Move the separator between node and neighbor to the first @@ -1354,15 +1420,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift elements in node left by one spot to overwrite rm_hdr * and the separator before it. */ - uint8_t *e_start = node->btc_elems + idx * size; - uint8_t *e_out = e_start - size; - uint64_t e_count = hdr->bth_count - idx + 1; - bcopy(e_start, e_out, e_count * size); - - btree_hdr_t **c_start = node->btc_children + idx + 1; - btree_hdr_t **c_out = c_start - 1; - uint64_t c_count = hdr->bth_count - idx + 1; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + bt_shleft_at_core(tree, node, idx, hdr->bth_count - idx + 1, + B_FALSE); /* * Move the separator between node and neighbor to the last @@ -1390,10 +1449,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift the elements and children of neighbor to cover the * stolen elements. 
*/ - bcopy(neighbor->btc_elems + size, neighbor->btc_elems, - r_hdr->bth_count * size); - bcopy(neighbor->btc_children + 1, neighbor->btc_children, - (r_hdr->bth_count + 1) * sizeof (steal_child)); + bt_shleft_at_core(tree, neighbor, 1, r_hdr->bth_count, B_TRUE); #ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); #endif @@ -1428,28 +1484,11 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) size; bcopy(separator, e_out, size); - /* Move all our elements into the left node. */ - e_out += size; - uint8_t *e_start = node->btc_elems; - uint64_t e_count = idx - 1; - bcopy(e_start, e_out, e_count * size); - - e_out += e_count * size; - e_start += (e_count + 1) * size; - e_count = hdr->bth_count - idx + 1; - bcopy(e_start, e_out, e_count * size); - - /* Move all our children into the left node. */ - btree_hdr_t **c_start = node->btc_children; - btree_hdr_t **c_out = left->btc_children + - l_hdr->bth_count + 1; - uint64_t c_count = idx; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); - - c_out += c_count; - c_start += c_count + 1; - c_count = hdr->bth_count - idx + 1; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + /* Move all our elements and children into the left node. */ + bt_transfer_core(tree, node, 0, idx - 1, left, l_hdr->bth_count + + 1, B_TRUE); + bt_transfer_core(tree, node, idx, hdr->bth_count - idx + 1, + left, l_hdr->bth_count + idx, B_FALSE); /* Reparent all our children to point to the left node. */ btree_hdr_t **new_start = left->btc_children + @@ -1459,8 +1498,10 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) /* Update bookkeeping */ l_hdr->bth_count += hdr->bth_count + 1; - for (int i = 0; i <= l_hdr->bth_count; i++) + for (int i = 0; i <= l_hdr->bth_count; i++) { ASSERT3P(left->btc_children[i]->bth_parent, ==, left); + ASSERT3P(left->btc_children[i], !=, rm_hdr); + } ASSERT3U(l_hdr->bth_count, ==, BTREE_CORE_ELEMS); new_rm_hdr = hdr; } else { @@ -1478,43 +1519,34 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Overwrite rm_hdr and its separator by moving node's * elements and children forward. */ - uint8_t *e_start = node->btc_elems + idx * size; - uint8_t *e_out = e_start - size; - uint64_t e_count = hdr->bth_count - idx + 1; - bcopy(e_start, e_out, e_count * size); - - btree_hdr_t **c_start = node->btc_children + idx + 1; - btree_hdr_t **c_out = c_start - 1; - uint64_t c_count = hdr->bth_count - idx + 1; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + bt_shleft_at_core(tree, node, idx, hdr->bth_count - idx + 1, + B_FALSE); /* * Move the separator to the first open spot in node's * elements. */ - e_out += e_count * size; + uint8_t *e_out = node->btc_elems + (hdr->bth_count - idx) * + size; uint8_t *separator = parent->btc_elems + parent_idx * size; bcopy(separator, e_out, size); /* Move the right node's elements and children to node. */ - e_out += size; - e_start = right->btc_elems; - e_count = r_hdr->bth_count; - bcopy(e_start, e_out, e_count * size); - - c_out += c_count; - c_start = right->btc_children; - c_count = r_hdr->bth_count + 1; - bcopy(c_start, c_out, c_count * sizeof (*c_out)); + bt_transfer_core(tree, right, 0, r_hdr->bth_count, node, + hdr->bth_count - idx + 1, B_TRUE); /* Reparent the right node's children to point to node. 
*/ - for (int i = 0; i < c_count; i++) - c_out[i]->bth_parent = node; + for (int i = hdr->bth_count - idx; i < r_hdr->bth_count + 1; + i++) { + node->btc_children[i]->bth_parent = node; + } /* Update bookkeeping. */ hdr->bth_count += r_hdr->bth_count + 1; - for (int i = 0; i <= hdr->bth_count; i++) + for (int i = 0; i <= hdr->bth_count; i++) { ASSERT3P(node->btc_children[i]->bth_parent, ==, node); + ASSERT3P(node->btc_children[i], !=, rm_hdr); + } ASSERT3U(hdr->bth_count, ==, BTREE_CORE_ELEMS); new_rm_hdr = r_hdr; @@ -1590,8 +1622,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * the value and return. */ if (hdr->bth_count >= min_count || hdr->bth_parent == NULL) { - bcopy(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + - idx * size, (hdr->bth_count - idx) * size); + bt_shleft_at_leaf(tree, leaf, idx + 1, hdr->bth_count - idx); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); if (hdr->bth_count == 0) { @@ -1635,7 +1666,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. */ - bcopy(leaf->btl_elems, leaf->btl_elems + size, idx * size); + bt_shright_at_leaf(tree, leaf, 0, idx); uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; uint8_t *steal_elem = ((btree_leaf_t *)l_hdr)->btl_elems + @@ -1668,8 +1699,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * by one spot to make room for the stolen element and * overwrite the element being removed. */ - bcopy(leaf->btl_elems + (idx + 1) * size, leaf->btl_elems + - idx * size, (hdr->bth_count - idx) * size); + bt_shleft_at_leaf(tree, leaf, idx + 1, hdr->bth_count - idx); uint8_t *separator = parent->btc_elems + parent_idx * size; uint8_t *steal_elem = ((btree_leaf_t *)r_hdr)->btl_elems; @@ -1687,8 +1717,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our neighbors elements forwards to overwrite the * stolen element. */ - bcopy(neighbor->btl_elems + size, neighbor->btl_elems, - r_hdr->bth_count * size); + bt_shleft_at_leaf(tree, neighbor, 1, r_hdr->bth_count); #ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); #endif @@ -1728,15 +1757,10 @@ btree_remove_from(btree_t *tree, btree_index_t *where) bcopy(separator, out, size); /* Move our elements to the left neighbor. */ - out += size; - uint8_t *start = leaf->btl_elems; - uint64_t count = idx; - bcopy(start, out, count * size); - - out += count * size; - start += (count + 1) * size; - count = hdr->bth_count - idx; - bcopy(start, out, count * size); + bt_transfer_leaf(tree, leaf, 0, idx, left, l_hdr->bth_count + + 1); + bt_transfer_leaf(tree, leaf, idx + 1, hdr->bth_count - idx, + left, l_hdr->bth_count + 1 + idx); /* Update the bookkeeping. */ l_hdr->bth_count += hdr->bth_count + 1; @@ -1757,21 +1781,16 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our elements left to overwrite the element being * removed. */ - uint8_t *start = leaf->btl_elems + (idx + 1) * size; - uint8_t *out = start - size; - uint64_t count = hdr->bth_count - idx; - bcopy(start, out, count * size); + bt_shleft_at_leaf(tree, leaf, idx + 1, hdr->bth_count - idx); /* Move the separator to node's first open spot. */ - out += count * size; + uint8_t *out = leaf->btl_elems + hdr->bth_count * size; uint8_t *separator = parent->btc_elems + parent_idx * size; bcopy(separator, out, size); /* Move the right neighbor's elements to node. 
*/ - out += size; - start = right->btl_elems; - count = r_hdr->bth_count; - bcopy(start, out, count * size); + bt_transfer_leaf(tree, right, 0, r_hdr->bth_count, leaf, + hdr->bth_count + 1); /* Update the bookkeeping. */ hdr->bth_count += r_hdr->bth_count + 1; From e76980de221c26137cf8bd771bfb117cbd1e2020 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 27 Aug 2019 10:05:08 -0700 Subject: [PATCH 06/18] style --- module/zfs/btree.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 010f6f904b12..4baa6b9c770e 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -45,11 +45,11 @@ kmem_cache_t *btree_leaf_cache; * of each node is poisoned appropriately. Note that poisoning always occurs if * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal * operation. - * + * * Intensity 4 and 5 are particularly expensive to perform; the previous levels * are a few memory operations per node, while these levels require multiple * operations per element. In addition, when creating large btrees, these - * operations are called at every step, resulting in extremely sloww operation + * operations are called at every step, resulting in extremely slow operation * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ @@ -60,9 +60,9 @@ int btree_verify_intensity = 0; #endif #ifdef _ILP32 -#define BTREE_POISON 0xabadb10c +#define BTREE_POISON 0xabadb10c #else -#define BTREE_POISON 0xabadb10cdeadbeef +#define BTREE_POISON 0xabadb10cdeadbeef #endif #ifdef ZFS_DEBUG @@ -406,7 +406,7 @@ bt_shleft_at_leaf(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, } static inline void -bt_transfer_core(btree_t* tree, btree_core_t *source, uint64_t sidx, +bt_transfer_core(btree_t *tree, btree_core_t *source, uint64_t sidx, uint64_t count, btree_core_t *dest, uint64_t didx, boolean_t trap) { size_t size = tree->bt_elem_size; @@ -423,7 +423,7 @@ bt_transfer_core(btree_t* tree, btree_core_t *source, uint64_t sidx, } static inline void -bt_transfer_leaf(btree_t* tree, btree_leaf_t *source, uint64_t sidx, +bt_transfer_leaf(btree_t *tree, btree_leaf_t *source, uint64_t sidx, uint64_t count, btree_leaf_t *dest, uint64_t didx) { size_t size = tree->bt_elem_size; @@ -572,7 +572,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Copy the back part of the elements and children to the new * leaf. */ - bt_transfer_core(tree, parent, keep_count, move_count, new_parent, 0, B_TRUE); + bt_transfer_core(tree, parent, keep_count, move_count, + new_parent, 0, B_TRUE); /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); @@ -583,7 +584,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Shift the remaining elements and children in the front half * to handle the new value. */ - bt_shright_at_core(tree, parent, offset, keep_count - 1 - offset, B_FALSE); + bt_shright_at_core(tree, parent, offset, keep_count - 1 - + offset, B_FALSE); uint8_t *e_start = parent->btc_elems + offset * size; bcopy(buf, e_start, size); @@ -930,7 +932,8 @@ btree_bulk_finish(btree_t *tree) } } /* First, shift things in the right node back. */ - bt_shift_at_core(tree, cur, 0, hdr->bth_count, move_count, B_TRUE, B_FALSE); + bt_shift_at_core(tree, cur, 0, hdr->bth_count, move_count, + B_TRUE, B_FALSE); /* Next, move the separator to the right node. 
*/ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * @@ -944,7 +947,8 @@ btree_bulk_finish(btree_t *tree) */ move_count--; uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; - bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, B_TRUE); + bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, + B_TRUE); /* * Finally, move the last element in the left node to the @@ -1485,8 +1489,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) bcopy(separator, e_out, size); /* Move all our elements and children into the left node. */ - bt_transfer_core(tree, node, 0, idx - 1, left, l_hdr->bth_count + - 1, B_TRUE); + bt_transfer_core(tree, node, 0, idx - 1, left, + l_hdr->bth_count + 1, B_TRUE); bt_transfer_core(tree, node, idx, hdr->bth_count - idx + 1, left, l_hdr->bth_count + idx, B_FALSE); From 12717e8425678a87bb488414920f047ce11beac5 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 29 Aug 2019 09:57:32 -0700 Subject: [PATCH 07/18] matt feedback --- module/zfs/btree.c | 181 +++++++++++++++++++++++++++------------------ 1 file changed, 107 insertions(+), 74 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 4baa6b9c770e..b2dbe3e2ee35 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -298,54 +298,80 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) /* * To explain the following functions, it is useful to understand the four - * kinds of shifts used in btree operation. A shift has two fundamental - * properties, each of which can be one of two values, making four shifts. - * There is the direction of the shift (left or right) and the shape of the - * shift (parallelogram or isoceles trapezoid (shortened to trapezoid - * hereafter)). The shape distinction only applies to shifts of core nodes. + * kinds of shifts used in btree operation. First, a shift is a movement of + * elements within a node. It is used to create gaps for inserting new + * elements and children, or cover gaps created when things are removed. A + * shift has two fundamental properties, each of which can be one of two + * values, making four types of shifts. There is the direction of the shift + * (left or right) and the shape of the shift (parallelogram or isoceles + * trapezoid (shortened to trapezoid hereafter)). The shape distinction only + * applies to shifts of core nodes. * * The names derive from the following imagining of the layout of a node: * * Elements: * * * * * * * ... * * * - * * Children: * * * * * * * * ... * * * * * This layout follows from the fact that the elements act as separators * between pairs of children, and that children root subtrees "below" the - * current node. The distinction between a left and right shift is clear. A - * parallelogram shift is a shift with the same number of elements and children - * being moved, while a trapezoid shift is a shift that moves one more children - * than elements. + * current node. A left and right shift are fairly self-explanatory; a left + * shift moves things to the left, while a right shift moves things to the + * right. A parallelogram shift is a shift with the same number of elements + * and children being moved, while a trapezoid shift is a shift that moves one + * more children than elements. An example follows: * - * Note that a parellelogram shift is always shaped like a left-leaning - * parallelogram; the starting index of the children being moved is always one - * higher than the starting index of the elements being moved. 
No right-leaning - * parallelogram shifts are needed to achieve any btree operations, so we ignore - * them. + * A parallelogram shift could contain the following: + * _______________ + * \* * * * \ * * * ... * * * + * * \ * * * *\ * * * ... * * * + * --------------- + * A trapezoid shift could contain the following: + * ___________ + * * / * * * \ * * * ... * * * + * * / * * * *\ * * * ... * * * + * --------------- + * + * Note that a parellelogram shift is always shaped like a "left-leaning" + * parallelogram, where the starting index of the children being moved is + * always one higher than the starting index of the elements being moved. No + * "right-leaning" parallelogram shifts are needed (shifts where the starting + * element index and starting child index being moved are the same) to achieve + * any btree operations, so we ignore them. */ +enum bt_shift_shape { + BSS_TRAPEZOID, + BSS_PARALLELOGRAM +}; + +enum bt_shift_direction { + BSD_LEFT, + BSD_RIGHT +}; + /* - * Shift elements and children in the provided core node by off spots. - * The first element moved is idx, and count elements are moved. The - * shape of the shift is determined by trap; true if the shift is a trapezoid, - * false if it is a parallelogram. The direction is determined by left. + * Shift elements and children in the provided core node by off spots. The + * first element moved is idx, and count elements are moved. The shape of the + * shift is determined by shape. The direction is determined by dir. */ static inline void -bt_shift_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, - uint64_t count, uint64_t off, boolean_t trap, boolean_t left) +bt_shift_core(btree_t *tree, btree_core_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_shape shape, + enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; ASSERT(node->btc_hdr.bth_core); uint8_t *e_start = node->btc_elems + idx * size; - uint8_t *e_out = (left ? e_start - (off * size) : e_start + (off * - size)); + int sign = (dir == BSD_LEFT ? -1 : +1); + uint8_t *e_out = e_start + sign * off * size; uint64_t e_count = count; bcopy(e_start, e_out, e_count * size); - btree_hdr_t **c_start = node->btc_children + idx + (trap ? 0 : 1); - btree_hdr_t **c_out = (left ? c_start - off : c_start + off); - uint64_t c_count = count + (trap ? 1 : 0); + btree_hdr_t **c_start = node->btc_children + idx + + (shape == BSS_TRAPEZOID ? 0 : 1); + btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bcopy(c_start, c_out, c_count * sizeof (*c_start)); } @@ -356,10 +382,10 @@ bt_shift_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, * false if it is a parallelogram. */ static inline void -bt_shleft_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, - uint64_t count, boolean_t trap) +bt_shift_core_left(btree_t *tree, btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) { - bt_shift_at_core(tree, node, idx, count, 1, trap, B_TRUE); + bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); } /* @@ -367,10 +393,10 @@ bt_shleft_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, * Starts with elements[idx] and children[idx] and one more child than element. 
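 * For example (an illustrative aside, not part of the patch): with idx = 2
 * and count = 3,
 *
 *	bt_shift_core_right(tree, node, 2, 3, BSS_PARALLELOGRAM);
 *
 * moves elements [2, 4] to [3, 5] and children [3, 5] to [4, 6], while
 *
 *	bt_shift_core_right(tree, node, 2, 3, BSS_TRAPEZOID);
 *
 * moves the same elements but children [2, 5] to [3, 6], opening both an
 * element slot and a child slot at index 2 for an incoming separator and
 * child.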
*/ static inline void -bt_shright_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, - uint64_t count, boolean_t trap) +bt_shift_core_right(btree_t *tree, btree_core_t *node, uint64_t idx, + uint64_t count, enum bt_shift_shape shape) { - bt_shift_at_core(tree, node, idx, count, 1, trap, B_FALSE); + bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); } /* @@ -379,35 +405,40 @@ bt_shright_at_core(btree_t *tree, btree_core_t *node, uint64_t idx, * is determined by left. */ static inline void -bt_shift_at_leaf(btree_t *tree, btree_leaf_t *node, uint64_t idx, - uint64_t count, uint64_t off, boolean_t left) +bt_shift_leaf(btree_t *tree, btree_leaf_t *node, uint64_t idx, + uint64_t count, uint64_t off, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; ASSERT(!node->btl_hdr.bth_core); uint8_t *start = node->btl_elems + idx * size; - uint8_t *out = (left ? start - (off * size) : start + (off * - size)); + int sign = (dir == BSD_LEFT ? -1 : +1); + uint8_t *out = start + sign * off * size; bcopy(start, out, count * size); } static inline void -bt_shright_at_leaf(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, +bt_shift_leaf_right(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, uint64_t count) { - bt_shift_at_leaf(tree, leaf, idx, count, 1, B_FALSE); + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); } static inline void -bt_shleft_at_leaf(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, +bt_shift_leaf_left(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, uint64_t count) { - bt_shift_at_leaf(tree, leaf, idx, count, 1, B_TRUE); + bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); } +/* + * Move children and elements from one core node to another. The shape + * parameter behaves the same as it does in the shift logic. + */ static inline void bt_transfer_core(btree_t *tree, btree_core_t *source, uint64_t sidx, - uint64_t count, btree_core_t *dest, uint64_t didx, boolean_t trap) + uint64_t count, btree_core_t *dest, uint64_t didx, + enum bt_shift_shape shape) { size_t size = tree->bt_elem_size; ASSERT(source->btc_hdr.bth_core); @@ -416,9 +447,9 @@ bt_transfer_core(btree_t *tree, btree_core_t *source, uint64_t sidx, bcopy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, count * size); - uint64_t c_count = count + (trap ? 1 : 0); - bcopy(source->btc_children + sidx + (trap ? 0 : 1), - dest->btc_children + didx + (trap ? 0 : 1), + uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); + bcopy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1), c_count * sizeof (*source->btc_children)); } @@ -522,7 +553,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, /* Shift existing elements and children */ uint64_t count = par_hdr->bth_count - offset; - bt_shright_at_core(tree, parent, offset, count, B_FALSE); + bt_shift_core_right(tree, parent, offset, count, + BSS_PARALLELOGRAM); /* Insert new values */ parent->btc_children[offset + 1] = new_node; @@ -573,7 +605,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * leaf. */ bt_transfer_core(tree, parent, keep_count, move_count, - new_parent, 0, B_TRUE); + new_parent, 0, BSS_TRAPEZOID); /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); @@ -584,8 +616,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Shift the remaining elements and children in the front half * to handle the new value. 
*/ - bt_shright_at_core(tree, parent, offset, keep_count - 1 - - offset, B_FALSE); + bt_shift_core_right(tree, parent, offset, keep_count - 1 - + offset, BSS_PARALLELOGRAM); uint8_t *e_start = parent->btc_elems + offset * size; bcopy(buf, e_start, size); @@ -611,7 +643,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, */ uint64_t e_count = offset - keep_count - 1; bt_transfer_core(tree, parent, keep_count + 1, - e_count, new_parent, 0, B_TRUE); + e_count, new_parent, 0, BSS_TRAPEZOID); /* Add the new value to the new leaf. */ uint8_t *e_out = new_parent->btc_elems + e_count * size; @@ -630,7 +662,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, /* Move the rest of the back half to the new leaf. */ bt_transfer_core(tree, parent, keep_count + 2, BTREE_CORE_ELEMS - offset, new_parent, e_count + 1, - B_FALSE); + BSS_PARALLELOGRAM); } else { /* * The new value is the new separator, no change. @@ -639,7 +671,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * leaf. */ bt_transfer_core(tree, parent, keep_count, move_count, - new_parent, 0, B_FALSE); + new_parent, 0, BSS_PARALLELOGRAM); new_parent->btc_children[0] = new_node; } #ifdef ZFS_DEBUG @@ -682,7 +714,7 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, leaf->btl_hdr.bth_count); } leaf->btl_hdr.bth_count++; - bt_shright_at_leaf(tree, leaf, idx, count); + bt_shift_leaf_right(tree, leaf, idx, count); bcopy(value, start, size); return; } @@ -740,7 +772,7 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, * Shift the remaining elements in the front part to handle * the new value. */ - bt_shright_at_leaf(tree, leaf, idx, keep_count - 1 - idx); + bt_shift_leaf_right(tree, leaf, idx, keep_count - 1 - idx); bcopy(value, leaf->btl_elems + idx * size, size); } else if (idx > keep_count) { /* Store the new separator in a buffer. */ @@ -862,8 +894,8 @@ btree_bulk_finish(btree_t *tree) } /* First, shift elements in leaf back. */ - bt_shift_at_leaf(tree, leaf, 0, hdr->bth_count, move_count, - B_FALSE); + bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, + BSD_RIGHT); /* Next, move the separator from the common ancestor to leaf. */ uint8_t *separator = common->btc_elems + (common_idx * size); @@ -932,8 +964,8 @@ btree_bulk_finish(btree_t *tree) } } /* First, shift things in the right node back. */ - bt_shift_at_core(tree, cur, 0, hdr->bth_count, move_count, - B_TRUE, B_FALSE); + bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, + BSS_TRAPEZOID, BSD_RIGHT); /* Next, move the separator to the right node. */ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * @@ -948,7 +980,7 @@ btree_bulk_finish(btree_t *tree) move_count--; uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, - B_TRUE); + BSS_TRAPEZOID); /* * Finally, move the last element in the left node to the @@ -1348,8 +1380,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift the element and children to the right of rm_hdr to * the left by one spot. 
*/ - bt_shleft_at_core(tree, node, idx, hdr->bth_count - idx + 1, - B_FALSE); + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, + BSS_PARALLELOGRAM); #ifdef ZFS_DEBUG btree_poison_node_at(tree, hdr, hdr->bth_count); #endif @@ -1385,7 +1417,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Start by shifting the elements and children in the current * node to the right by one spot. */ - bt_shright_at_core(tree, node, 0, idx - 1, B_TRUE); + bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); /* * Move the separator between node and neighbor to the first @@ -1424,8 +1456,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift elements in node left by one spot to overwrite rm_hdr * and the separator before it. */ - bt_shleft_at_core(tree, node, idx, hdr->bth_count - idx + 1, - B_FALSE); + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, + BSS_PARALLELOGRAM); /* * Move the separator between node and neighbor to the last @@ -1453,7 +1485,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Shift the elements and children of neighbor to cover the * stolen elements. */ - bt_shleft_at_core(tree, neighbor, 1, r_hdr->bth_count, B_TRUE); + bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, + BSS_TRAPEZOID); #ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); #endif @@ -1490,9 +1523,9 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) /* Move all our elements and children into the left node. */ bt_transfer_core(tree, node, 0, idx - 1, left, - l_hdr->bth_count + 1, B_TRUE); + l_hdr->bth_count + 1, BSS_TRAPEZOID); bt_transfer_core(tree, node, idx, hdr->bth_count - idx + 1, - left, l_hdr->bth_count + idx, B_FALSE); + left, l_hdr->bth_count + idx, BSS_PARALLELOGRAM); /* Reparent all our children to point to the left node. */ btree_hdr_t **new_start = left->btc_children + @@ -1523,8 +1556,8 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Overwrite rm_hdr and its separator by moving node's * elements and children forward. */ - bt_shleft_at_core(tree, node, idx, hdr->bth_count - idx + 1, - B_FALSE); + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, + BSS_PARALLELOGRAM); /* * Move the separator to the first open spot in node's @@ -1537,7 +1570,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) /* Move the right node's elements and children to node. */ bt_transfer_core(tree, right, 0, r_hdr->bth_count, node, - hdr->bth_count - idx + 1, B_TRUE); + hdr->bth_count - idx + 1, BSS_TRAPEZOID); /* Reparent the right node's children to point to node. */ for (int i = hdr->bth_count - idx; i < r_hdr->bth_count + 1; @@ -1626,7 +1659,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * the value and return. */ if (hdr->bth_count >= min_count || hdr->bth_parent == NULL) { - bt_shleft_at_leaf(tree, leaf, idx + 1, hdr->bth_count - idx); + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); if (hdr->bth_count == 0) { @@ -1670,7 +1703,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. 
*/ - bt_shright_at_leaf(tree, leaf, 0, idx); + bt_shift_leaf_right(tree, leaf, 0, idx); uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; uint8_t *steal_elem = ((btree_leaf_t *)l_hdr)->btl_elems + @@ -1703,7 +1736,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * by one spot to make room for the stolen element and * overwrite the element being removed. */ - bt_shleft_at_leaf(tree, leaf, idx + 1, hdr->bth_count - idx); + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); uint8_t *separator = parent->btc_elems + parent_idx * size; uint8_t *steal_elem = ((btree_leaf_t *)r_hdr)->btl_elems; @@ -1721,7 +1754,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our neighbors elements forwards to overwrite the * stolen element. */ - bt_shleft_at_leaf(tree, neighbor, 1, r_hdr->bth_count); + bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); #ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); #endif @@ -1785,7 +1818,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * Move our elements left to overwrite the element being * removed. */ - bt_shleft_at_leaf(tree, leaf, idx + 1, hdr->bth_count - idx); + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); /* Move the separator to node's first open spot. */ uint8_t *out = leaf->btl_elems + hdr->bth_count * size; From 723aef254dc4204bfe3cc5afb86124f8730b8f63 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 29 Aug 2019 15:48:58 -0700 Subject: [PATCH 08/18] bugfix --- module/zfs/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index b2dbe3e2ee35..bda5184ab467 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -660,7 +660,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, kmem_free(tmp_buf, size); /* Move the rest of the back half to the new leaf. 
*/ - bt_transfer_core(tree, parent, keep_count + 2, + bt_transfer_core(tree, parent, offset, BTREE_CORE_ELEMS - offset, new_parent, e_count + 1, BSS_PARALLELOGRAM); } else { From ea333def6a5c1754f368b280b4644c6290130953 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 30 Aug 2019 10:45:32 -0700 Subject: [PATCH 09/18] change AVL_ macros to TREE_ macros --- cmd/zdb/zdb.c | 2 +- include/sys/avl.h | 6 +++--- lib/libzfs/libzfs_dataset.c | 2 +- lib/libzfs/libzfs_iter.c | 2 +- lib/libzfs/libzfs_sendrecv.c | 2 +- lib/libzutil/zutil_import.c | 4 ++-- module/os/linux/zfs/zfs_znode.c | 2 +- module/zfs/ddt.c | 2 +- module/zfs/dmu_objset.c | 2 +- module/zfs/dmu_recv.c | 2 +- module/zfs/dmu_send.c | 14 +++++++------- module/zfs/dnode.c | 6 +++--- module/zfs/dsl_bookmark.c | 6 +++--- module/zfs/dsl_deadlist.c | 8 ++++---- module/zfs/dsl_deleg.c | 2 +- module/zfs/dsl_scan.c | 2 +- module/zfs/metaslab.c | 20 ++++++++++---------- module/zfs/sa.c | 6 +++--- module/zfs/spa.c | 2 +- module/zfs/spa_misc.c | 6 +++--- module/zfs/space_reftree.c | 4 ++-- module/zfs/unique.c | 2 +- module/zfs/vdev_cache.c | 4 ++-- module/zfs/vdev_label.c | 6 +++--- module/zfs/vdev_queue.c | 8 ++++---- module/zfs/zap_micro.c | 4 ++-- module/zfs/zfs_fuid.c | 4 ++-- module/zfs/zfs_rlock.c | 2 +- module/zfs/zil.c | 8 ++++---- 29 files changed, 70 insertions(+), 70 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9b2e4fdfd0bf..3bd8054322df 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3142,7 +3142,7 @@ cksum_record_compare(const void *x1, const void *x2) int difference; for (int i = 0; i < arraysize; i++) { - difference = AVL_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); + difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } diff --git a/include/sys/avl.h b/include/sys/avl.h index 962e8b1cfb6f..6c4e7ed5cfbc 100644 --- a/include/sys/avl.h +++ b/include/sys/avl.h @@ -108,9 +108,9 @@ extern "C" { /* * AVL comparator helpers */ -#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) -#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) -#define AVL_PCMP(a, b) \ +#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define TREE_PCMP(a, b) \ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) /* diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index ef638a49806c..e88ace53f8ce 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -796,7 +796,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } void diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c index d3ba58a9e1ed..31b2ca30e6a1 100644 --- a/lib/libzfs/libzfs_iter.c +++ b/lib/libzfs/libzfs_iter.c @@ -302,7 +302,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg) lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - return (AVL_CMP(lcreate, rcreate)); + return (TREE_CMP(lcreate, rcreate)); } int diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 0b6b3156829b..29cd24eed3c0 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -511,7 +511,7 @@ fsavl_compare(const void *arg1, const void *arg2) const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; - return (AVL_CMP(fn1->fn_guid, fn2->fn_guid)); + return 
(TREE_CMP(fn1->fn_guid, fn2->fn_guid)); } /* diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index e85ce459448e..b8cdc118b263 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -972,11 +972,11 @@ slice_cache_compare(const void *arg1, const void *arg2) uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; int rv; - rv = AVL_ISIGN(strcmp(nm1, nm2)); + rv = TREE_ISIGN(strcmp(nm1, nm2)); if (rv) return (rv); - return (AVL_CMP(guid1, guid2)); + return (TREE_CMP(guid1, guid2)); } static int diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index a4d336abe0a1..ebfaae67bac1 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -247,7 +247,7 @@ zfs_znode_hold_compare(const void *a, const void *b) const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; - return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj)); + return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } boolean_t diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index a0f90496a4d1..5e7b31c76396 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -783,7 +783,7 @@ ddt_entry_compare(const void *x1, const void *x2) break; } - return (AVL_ISIGN(cmp)); + return (TREE_ISIGN(cmp)); } static ddt_t * diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index de57b170f0db..f7498854a8d1 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1788,7 +1788,7 @@ userquota_compare(const void *l, const void *r) */ rv = strcmp(luqn->uqn_id, ruqn->uqn_id); - return (AVL_ISIGN(rv)); + return (TREE_ISIGN(rv)); } static void diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 6249e165fa5d..48c3705c65a6 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1209,7 +1209,7 @@ guid_compare(const void *arg1, const void *arg2) const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - return (AVL_CMP(gmep1->guid, gmep2->guid)); + return (TREE_CMP(gmep1->guid, gmep2->guid)); } static void diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 42116a1c35b0..ae4f91fc3439 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1329,11 +1329,11 @@ send_range_after(const struct send_range *from, const struct send_range *to) return (-1); if (from_obj >= to_end_obj) return (1); - int64_t cmp = AVL_CMP(to->type == OBJECT_RANGE, from->type == + int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type == OBJECT_RANGE); if (unlikely(cmp)) return (cmp); - cmp = AVL_CMP(to->type == OBJECT, from->type == OBJECT); + cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT); if (unlikely(cmp)) return (cmp); if (from->end_blkid <= to->start_blkid) @@ -1402,7 +1402,7 @@ send_range_start_compare(struct send_range *r1, struct send_range *r2) uint64_t r1_l0equiv = r1->start_blkid; uint64_t r2_objequiv = r2->object; uint64_t r2_l0equiv = r2->start_blkid; - int64_t cmp = AVL_CMP(r1->eos_marker, r2->eos_marker); + int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker); if (unlikely(cmp)) return (cmp); if (r1->object == 0) { @@ -1414,17 +1414,17 @@ send_range_start_compare(struct send_range *r1, struct send_range *r2) r2_l0equiv = 0; } - cmp = AVL_CMP(r1_objequiv, r2_objequiv); + cmp = TREE_CMP(r1_objequiv, r2_objequiv); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); + cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); if 
(unlikely(cmp)) return (cmp); - cmp = AVL_CMP(r2->type == OBJECT, r1->type == OBJECT); + cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT); if (unlikely(cmp)) return (cmp); - return (AVL_CMP(r1_l0equiv, r2_l0equiv)); + return (TREE_CMP(r1_l0equiv, r2_l0equiv)); } enum q_idx { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 85dbc0664857..9bca72d53cb7 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -89,11 +89,11 @@ dbuf_compare(const void *x1, const void *x2) const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - int cmp = AVL_CMP(d1->db_level, d2->db_level); + int cmp = TREE_CMP(d1->db_level, d2->db_level); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + cmp = TREE_CMP(d1->db_blkid, d2->db_blkid); if (likely(cmp)) return (cmp); @@ -105,7 +105,7 @@ dbuf_compare(const void *x1, const void *x2) return (1); } - return (AVL_PCMP(d1, d2)); + return (TREE_PCMP(d1, d2)); } /* ARGSUSED */ diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 2126f3d9bdb0..42c612abc945 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -595,16 +595,16 @@ dsl_bookmark_compare(const void *l, const void *r) const dsl_bookmark_node_t *ldbn = l; const dsl_bookmark_node_t *rdbn = r; - int64_t cmp = AVL_CMP(ldbn->dbn_phys.zbm_creation_txg, + int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg, rdbn->dbn_phys.zbm_creation_txg); if (likely(cmp)) return (cmp); - cmp = AVL_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), + cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); if (likely(cmp)) return (cmp); cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); - return (AVL_ISIGN(cmp)); + return (TREE_ISIGN(cmp)); } /* diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 2e4d18ade604..8cb0f90fb891 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -118,7 +118,7 @@ dsl_deadlist_compare(const void *arg1, const void *arg2) const dsl_deadlist_entry_t *dle1 = arg1; const dsl_deadlist_entry_t *dle2 = arg2; - return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); + return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static int @@ -127,7 +127,7 @@ dsl_deadlist_cache_compare(const void *arg1, const void *arg2) const dsl_deadlist_cache_entry_t *dlce1 = arg1; const dsl_deadlist_cache_entry_t *dlce2 = arg2; - return (AVL_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); + return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); } static void @@ -917,14 +917,14 @@ livelist_compare(const void *larg, const void *rarg) uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); if (l_dva0_vdev != r_dva0_vdev) - return (AVL_CMP(l_dva0_vdev, r_dva0_vdev)); + return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); /* if vdevs are equal, sort by offsets. 
*/ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); if (l_dva0_offset == r_dva0_offset) ASSERT3U(l->blk_birth, ==, r->blk_birth); - return (AVL_CMP(l_dva0_offset, r_dva0_offset)); + return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } struct livelist_iter_arg { diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index cef460f02041..cf8a3c9bbdfb 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -399,7 +399,7 @@ perm_set_compare(const void *arg1, const void *arg2) val = strcmp(node1->p_setname, node2->p_setname); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 0646af57652f..997e0ea82863 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -4076,7 +4076,7 @@ sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; - return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); + return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. */ diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3f1e401585b2..45cd131629db 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -689,13 +689,13 @@ metaslab_compare(const void *x1, const void *x2) if (sort1 > sort2) return (1); - int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); if (likely(cmp)) return (cmp); - IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); - return (AVL_CMP(m1->ms_start, m2->ms_start)); + return (TREE_CMP(m1->ms_start, m2->ms_start)); } /* @@ -792,17 +792,17 @@ metaslab_sort_by_flushed(const void *va, const void *vb) const metaslab_t *a = va; const metaslab_t *b = vb; - int cmp = AVL_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); if (likely(cmp)) return (cmp); uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; - cmp = AVL_CMP(a_vdev_id, b_vdev_id); + cmp = TREE_CMP(a_vdev_id, b_vdev_id); if (cmp) return (cmp); - return (AVL_CMP(a->ms_id, b->ms_id)); + return (TREE_CMP(a->ms_id, b->ms_id)); } metaslab_group_t * @@ -1305,11 +1305,11 @@ metaslab_rangesize32_compare(const void *x1, const void *x2) uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - int cmp = AVL_CMP(rs_size1, rs_size2); + int cmp = TREE_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); - return (AVL_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); } /* @@ -1325,11 +1325,11 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - int cmp = AVL_CMP(rs_size1, rs_size2); + int cmp = TREE_CMP(rs_size1, rs_size2); if (likely(cmp)) return (cmp); - return (AVL_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); } typedef struct metaslab_rt_arg { btree_t *mra_bt; diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 98bd2d3302a0..cc2a681bdfc7 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -252,7 +252,7 @@ layout_num_compare(const void *arg1, const void *arg2) const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - return (AVL_CMP(node1->lot_num, node2->lot_num)); + return (TREE_CMP(node1->lot_num, 
node2->lot_num)); } static int @@ -261,11 +261,11 @@ layout_hash_compare(const void *arg1, const void *arg2) const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; - int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(node1->lot_instance, node2->lot_instance)); + return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 7e5c474eb2e2..529ffc03ddb3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -939,7 +939,7 @@ spa_error_entry_compare(const void *a, const void *b) ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - return (AVL_ISIGN(ret)); + return (TREE_ISIGN(ret)); } /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 5bd89faeb8e8..e7fe0a8f6fb5 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -620,7 +620,7 @@ spa_log_sm_sort_by_txg(const void *va, const void *vb) const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; - return (AVL_CMP(a->sls_txg, b->sls_txg)); + return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* @@ -940,7 +940,7 @@ spa_aux_compare(const void *a, const void *b) const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; - return (AVL_CMP(sa->aux_guid, sb->aux_guid)); + return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } void @@ -2271,7 +2271,7 @@ spa_name_compare(const void *a1, const void *a2) s = strcmp(s1->spa_name, s2->spa_name); - return (AVL_ISIGN(s)); + return (TREE_ISIGN(s)); } void diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index 5dc604faa8c4..ef9b0473bc15 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -57,11 +57,11 @@ space_reftree_compare(const void *x1, const void *x2) const space_ref_t *sr1 = (const space_ref_t *)x1; const space_ref_t *sr2 = (const space_ref_t *)x2; - int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(sr1, sr2)); + return (TREE_PCMP(sr1, sr2)); } void diff --git a/module/zfs/unique.c b/module/zfs/unique.c index 5cdd025f49bc..0e076797a002 100644 --- a/module/zfs/unique.c +++ b/module/zfs/unique.c @@ -45,7 +45,7 @@ unique_compare(const void *a, const void *b) const unique_t *una = (const unique_t *)a; const unique_t *unb = (const unique_t *)b; - return (AVL_CMP(una->un_value, unb->un_value)); + return (TREE_CMP(una->un_value, unb->un_value)); } void diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 35a5b480b6ed..6cf449d4a00e 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -111,7 +111,7 @@ vdev_cache_offset_compare(const void *a1, const void *a2) const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - return (AVL_CMP(ve1->ve_offset, ve2->ve_offset)); + return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); } static int @@ -120,7 +120,7 @@ vdev_cache_lastused_compare(const void *a1, const void *a2) const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - int cmp = AVL_CMP(ve1->ve_lastused, ve2->ve_lastused); + int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); if (likely(cmp)) return (cmp); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 6320732ed6da..6bb3c3c68072 100644 --- 
a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1197,12 +1197,12 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) static int vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); if (likely(cmp)) return (cmp); - cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); if (likely(cmp)) return (cmp); @@ -1226,7 +1226,7 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) seq2 = MMP_SEQ(ub2); - return (AVL_CMP(seq1, seq2)); + return (TREE_CMP(seq1, seq2)); } struct ubl_cbdata { diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 5e4db3df6de1..e156e2b0139f 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -218,12 +218,12 @@ vdev_queue_offset_compare(const void *x1, const void *x2) const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_offset, z2->io_offset); + int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static inline avl_tree_t * @@ -250,12 +250,12 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2) const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = AVL_CMP(z1->io_timestamp, z2->io_timestamp); + int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); if (likely(cmp)) return (cmp); - return (AVL_PCMP(z1, z2)); + return (TREE_PCMP(z1, z2)); } static int diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 467812ff637c..c3d42d0ee159 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -280,11 +280,11 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); if (likely(cmp)) return (cmp); - return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); } static void diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 78071707a908..925d1934f794 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -73,7 +73,7 @@ idx_compare(const void *arg1, const void *arg2) const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - return (AVL_CMP(node1->f_idx, node2->f_idx)); + return (TREE_CMP(node1->f_idx, node2->f_idx)); } /* @@ -88,7 +88,7 @@ domain_compare(const void *arg1, const void *arg2) val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - return (AVL_ISIGN(val)); + return (TREE_ISIGN(val)); } void diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c index 94203a40c582..db587c731dc0 100644 --- a/module/zfs/zfs_rlock.c +++ b/module/zfs/zfs_rlock.c @@ -109,7 +109,7 @@ zfs_rangelock_compare(const void *arg1, const void *arg2) const locked_range_t *rl1 = (const locked_range_t *)arg1; const locked_range_t *rl2 = (const locked_range_t *)arg2; - return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); + return (TREE_CMP(rl1->lr_offset, rl2->lr_offset)); } /* diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 30a73515c7fe..7e65ac090396 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -146,11 +146,11 @@ zil_bp_compare(const void *x1, const void *x2) const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const 
dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); - return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); + return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -535,7 +535,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - return (AVL_CMP(v1, v2)); + return (TREE_CMP(v1, v2)); } static lwb_t * @@ -1869,7 +1869,7 @@ zil_aitx_compare(const void *x1, const void *x2) const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - return (AVL_CMP(o1, o2)); + return (TREE_CMP(o1, o2)); } /* From 9765f8832ffb3374c0c98f8c523147492f44fedf Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 30 Aug 2019 10:55:05 -0700 Subject: [PATCH 10/18] style Signed-off-by: Paul Dagnelie --- module/zfs/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index bda5184ab467..3fe6ff5acd45 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -330,7 +330,7 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) * * / * * * \ * * * ... * * * * * / * * * *\ * * * ... * * * * --------------- - * + * * Note that a parellelogram shift is always shaped like a "left-leaning" * parallelogram, where the starting index of the children being moved is * always one higher than the starting index of the elements being moved. No From 81fc8a86caa56f8f13a6eae70fb8999926538d57 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 3 Sep 2019 10:53:33 -0700 Subject: [PATCH 11/18] add static initializers Signed-off-by: Paul Dagnelie --- include/sys/range_tree.h | 2 +- module/zfs/metaslab.c | 10 +++++----- module/zfs/range_tree.c | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 6bc9690e3020..42a017953288 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -320,7 +320,7 @@ void rt_btree_destroy(range_tree_t *rt, void *arg); void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg); void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg); void rt_btree_vacate(range_tree_t *rt, void *arg); -extern struct range_tree_ops rt_btree_ops; +extern range_tree_ops_t rt_btree_ops; #ifdef __cplusplus } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 45cd131629db..5767ea86053d 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1449,11 +1449,11 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg) } static range_tree_ops_t metaslab_rt_ops = { - metaslab_rt_create, - metaslab_rt_destroy, - metaslab_rt_add, - metaslab_rt_remove, - metaslab_rt_vacate + .rtop_create = metaslab_rt_create, + .rtop_destroy = metaslab_rt_destroy, + .rtop_add = metaslab_rt_add, + .rtop_remove = metaslab_rt_remove, + .rtop_vacate = metaslab_rt_vacate }; /* diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index a98c4528f66b..df01e4dd9c31 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -788,11 +788,11 @@ rt_btree_vacate(range_tree_t *rt, void *arg) } range_tree_ops_t rt_btree_ops = { - rt_btree_create, - rt_btree_destroy, - rt_btree_add, - rt_btree_remove, - rt_btree_vacate + .rtop_create = rt_btree_create, + .rtop_destroy = rt_btree_destroy, + .rtop_add = 
rt_btree_add, + .rtop_remove = rt_btree_remove, + .rtop_vacate = rt_btree_vacate }; /* From 5d6cd17e21e90b9cd7a0096d1bc732d4303c5dde Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 6 Sep 2019 10:02:53 -0700 Subject: [PATCH 12/18] Matt feedback 1 Signed-off-by: Paul Dagnelie --- module/zfs/btree.c | 105 ++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 63 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 3fe6ff5acd45..c25c71c646da 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -65,10 +65,10 @@ int btree_verify_intensity = 0; #define BTREE_POISON 0xabadb10cdeadbeef #endif -#ifdef ZFS_DEBUG static void btree_poison_node(btree_t *tree, btree_hdr_t *hdr) { +#ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (!hdr->bth_core) { btree_leaf_t *leaf = (btree_leaf_t *)hdr; @@ -84,11 +84,13 @@ btree_poison_node(btree_t *tree, btree_hdr_t *hdr) (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); } +#endif } static inline void btree_poison_node_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) { +#ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; ASSERT3U(offset, >=, hdr->bth_count); if (!hdr->bth_core) { @@ -100,8 +102,8 @@ btree_poison_node_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) (btree_hdr_t *)BTREE_POISON; (void) memset(node->btc_elems + offset * size, 0x0f, size); } -} #endif +} static inline void btree_verify_poison_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) @@ -522,9 +524,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, tree->bt_height++; tree->bt_root = new_root_hdr; -#ifdef ZFS_DEBUG btree_poison_node(tree, new_root_hdr); -#endif return; } @@ -589,9 +589,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, new_par_hdr->bth_parent = par_hdr->bth_parent; new_par_hdr->bth_core = B_TRUE; new_par_hdr->bth_count = move_count; -#ifdef ZFS_DEBUG btree_poison_node(tree, new_par_hdr); -#endif + par_hdr->bth_count = keep_count; /* @@ -674,9 +673,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, new_parent, 0, BSS_PARALLELOGRAM); new_parent->btc_children[0] = new_node; } -#ifdef ZFS_DEBUG btree_poison_node(tree, par_hdr); -#endif for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) new_parent->btc_children[i]->bth_parent = new_parent; @@ -690,7 +687,6 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, */ btree_insert_into_parent(tree, &parent->btc_hdr, &new_parent->btc_hdr, buf); - } /* Helper function for inserting a new value into leaf at the given index. */ @@ -743,9 +739,8 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, new_hdr->bth_parent = leaf->btl_hdr.bth_parent; new_hdr->bth_core = B_FALSE; new_hdr->bth_count = move_count; -#ifdef ZFS_DEBUG btree_poison_node(tree, new_hdr); -#endif + leaf->btl_hdr.bth_count = keep_count; if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) @@ -800,9 +795,8 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, 0); } -#ifdef ZFS_DEBUG btree_poison_node(tree, &leaf->btl_hdr); -#endif + /* * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting, bur only of core nodes. @@ -834,7 +828,7 @@ btree_find_parent_idx(btree_t *tree, btree_hdr_t *hdr) * Take the b-tree out of bulk insert mode. During bulk-insert mode, some * nodes may violate the invariant that non-root nodes must be at least half * full. 
All nodes violating this invariant should be the last node in their - * particular level. To correct the invariant, we steal values from their left + * particular level. To correct the invariant, we take values from their left * neighbor until they are half full. They must have a left neighbor at their * level because the last node at a level is not the first node unless it's * the root. @@ -860,7 +854,7 @@ btree_bulk_finish(btree_t *tree) return; } - /* First, steal elements to rebalance the leaf node. */ + /* First, take elements to rebalance the leaf node. */ if (hdr->bth_count < capacity / 2) { /* * First, find the left neighbor. The simplest way to do this @@ -923,9 +917,7 @@ btree_bulk_finish(btree_t *tree) ASSERT3U(l_hdr->bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); -#ifdef ZFS_DEBUG btree_poison_node(tree, l_hdr); -#endif } /* @@ -947,7 +939,7 @@ btree_bulk_finish(btree_t *tree) /* * Because the smallest number of nodes we can move when * splitting is 2, we never need to worry about not having a - * left sibling. + * left sibling (a sibling is a neighbor with the same parent). */ uint64_t parent_idx = btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); @@ -995,9 +987,8 @@ btree_bulk_finish(btree_t *tree) ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); -#ifdef ZFS_DEBUG btree_poison_node(tree, &l_neighbor->btc_hdr); -#endif + for (int i = 0; i <= hdr->bth_count; i++) cur->btc_children[i]->bth_parent = cur; } @@ -1043,9 +1034,8 @@ btree_insert(btree_t *tree, const void *value, const btree_index_t *where) hdr->bth_parent = NULL; hdr->bth_core = B_FALSE; hdr->bth_count = 0; -#ifdef ZFS_DEBUG btree_poison_node(tree, hdr); -#endif + btree_insert_into_leaf(tree, leaf, value, 0); tree->bt_bulk = leaf; } else if (!where->bti_node->bth_core) { @@ -1104,7 +1094,6 @@ btree_first(btree_t *tree, btree_index_t *where) return (NULL); } return (btree_first_helper(tree->bt_root, where)); - } /* @@ -1248,11 +1237,11 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) /* * When finding the previous element of an element in a leaf, * there are two cases. If the element isn't the first one in - * the leaf, in which case we just return the next element in + * the leaf, in which case we just return the previous element in * the leaf. Otherwise, we need to traverse up our parents * until we find one where our previous ancestor isn't the - * first child. Once we do, the next element is the separator - * before our previous ancestor. + * first child. Once we do, the previous element is the separator + * after our previous ancestor. */ btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; if (offset != 0) { @@ -1286,7 +1275,7 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) /* * The previous element from one in a core node is the last element in - * the subtree just to the lefet of the separator. + * the subtree just to the left of the separator. */ ASSERT(idx->bti_node->bth_core); btree_core_t *node = (btree_core_t *)idx->bti_node; @@ -1297,7 +1286,7 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) /* * Get the value at the provided index in the tree. * - * Note that the value returned from this function can be mutataed, but only + * Note that the value returned from this function can be mutated, but only * if it will not change the ordering of the element with respect to any other * elements that could be in the tree. 
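 * For example (an illustrative sketch; my_elem_t and its fields are
 * hypothetical, not part of the patch), given a pointer returned from this
 * function:
 *
 *	my_elem_t *e = ...;	/* value at the given index */
 *	e->me_refcount++;	/* safe: the comparator ignores this field */
 *	e->me_key = 7;		/* unsafe: this field determines sort order */
 *
 * The refcount update is fine; rewriting the key would leave the element out
 * of order with respect to its neighbors.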
*/ @@ -1382,16 +1371,14 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) */ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, BSS_PARALLELOGRAM); -#ifdef ZFS_DEBUG btree_poison_node_at(tree, hdr, hdr->bth_count); -#endif return; } ASSERT3U(hdr->bth_count, ==, min_count - 1); /* - * Now we try to steal a node from a neighbor. We check left, then + * Now we try to take a node from a neighbor. We check left, then * right. If the neighbor exists and has more than the minimum number * of elements, we move the separator betweeen us and them to our * node, move their closest element (last for left, first for right) @@ -1399,9 +1386,10 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * the way we need to collapse the gap made by idx, and (for our right * neighbor) the gap made by removing their first element and child. * - * Note: this logic currently doesn't support stealing from a neighbor - * that isn't a sibling. This isn't critical functionality, but may be - * worth implementing in the future for completeness' sake. + * Note: this logic currently doesn't support taking from a neighbor + * that isn't a sibling (i.e. a neighbor with a different + * parent). This isn't critical functionality, but may be worth + * implementing in the future for completeness' sake. */ btree_core_t *parent = hdr->bth_parent; uint64_t parent_idx = btree_find_parent_idx(tree, hdr); @@ -1409,7 +1397,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { - /* We can steal a node from the left neighbor. */ + /* We can take a node from the left neighbor. */ ASSERT(l_hdr->bth_core); btree_core_t *neighbor = (btree_core_t *)l_hdr; @@ -1428,27 +1416,25 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) bcopy(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. */ - btree_hdr_t **steal_child = neighbor->btc_children + + btree_hdr_t **take_child = neighbor->btc_children + l_hdr->bth_count; - bcopy(steal_child, node->btc_children, sizeof (*steal_child)); + bcopy(take_child, node->btc_children, sizeof (*take_child)); node->btc_children[0]->bth_parent = node; /* Move the last element of neighbor to the separator spot. */ - uint8_t *steal_elem = neighbor->btc_elems + + uint8_t *take_elem = neighbor->btc_elems + (l_hdr->bth_count - 1) * size; - bcopy(steal_elem, separator, size); + bcopy(take_elem, separator, size); l_hdr->bth_count--; hdr->bth_count++; -#ifdef ZFS_DEBUG btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); -#endif return; } btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { - /* We can steal a node from the right neighbor. */ + /* We can take a node from the right neighbor. */ ASSERT(r_hdr->bth_core); btree_core_t *neighbor = (btree_core_t *)r_hdr; @@ -1470,14 +1456,14 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Move the first child of neighbor to the last child spot in * node. 
*/ - btree_hdr_t **steal_child = neighbor->btc_children; - bcopy(steal_child, node->btc_children + (hdr->bth_count + 1), - sizeof (*steal_child)); + btree_hdr_t **take_child = neighbor->btc_children; + bcopy(take_child, node->btc_children + (hdr->bth_count + 1), + sizeof (*take_child)); node->btc_children[hdr->bth_count + 1]->bth_parent = node; /* Move the first element of neighbor to the separator spot. */ - uint8_t *steal_elem = neighbor->btc_elems; - bcopy(steal_elem, separator, size); + uint8_t *take_elem = neighbor->btc_elems; + bcopy(take_elem, separator, size); r_hdr->bth_count--; hdr->bth_count++; @@ -1487,9 +1473,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) */ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, BSS_TRAPEZOID); -#ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); -#endif return; } @@ -1668,17 +1652,15 @@ btree_remove_from(btree_t *tree, btree_index_t *where) btree_node_destroy(tree, &leaf->btl_hdr); } } -#ifdef ZFS_DEBUG if (tree->bt_root != NULL) btree_poison_node_at(tree, hdr, hdr->bth_count); -#endif btree_verify(tree); return; } ASSERT3U(hdr->bth_count, ==, min_count - 1); /* - * Now we try to steal a node from a sibling. We check left, then + * Now we try to take a node from a sibling. We check left, then * right. If they exist and have more than the minimum number of * elements, we move the separator betweeen us and them to our node * and move their closest element (last for left, first for right) to @@ -1686,7 +1668,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * idx, and (for our right neighbor) the gap made by removing their * first element. * - * Note: this logic currently doesn't support stealing from a neighbor + * Note: this logic currently doesn't support taking from a neighbor * that isn't a sibling. This isn't critical functionality, but may be * worth implementing in the future for completeness' sake. */ @@ -1696,7 +1678,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { - /* We can steal a node from the left neighbor. */ + /* We can take a node from the left neighbor. */ ASSERT(!l_hdr->bth_core); /* @@ -1706,20 +1688,19 @@ btree_remove_from(btree_t *tree, btree_index_t *where) bt_shift_leaf_right(tree, leaf, 0, idx); uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - uint8_t *steal_elem = ((btree_leaf_t *)l_hdr)->btl_elems + + uint8_t *take_elem = ((btree_leaf_t *)l_hdr)->btl_elems + (l_hdr->bth_count - 1) * size; /* Move the separator to our first spot. */ bcopy(separator, leaf->btl_elems, size); /* Move our neighbor's last element to the separator. */ - bcopy(steal_elem, separator, size); + bcopy(take_elem, separator, size); /* Update the bookkeeping. */ l_hdr->bth_count--; hdr->bth_count++; -#ifdef ZFS_DEBUG btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); -#endif + btree_verify(tree); return; } @@ -1727,7 +1708,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { - /* We can steal a node from the right neighbor. */ + /* We can take a node from the right neighbor. 
*/ ASSERT(!r_hdr->bth_core); btree_leaf_t *neighbor = (btree_leaf_t *)r_hdr; @@ -1739,12 +1720,12 @@ btree_remove_from(btree_t *tree, btree_index_t *where) bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); uint8_t *separator = parent->btc_elems + parent_idx * size; - uint8_t *steal_elem = ((btree_leaf_t *)r_hdr)->btl_elems; + uint8_t *take_elem = ((btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ bcopy(separator, leaf->btl_elems + hdr->bth_count * size, size); /* Move our neighbor's first element to the separator. */ - bcopy(steal_elem, separator, size); + bcopy(take_elem, separator, size); /* Update the bookkeeping. */ r_hdr->bth_count--; @@ -1755,9 +1736,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * stolen element. */ bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); -#ifdef ZFS_DEBUG btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); -#endif btree_verify(tree); return; } From 157c510d7c3c302d65e58af3c94157756b5921cf Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 6 Sep 2019 10:05:23 -0700 Subject: [PATCH 13/18] Prefix btree to avoid build issues when building with linux kernel Signed-off-by: Paul Dagnelie --- cmd/zdb/zdb.c | 8 +- include/sys/btree.h | 102 +++--- include/sys/metaslab_impl.h | 4 +- include/sys/range_tree.h | 4 +- module/zfs/arc.c | 2 +- module/zfs/btree.c | 596 +++++++++++++++++------------------ module/zfs/dsl_scan.c | 10 +- module/zfs/metaslab.c | 80 ++--- module/zfs/range_tree.c | 114 +++---- module/zfs/spa_misc.c | 4 +- module/zfs/space_map.c | 12 +- module/zfs/space_reftree.c | 6 +- module/zfs/vdev_initialize.c | 14 +- module/zfs/vdev_removal.c | 14 +- module/zfs/vdev_trim.c | 16 +- 15 files changed, 493 insertions(+), 493 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 3bd8054322df..0f09ec70a994 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -103,7 +103,7 @@ extern uint64_t zfs_arc_max, zfs_arc_meta_limit; extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern int zfs_reconstruct_indirect_combinations_max; -extern int btree_verify_intensity; +extern int zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; @@ -950,7 +950,7 @@ dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; range_tree_t *rt = msp->ms_allocatable; - btree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ @@ -959,7 +959,7 @@ dump_metaslab_stats(metaslab_t *msp) zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", - "segments", btree_numnodes(t), "maxsize", maxbuf, + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); @@ -6650,7 +6650,7 @@ main(int argc, char **argv) * the arg parsing section so that the user can override this value if * they choose. 
*/ - btree_verify_intensity = 3; + zfs_btree_verify_intensity = 3; while ((c = getopt(argc, argv, "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XY")) != -1) { diff --git a/include/sys/btree.h b/include/sys/btree.h index e9461ddefdbc..4298496eb597 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -39,7 +39,7 @@ extern "C" { * The major drawback of the B-Tree is that any returned elements or indexes * are only valid until a side-effectful operation occurs, since these can * result in reallocation or relocation of data. Side effectful operations are - * defined as insertion, removal, and btree_destroy_nodes. + * defined as insertion, removal, and zfs_btree_destroy_nodes. * * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core * nodes have an array of children pointing to other nodes, and an array of @@ -68,54 +68,54 @@ extern "C" { #define BTREE_CORE_ELEMS 128 #define BTREE_LEAF_SIZE 4096 -extern kmem_cache_t *btree_leaf_cache; +extern kmem_cache_t *zfs_btree_leaf_cache; -typedef struct btree_hdr { - struct btree_core *bth_parent; +typedef struct zfs_btree_hdr { + struct zfs_btree_core *bth_parent; boolean_t bth_core; /* * For both leaf and core nodes, represents the number of elements in * the node. For core nodes, they will have bth_count + 1 children. */ uint32_t bth_count; -} btree_hdr_t; +} zfs_btree_hdr_t; -typedef struct btree_core { - btree_hdr_t btc_hdr; - btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; +typedef struct zfs_btree_core { + zfs_btree_hdr_t btc_hdr; + zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; uint8_t btc_elems[]; -} btree_core_t; +} zfs_btree_core_t; -typedef struct btree_leaf { - btree_hdr_t btl_hdr; +typedef struct zfs_btree_leaf { + zfs_btree_hdr_t btl_hdr; uint8_t btl_elems[]; -} btree_leaf_t; +} zfs_btree_leaf_t; -typedef struct btree_index { - btree_hdr_t *bti_node; +typedef struct zfs_btree_index { + zfs_btree_hdr_t *bti_node; uint64_t bti_offset; /* * True if the location is before the list offset, false if it's at * the listed offset. */ boolean_t bti_before; -} btree_index_t; +} zfs_btree_index_t; typedef struct btree { - btree_hdr_t *bt_root; - int64_t bt_height; - size_t bt_elem_size; - uint64_t bt_num_elems; - uint64_t bt_num_nodes; - btree_leaf_t *bt_bulk; // non-null if bulk loading + zfs_btree_hdr_t *bt_root; + int64_t bt_height; + size_t bt_elem_size; + uint64_t bt_num_elems; + uint64_t bt_num_nodes; + zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading int (*bt_compar) (const void *, const void *); -} btree_t; +} zfs_btree_t; /* * Allocate and deallocate caches for btree nodes. */ -void btree_init(void); -void btree_fini(void); +void zfs_btree_init(void); +void zfs_btree_fini(void); /* * Initialize an B-Tree. Arguments are: @@ -125,66 +125,66 @@ void btree_fini(void); * -1 for <, 0 for ==, and +1 for > * size - the value of sizeof(struct my_type) */ -void btree_create(btree_t *, int (*) (const void *, const void *), size_t); +void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), size_t); /* * Find a node with a matching value in the tree. Returns the matching node * found. If not found, it returns NULL and then if "where" is not NULL it sets - * "where" for use with btree_insert() or btree_nearest(). + * "where" for use with zfs_btree_insert() or zfs_btree_nearest(). 
* * node - node that has the value being looked for - * where - position for use with btree_nearest() or btree_insert(), may be NULL + * where - position for use with zfs_btree_nearest() or zfs_btree_insert(), may be NULL */ -void *btree_find(btree_t *, const void *, btree_index_t *); +void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); /* * Insert a node into the tree. * * node - the node to insert - * where - position as returned from btree_find() + * where - position as returned from zfs_btree_find() */ -void btree_insert(btree_t *, const void *, const btree_index_t *); +void zfs_btree_insert(zfs_btree_t *, const void *, const zfs_btree_index_t *); /* * Return the first or last valued node in the tree. Will return NULL * if the tree is empty. */ -void *btree_first(btree_t *, btree_index_t *); -void *btree_last(btree_t *, btree_index_t *); +void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *); +void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); /* * Return the next or previous valued node in the tree. */ -void *btree_next(btree_t *, const btree_index_t *, btree_index_t *); -void *btree_prev(btree_t *, const btree_index_t *, btree_index_t *); +void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, zfs_btree_index_t *); +void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, zfs_btree_index_t *); /* * Get a value from a tree and an index. */ -void *btree_get(btree_t *, btree_index_t *); +void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *); /* * Add a single value to the tree. The value must not compare equal to any * other node already in the tree. */ -void btree_add(btree_t *, const void *); +void zfs_btree_add(zfs_btree_t *, const void *); /* * Remove a single value from the tree. The value must be in the tree. The * pointer passed in may be a pointer into a tree-controlled buffer, but it * need not be. */ -void btree_remove(btree_t *, const void *); +void zfs_btree_remove(zfs_btree_t *, const void *); /* * Remove the value at the given location from the tree. */ -void btree_remove_from(btree_t *, btree_index_t *); +void zfs_btree_remove_from(zfs_btree_t *, zfs_btree_index_t *); /* * Return the number of nodes in the tree */ -ulong_t btree_numnodes(btree_t *); +ulong_t zfs_btree_numnodes(zfs_btree_t *); /* * Used to destroy any remaining nodes in a tree. The cookie argument should @@ -192,39 +192,39 @@ ulong_t btree_numnodes(btree_t *); * removed from the tree and may be free()'d. Returns NULL when the tree is * empty. * - * Once you call btree_destroy_nodes(), you can only continuing calling it and - * finally btree_destroy(). No other B-Tree routines will be valid. + * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it and + * finally zfs_btree_destroy(). No other B-Tree routines will be valid. * - * cookie - an index used to save state between calls to btree_destroy_nodes() + * cookie - an index used to save state between calls to zfs_btree_destroy_nodes() * * EXAMPLE: - * btree_t *tree; + * zfs_btree_t *tree; * struct my_data *node; - * btree_index_t *cookie; + * zfs_btree_index_t *cookie; * * cookie = NULL; - * while ((node = btree_destroy_nodes(tree, &cookie)) != NULL) + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) * data_destroy(node); - * btree_destroy(tree); + * zfs_btree_destroy(tree); */ -void *btree_destroy_nodes(btree_t *, btree_index_t **); +void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **); /* * Destroys all nodes in the tree quickly. 
This doesn't give the caller an * opportunity to iterate over each node and do its own cleanup; for that, use - * btree_destroy_nodes(). + * zfs_btree_destroy_nodes(). */ -void btree_clear(btree_t *); +void zfs_btree_clear(zfs_btree_t *); /* * Final destroy of an B-Tree. Arguments are: * * tree - the empty tree to destroy */ -void btree_destroy(btree_t *tree); +void zfs_btree_destroy(zfs_btree_t *tree); /* Runs a variety of self-checks on the btree to verify integrity. */ -void btree_verify(btree_t *tree); +void zfs_btree_verify(zfs_btree_t *tree); #ifdef __cplusplus } diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index f6a7df004092..d140f741d854 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -509,8 +509,8 @@ struct metaslab { * only difference is that the ms_allocatable_by_size is ordered by * segment sizes. */ - btree_t ms_allocatable_by_size; - btree_t ms_unflushed_frees_by_size; + zfs_btree_t ms_allocatable_by_size; + zfs_btree_t ms_unflushed_frees_by_size; uint64_t ms_lbas[MAX_LBAS]; metaslab_group_t *ms_group; /* metaslab group */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 42a017953288..069128a3d44f 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -53,7 +53,7 @@ typedef enum range_seg_type { * must provide external locking if required. */ typedef struct range_tree { - btree_t rt_root; /* offset-ordered segment b-tree */ + zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ range_seg_type_t rt_type; /* type of range_seg_t in use */ /* @@ -280,7 +280,7 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, - int (*btree_compare) (const void *, const void *), uint64_t gap); + int (*zfs_btree_compare) (const void *, const void *), uint64_t gap); range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); void range_tree_destroy(range_tree_t *rt); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8ba9eafbc8e1..c1ad8785df69 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4795,7 +4795,7 @@ arc_kmem_reap_soon(void) kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); - kmem_cache_reap_now(btree_leaf_cache); + kmem_cache_reap_now(zfs_btree_leaf_cache); if (zio_arena != NULL) { /* diff --git a/module/zfs/btree.c b/module/zfs/btree.c index c25c71c646da..b16055589e85 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -20,10 +20,10 @@ #include #include -kmem_cache_t *btree_leaf_cache; +kmem_cache_t *zfs_btree_leaf_cache; /* - * Control the extent of the verification that occurs when btree_verify is + * Control the extent of the verification that occurs when zfs_btree_verify is * called. Primarily used for debugging when extending the btree logic and * functionality. As the intensity is increased, new verification steps are * added. These steps are cumulative; intensity = 3 includes the intensity = 1 @@ -54,9 +54,9 @@ kmem_cache_t *btree_leaf_cache; * importance of the constant factors cannot be denied). 
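A minimal usage sketch of the renamed interface, based only on the prototypes shown in the header above; the uint64_t element type and comparator are illustrative, and the snippet assumes the in-tree headers rather than being a drop-in test:

#include <sys/btree.h>

static int
u64_compare(const void *va, const void *vb)
{
	const uint64_t a = *(const uint64_t *)va;
	const uint64_t b = *(const uint64_t *)vb;

	if (a < b)
		return (-1);
	return (a > b);
}

static void
example(void)
{
	zfs_btree_t tree;
	zfs_btree_index_t where;
	uint64_t val = 42, sum = 0;

	zfs_btree_create(&tree, u64_compare, sizeof (uint64_t));

	/* Insert a value only if it is not already present. */
	if (zfs_btree_find(&tree, &val, &where) == NULL)
		zfs_btree_insert(&tree, &val, &where);

	/* In-order walk; indexes stay valid only until the next insert/remove. */
	for (uint64_t *cur = zfs_btree_first(&tree, &where); cur != NULL;
	    cur = zfs_btree_next(&tree, &where, &where))
		sum += *cur;
	(void) sum;

	/* Teardown without per-element cleanup. */
	zfs_btree_clear(&tree);
	zfs_btree_destroy(&tree);
}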
*/ #ifdef ZFS_DEBUG -int btree_verify_intensity = 5; +int zfs_btree_verify_intensity = 5; #else -int btree_verify_intensity = 0; +int zfs_btree_verify_intensity = 0; #endif #ifdef _ILP32 @@ -66,20 +66,20 @@ int btree_verify_intensity = 0; #endif static void -btree_poison_node(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (!hdr->bth_core) { - btree_leaf_t *leaf = (btree_leaf_t *)hdr; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, - BTREE_LEAF_SIZE - sizeof (btree_hdr_t) - hdr->bth_count * + BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - hdr->bth_count * size); } else { - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { node->btc_children[i] = - (btree_hdr_t *)BTREE_POISON; + (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); @@ -88,37 +88,37 @@ btree_poison_node(btree_t *tree, btree_hdr_t *hdr) } static inline void -btree_poison_node_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) +zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint64_t offset) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; ASSERT3U(offset, >=, hdr->bth_count); if (!hdr->bth_core) { - btree_leaf_t *leaf = (btree_leaf_t *)hdr; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems + offset * size, 0x0f, size); } else { - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; node->btc_children[offset + 1] = - (btree_hdr_t *)BTREE_POISON; + (zfs_btree_hdr_t *)BTREE_POISON; (void) memset(node->btc_elems + offset * size, 0x0f, size); } #endif } static inline void -btree_verify_poison_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) +zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint64_t offset) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; uint8_t eval = 0x0f; if (hdr->bth_core) { - btree_core_t *node = (btree_core_t *)hdr; - btree_hdr_t *cval = (btree_hdr_t *)BTREE_POISON; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; VERIFY3P(node->btc_children[offset + 1], ==, cval); for (int i = 0; i < size; i++) VERIFY3U(node->btc_elems[offset * size + i], ==, eval); } else { - btree_leaf_t *leaf = (btree_leaf_t *)hdr; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (int i = 0; i < size; i++) VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval); } @@ -126,30 +126,30 @@ btree_verify_poison_at(btree_t *tree, btree_hdr_t *hdr, uint64_t offset) } void -btree_init(void) +zfs_btree_init(void) { - btree_leaf_cache = kmem_cache_create("btree_leaf_cache", + zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void -btree_fini(void) +zfs_btree_fini(void) { - kmem_cache_destroy(btree_leaf_cache); + kmem_cache_destroy(zfs_btree_leaf_cache); } void -btree_create(btree_t *tree, int (*compar) (const void *, const void *), +zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), size_t size) { /* * We need a minimmum of 4 elements so that when we split a node we * always have at least two elements in each node. 
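The create-time assert above bounds the element size so that a full leaf always splits into two nodes of at least two elements each. A back-of-the-envelope sketch of the resulting leaf capacity; the 16-byte header size is an assumption (one pointer, a boolean_t and a uint32_t on LP64), and the real value is sizeof (zfs_btree_hdr_t):

#include <stdio.h>
#include <stddef.h>

#define	EX_BTREE_LEAF_SIZE	4096
#define	EX_HDR_SIZE		16	/* assumed, see note above */

int
main(void)
{
	size_t elem_size = 32;	/* illustrative element size */
	size_t capacity = (EX_BTREE_LEAF_SIZE - EX_HDR_SIZE) / elem_size;

	capacity &= ~(size_t)1;	/* P2ALIGN(..., 2): force an even count */
	printf("leaf holds %zu elements of %zu bytes\n", capacity, elem_size);

	/*
	 * The assert requires elem_size <= (leaf size - header) / 4, i.e.
	 * capacity >= 4, so a full leaf can always split into two halves of
	 * at least two elements each.
	 */
	return (0);
}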
This simplifies the - * logic in btree_bulk_finish, since it means the last leaf will + * logic in zfs_btree_bulk_finish, since it means the last leaf will * always have a left sibling to share with (unless it's the root). */ - ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / 4); + ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); bzero(tree, sizeof (*tree)); tree->bt_compar = compar; @@ -162,8 +162,8 @@ btree_create(btree_t *tree, int (*compar) (const void *, const void *), * Find value in the array of elements provided. Uses a simple binary search. */ static void * -btree_find_in_buf(btree_t *tree, uint8_t *buf, uint64_t nelems, - const void *value, btree_index_t *where) +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, + const void *value, zfs_btree_index_t *where) { uint64_t max = nelems; uint64_t min = 0; @@ -193,7 +193,7 @@ btree_find_in_buf(btree_t *tree, uint8_t *buf, uint64_t nelems, * membership test or if the btree is being used as a map. */ void * -btree_find(btree_t *tree, const void *value, btree_index_t *where) +zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) { if (tree->bt_height == -1) { if (where != NULL) { @@ -210,9 +210,9 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) * because for most workloads the vast majority of finds in * bulk-insert mode are to insert new elements. */ - btree_index_t idx; + zfs_btree_index_t idx; if (tree->bt_bulk != NULL) { - btree_leaf_t *last_leaf = tree->bt_bulk; + zfs_btree_leaf_t *last_leaf = tree->bt_bulk; int compar = tree->bt_compar(last_leaf->btl_elems + ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), value); @@ -222,7 +222,7 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) * element, it's not in the tree. */ if (where != NULL) { - where->bti_node = (btree_hdr_t *)last_leaf; + where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = last_leaf->btl_hdr.bth_count; where->bti_before = B_TRUE; @@ -230,7 +230,7 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) return (NULL); } else if (compar == 0) { if (where != NULL) { - where->bti_node = (btree_hdr_t *)last_leaf; + where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = last_leaf->btl_hdr.bth_count - 1; where->bti_before = B_FALSE; @@ -245,18 +245,18 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = btree_find_in_buf(tree, last_leaf->btl_elems, + void *d = zfs_btree_find_in_buf(tree, last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { - idx.bti_node = (btree_hdr_t *)last_leaf; + idx.bti_node = (zfs_btree_hdr_t *)last_leaf; *where = idx; } return (d); } } - btree_core_t *node = NULL; + zfs_btree_core_t *node = NULL; uint64_t child = 0; uint64_t depth = 0; @@ -264,15 +264,15 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) * Iterate down the tree, finding which child the value should be in * by comparing with the separators. 
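The per-node search above is a bounded binary search over a packed array of fixed-size elements; on a miss it reports the offset the value would occupy, flagged as "before" that slot. A standalone sketch of that contract (the ex_* names are illustrative, not the tree's own types):

#include <stdint.h>
#include <stddef.h>

typedef struct ex_index {
	uint64_t	offset;
	int		before;	/* nonzero: value would sit before offset */
} ex_index_t;

/*
 * Binary search over nelems fixed-size elements packed into buf. Returns a
 * pointer to the matching element, or NULL with *where set to the insertion
 * point -- the same behavior the node-level search above provides.
 */
static void *
ex_find_in_buf(uint8_t *buf, size_t elem_size,
    int (*compar)(const void *, const void *),
    uint64_t nelems, const void *value, ex_index_t *where)
{
	uint64_t min = 0, max = nelems;

	while (min < max) {
		uint64_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * elem_size;
		int cmp = compar(cur, value);

		if (cmp == 0) {
			where->offset = idx;
			where->before = 0;
			return (cur);
		} else if (cmp < 0) {
			min = idx + 1;
		} else {
			max = idx;
		}
	}
	where->offset = max;
	where->before = 1;
	return (NULL);
}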
*/ - for (node = (btree_core_t *)tree->bt_root; depth < tree->bt_height; - node = (btree_core_t *)node->btc_children[child], depth++) { + for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; + node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = btree_find_in_buf(tree, node->btc_elems, + void *d = zfs_btree_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { if (where != NULL) { - idx.bti_node = (btree_hdr_t *)node; + idx.bti_node = (zfs_btree_hdr_t *)node; *where = idx; } return (d); @@ -285,13 +285,13 @@ btree_find(btree_t *tree, const void *value, btree_index_t *where) * The value is in this leaf, or it would be if it were in the * tree. Find its proper location and return it. */ - btree_leaf_t *leaf = (depth == 0 ? (btree_leaf_t *)tree->bt_root : - (btree_leaf_t *)node); - void *d = btree_find_in_buf(tree, leaf->btl_elems, + zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : + (zfs_btree_leaf_t *)node); + void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { - idx.bti_node = (btree_hdr_t *)leaf; + idx.bti_node = (zfs_btree_hdr_t *)leaf; *where = idx; } @@ -357,7 +357,7 @@ enum bt_shift_direction { * shift is determined by shape. The direction is determined by dir. */ static inline void -bt_shift_core(btree_t *tree, btree_core_t *node, uint64_t idx, +bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, uint64_t count, uint64_t off, enum bt_shift_shape shape, enum bt_shift_direction dir) { @@ -370,9 +370,9 @@ bt_shift_core(btree_t *tree, btree_core_t *node, uint64_t idx, uint64_t e_count = count; bcopy(e_start, e_out, e_count * size); - btree_hdr_t **c_start = node->btc_children + idx + + zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); - btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); + zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bcopy(c_start, c_out, c_count * sizeof (*c_start)); } @@ -384,7 +384,7 @@ bt_shift_core(btree_t *tree, btree_core_t *node, uint64_t idx, * false if it is a parallelogram. */ static inline void -bt_shift_core_left(btree_t *tree, btree_core_t *node, uint64_t idx, +bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, uint64_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); @@ -395,7 +395,7 @@ bt_shift_core_left(btree_t *tree, btree_core_t *node, uint64_t idx, * Starts with elements[idx] and children[idx] and one more child than element. */ static inline void -bt_shift_core_right(btree_t *tree, btree_core_t *node, uint64_t idx, +bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, uint64_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); @@ -407,7 +407,7 @@ bt_shift_core_right(btree_t *tree, btree_core_t *node, uint64_t idx, * is determined by left. 
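The core-node shift helpers above move a run of elements together with the matching run of child pointers. In the "trapezoid" shape the child run starts at the same index as the element run and is one entry longer; in the "parallelogram" shape it starts one slot to the right and has equal length. A hedged sketch on plain arrays rather than the real node layout:

#include <stdint.h>
#include <string.h>

typedef enum { EX_TRAPEZOID, EX_PARALLELOGRAM } ex_shape_t;

/*
 * Shift count elements starting at elems[idx], and the matching children,
 * right by one slot, mirroring the shape rules described above.
 */
static void
ex_shift_right(uint64_t *elems, void **children, uint64_t idx, uint64_t count,
    ex_shape_t shape)
{
	/* Elements always move as a run of 'count'. */
	memmove(elems + idx + 1, elems + idx, count * sizeof (*elems));

	/*
	 * Trapezoid: the child run starts at children[idx] and is one longer
	 * than the element run. Parallelogram: it starts at children[idx + 1]
	 * and has the same length as the element run.
	 */
	uint64_t c_idx = idx + (shape == EX_TRAPEZOID ? 0 : 1);
	uint64_t c_count = count + (shape == EX_TRAPEZOID ? 1 : 0);
	memmove(children + c_idx + 1, children + c_idx,
	    c_count * sizeof (*children));
}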
*/ static inline void -bt_shift_leaf(btree_t *tree, btree_leaf_t *node, uint64_t idx, +bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, uint64_t count, uint64_t off, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; @@ -420,14 +420,14 @@ bt_shift_leaf(btree_t *tree, btree_leaf_t *node, uint64_t idx, } static inline void -bt_shift_leaf_right(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, +bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, uint64_t count) { bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT); } static inline void -bt_shift_leaf_left(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, +bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx, uint64_t count) { bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT); @@ -438,8 +438,8 @@ bt_shift_leaf_left(btree_t *tree, btree_leaf_t *leaf, uint64_t idx, * parameter behaves the same as it does in the shift logic. */ static inline void -bt_transfer_core(btree_t *tree, btree_core_t *source, uint64_t sidx, - uint64_t count, btree_core_t *dest, uint64_t didx, +bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, + uint64_t count, zfs_btree_core_t *dest, uint64_t didx, enum bt_shift_shape shape) { size_t size = tree->bt_elem_size; @@ -456,8 +456,8 @@ bt_transfer_core(btree_t *tree, btree_core_t *source, uint64_t sidx, } static inline void -bt_transfer_leaf(btree_t *tree, btree_leaf_t *source, uint64_t sidx, - uint64_t count, btree_leaf_t *dest, uint64_t didx) +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, + uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) { size_t size = tree->bt_elem_size; ASSERT(!source->btl_hdr.bth_core); @@ -472,16 +472,16 @@ bt_transfer_leaf(btree_t *tree, btree_leaf_t *source, uint64_t sidx, * put its location in where if non-null. */ static void * -btree_first_helper(btree_hdr_t *hdr, btree_index_t *where) +zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) { - btree_hdr_t *node; + zfs_btree_hdr_t *node; for (node = hdr; node->bth_core; node = - ((btree_core_t *)node)->btc_children[0]) + ((zfs_btree_core_t *)node)->btc_children[0]) ; ASSERT(!node->bth_core); - btree_leaf_t *leaf = (btree_leaf_t *)node; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = 0; @@ -495,13 +495,13 @@ btree_first_helper(btree_hdr_t *hdr, btree_index_t *where) * buf as the dividing element between the two. 
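The split paths that follow all share one idea: the overflowing node keeps its low half, a new right-hand node takes the high half, and the element between the two halves becomes the divider handed to the parent (the buf argument above). A toy illustration with illustrative counts, not the real keep_count/move_count arithmetic:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	int full[7] = { 10, 20, 30, 40, 50, 60, 70 };
	int left[3], right[3], divider;

	memcpy(left, full, 3 * sizeof (int));		/* 10 20 30 stay */
	divider = full[3];				/* 40 goes to the parent */
	memcpy(right, full + 4, 3 * sizeof (int));	/* 50 60 70 move right */

	printf("left: %d..%d  divider: %d  right: %d..%d\n",
	    left[0], left[2], divider, right[0], right[2]);
	return (0);
}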
*/ static void -btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, - btree_hdr_t *new_node, void *buf) +zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, + zfs_btree_hdr_t *new_node, void *buf) { ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); uint64_t size = tree->bt_elem_size; - btree_core_t *parent = old_node->bth_parent; - btree_hdr_t *par_hdr = &parent->btc_hdr; + zfs_btree_core_t *parent = old_node->bth_parent; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; /* * If this is the root node we were splitting, we create a new root @@ -510,9 +510,9 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, if (parent == NULL) { ASSERT3P(old_node, ==, tree->bt_root); tree->bt_num_nodes++; - btree_core_t *new_root = kmem_alloc(sizeof (btree_core_t) + + zfs_btree_core_t *new_root = kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * size, KM_SLEEP); - btree_hdr_t *new_root_hdr = &new_root->btc_hdr; + zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; new_root_hdr->bth_parent = NULL; new_root_hdr->bth_core = B_TRUE; new_root_hdr->bth_count = 1; @@ -524,7 +524,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, tree->bt_height++; tree->bt_root = new_root_hdr; - btree_poison_node(tree, new_root_hdr); + zfs_btree_poison_node(tree, new_root_hdr); return; } @@ -532,9 +532,9 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Since we have the new separator, binary search for where to put * new_node. */ - btree_index_t idx; + zfs_btree_index_t idx; ASSERT(par_hdr->bth_core); - VERIFY3P(btree_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint64_t offset = idx.bti_offset; @@ -546,8 +546,8 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * and return. 
*/ if (par_hdr->bth_count != BTREE_CORE_ELEMS) { - if (btree_verify_intensity >= 5) { - btree_verify_poison_at(tree, par_hdr, + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, par_hdr, par_hdr->bth_count); } @@ -583,13 +583,13 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, uint64_t keep_count = BTREE_CORE_ELEMS - move_count; ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); tree->bt_num_nodes++; - btree_core_t *new_parent = kmem_alloc(sizeof (btree_core_t) + + zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * size, KM_SLEEP); - btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; + zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; new_par_hdr->bth_parent = par_hdr->bth_parent; new_par_hdr->bth_core = B_TRUE; new_par_hdr->bth_count = move_count; - btree_poison_node(tree, new_par_hdr); + zfs_btree_poison_node(tree, new_par_hdr); par_hdr->bth_count = keep_count; @@ -621,7 +621,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, uint8_t *e_start = parent->btc_elems + offset * size; bcopy(buf, e_start, size); - btree_hdr_t **c_start = parent->btc_children + (offset + 1); + zfs_btree_hdr_t **c_start = parent->btc_children + (offset + 1); *c_start = new_node; ASSERT3P(*(c_start - 1), ==, old_node); @@ -648,7 +648,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, uint8_t *e_out = new_parent->btc_elems + e_count * size; bcopy(buf, e_out, size); - btree_hdr_t **c_out = new_parent->btc_children + e_count + 1; + zfs_btree_hdr_t **c_out = new_parent->btc_children + e_count + 1; *c_out = new_node; ASSERT3P(*(c_out - 1), ==, old_node); @@ -673,7 +673,7 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, new_parent, 0, BSS_PARALLELOGRAM); new_parent->btc_children[0] = new_node; } - btree_poison_node(tree, par_hdr); + zfs_btree_poison_node(tree, par_hdr); for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) new_parent->btc_children[i]->bth_parent = new_parent; @@ -685,19 +685,19 @@ btree_insert_into_parent(btree_t *tree, btree_hdr_t *old_node, * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting. */ - btree_insert_into_parent(tree, &parent->btc_hdr, &new_parent->btc_hdr, + zfs_btree_insert_into_parent(tree, &parent->btc_hdr, &new_parent->btc_hdr, buf); } /* Helper function for inserting a new value into leaf at the given index. */ static void -btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, +zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, const void *value, uint64_t idx) { uint64_t size = tree->bt_elem_size; uint8_t *start = leaf->btl_elems + (idx * size); uint64_t count = leaf->btl_hdr.bth_count - idx; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / size, 2); /* @@ -705,8 +705,8 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, * value. 
*/ if (leaf->btl_hdr.bth_count != capacity) { - if (btree_verify_intensity >= 5) { - btree_verify_poison_at(tree, &leaf->btl_hdr, + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, leaf->btl_hdr.bth_count); } leaf->btl_hdr.bth_count++; @@ -734,12 +734,12 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, uint64_t keep_count = capacity - move_count; ASSERT3U(capacity - move_count, >=, 2); tree->bt_num_nodes++; - btree_leaf_t *new_leaf = kmem_cache_alloc(btree_leaf_cache, KM_SLEEP); - btree_hdr_t *new_hdr = &new_leaf->btl_hdr; + zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP); + zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; new_hdr->bth_parent = leaf->btl_hdr.bth_parent; new_hdr->bth_core = B_FALSE; new_hdr->bth_count = move_count; - btree_poison_node(tree, new_hdr); + zfs_btree_poison_node(tree, new_hdr); leaf->btl_hdr.bth_count = keep_count; @@ -795,28 +795,28 @@ btree_insert_into_leaf(btree_t *tree, btree_leaf_t *leaf, const void *value, 0); } - btree_poison_node(tree, &leaf->btl_hdr); + zfs_btree_poison_node(tree, &leaf->btl_hdr); /* * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting, bur only of core nodes. */ - btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, buf); + zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, buf); kmem_free(buf, size); } static uint64_t -btree_find_parent_idx(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { void *buf; if (hdr->bth_core) { - buf = ((btree_core_t *)hdr)->btc_elems; + buf = ((zfs_btree_core_t *)hdr)->btc_elems; } else { - buf = ((btree_leaf_t *)hdr)->btl_elems; + buf = ((zfs_btree_leaf_t *)hdr)->btl_elems; } - btree_index_t idx; - btree_core_t *parent = hdr->bth_parent; - VERIFY3P(btree_find_in_buf(tree, parent->btc_elems, + zfs_btree_index_t idx; + zfs_btree_core_t *parent = hdr->bth_parent; + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); @@ -834,15 +834,15 @@ btree_find_parent_idx(btree_t *tree, btree_hdr_t *hdr) * the root. */ static void -btree_bulk_finish(btree_t *tree) +zfs_btree_bulk_finish(zfs_btree_t *tree) { ASSERT3P(tree->bt_bulk, !=, NULL); ASSERT3P(tree->bt_root, !=, NULL); - btree_leaf_t *leaf = tree->bt_bulk; - btree_hdr_t *hdr = &leaf->btl_hdr; - btree_core_t *parent = hdr->bth_parent; + zfs_btree_leaf_t *leaf = tree->bt_bulk; + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + zfs_btree_core_t *parent = hdr->bth_parent; uint64_t size = tree->bt_elem_size; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / size, 2); /* @@ -858,31 +858,31 @@ btree_bulk_finish(btree_t *tree) if (hdr->bth_count < capacity / 2) { /* * First, find the left neighbor. The simplest way to do this - * is to call btree_prev twice; the first time finds some + * is to call zfs_btree_prev twice; the first time finds some * ancestor of this node, and the second time finds the left * neighbor. The ancestor found is the lowest common ancestor * of leaf and the neighbor. 
*/ - btree_index_t idx = { + zfs_btree_index_t idx = { .bti_node = hdr, .bti_offset = 0 }; - VERIFY3P(btree_prev(tree, &idx, &idx), !=, NULL); + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); ASSERT(idx.bti_node->bth_core); - btree_core_t *common = (btree_core_t *)idx.bti_node; + zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; uint64_t common_idx = idx.bti_offset; - VERIFY3P(btree_prev(tree, &idx, &idx), !=, NULL); + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); ASSERT(!idx.bti_node->bth_core); - btree_leaf_t *l_neighbor = (btree_leaf_t *)idx.bti_node; - btree_hdr_t *l_hdr = idx.bti_node; + zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; + zfs_btree_hdr_t *l_hdr = idx.bti_node; uint64_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, capacity / 2); - if (btree_verify_intensity >= 5) { + if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < move_count; i++) { - btree_verify_poison_at(tree, hdr, + zfs_btree_verify_poison_at(tree, hdr, leaf->btl_hdr.bth_count + i); } } @@ -917,7 +917,7 @@ btree_bulk_finish(btree_t *tree) ASSERT3U(l_hdr->bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); - btree_poison_node(tree, l_hdr); + zfs_btree_poison_node(tree, l_hdr); } /* @@ -926,8 +926,8 @@ btree_bulk_finish(btree_t *tree) */ capacity = BTREE_CORE_ELEMS; while (parent->btc_hdr.bth_parent != NULL) { - btree_core_t *cur = parent; - btree_hdr_t *hdr = &cur->btc_hdr; + zfs_btree_core_t *cur = parent; + zfs_btree_hdr_t *hdr = &cur->btc_hdr; parent = hdr->bth_parent; /* * If the invariant isn't violated, move on to the next @@ -941,17 +941,17 @@ btree_bulk_finish(btree_t *tree) * splitting is 2, we never need to worry about not having a * left sibling (a sibling is a neighbor with the same parent). */ - uint64_t parent_idx = btree_find_parent_idx(tree, hdr); + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); - btree_core_t *l_neighbor = (btree_core_t *)parent->btc_children[ + zfs_btree_core_t *l_neighbor = (zfs_btree_core_t *)parent->btc_children[ parent_idx - 1]; uint64_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, capacity / 2); - if (btree_verify_intensity >= 5) { + if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < move_count; i++) { - btree_verify_poison_at(tree, hdr, + zfs_btree_verify_poison_at(tree, hdr, leaf->btl_hdr.bth_count + i); } } @@ -987,7 +987,7 @@ btree_bulk_finish(btree_t *tree) ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); ASSERT3U(hdr->bth_count, >=, capacity / 2); - btree_poison_node(tree, &l_neighbor->btc_hdr); + zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); for (int i = 0; i <= hdr->bth_count; i++) cur->btc_children[i]->bth_parent = cur; @@ -1000,15 +1000,15 @@ btree_bulk_finish(btree_t *tree) * Insert value into tree at the location specified by where. */ void -btree_insert(btree_t *tree, const void *value, const btree_index_t *where) +zfs_btree_insert(zfs_btree_t *tree, const void *value, const zfs_btree_index_t *where) { - btree_index_t idx = {0}; + zfs_btree_index_t idx = {0}; /* If we're not inserting in the last leaf, end bulk insert mode. 
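Bulk-insert mode persists only while every new value lands in the rightmost leaf, so a caller that appends keys in sorted order stays on the cheap path. A hedged sketch, assuming a tree created over uint64_t elements with a matching comparator:

#include <sys/btree.h>

static void
ex_bulk_load(zfs_btree_t *tree, const uint64_t *sorted, uint64_t n)
{
	/* Sorted appends all hit the rightmost leaf, keeping bulk mode on. */
	for (uint64_t i = 0; i < n; i++)
		zfs_btree_add(tree, &sorted[i]);

	/*
	 * The first out-of-order insert, or any removal, ends bulk mode by
	 * rebalancing the rightmost leaf; zfs_btree_bulk_finish() is internal
	 * to the tree and is not called by consumers directly.
	 */
}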
*/ if (tree->bt_bulk != NULL) { if (where->bti_node != &tree->bt_bulk->btl_hdr) { - btree_bulk_finish(tree); - VERIFY3P(btree_find(tree, value, &idx), ==, NULL); + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); where = &idx; } } @@ -1025,25 +1025,25 @@ btree_insert(btree_t *tree, const void *value, const btree_index_t *where) ASSERT0(where->bti_offset); tree->bt_num_nodes++; - btree_leaf_t *leaf = kmem_cache_alloc(btree_leaf_cache, + zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP); tree->bt_root = &leaf->btl_hdr; tree->bt_height++; - btree_hdr_t *hdr = &leaf->btl_hdr; + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; hdr->bth_parent = NULL; hdr->bth_core = B_FALSE; hdr->bth_count = 0; - btree_poison_node(tree, hdr); + zfs_btree_poison_node(tree, hdr); - btree_insert_into_leaf(tree, leaf, value, 0); + zfs_btree_insert_into_leaf(tree, leaf, value, 0); tree->bt_bulk = leaf; } else if (!where->bti_node->bth_core) { /* * If we're inserting into a leaf, go directly to the helper * function. */ - btree_insert_into_leaf(tree, (btree_leaf_t *)where->bti_node, + zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)where->bti_node, value, where->bti_offset); } else { /* @@ -1054,14 +1054,14 @@ btree_insert(btree_t *tree, const void *value, const btree_index_t *where) * separator into the first slot in the subtree to the right. */ ASSERT(where->bti_node->bth_core); - btree_core_t *node = (btree_core_t *)where->bti_node; + zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; /* * We can ignore bti_before, because either way the value * should end up in bti_offset. */ uint64_t off = where->bti_offset; - btree_hdr_t *subtree = node->btc_children[off + 1]; + zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; size_t size = tree->bt_elem_size; uint8_t *buf = kmem_alloc(size, KM_SLEEP); bcopy(node->btc_elems + off * size, buf, size); @@ -1071,15 +1071,15 @@ btree_insert(btree_t *tree, const void *value, const btree_index_t *where) * Find the first slot in the subtree to the right, insert * there. */ - btree_index_t new_idx; - VERIFY3P(btree_first_helper(subtree, &new_idx), !=, NULL); + zfs_btree_index_t new_idx; + VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); ASSERT0(new_idx.bti_offset); ASSERT(!new_idx.bti_node->bth_core); - btree_insert_into_leaf(tree, (btree_leaf_t *)new_idx.bti_node, + zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); kmem_free(buf, size); } - btree_verify(tree); + zfs_btree_verify(tree); } /* @@ -1087,13 +1087,13 @@ btree_insert(btree_t *tree, const void *value, const btree_index_t *where) * non-null. */ void * -btree_first(btree_t *tree, btree_index_t *where) +zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) { if (tree->bt_height == -1) { ASSERT0(tree->bt_num_elems); return (NULL); } - return (btree_first_helper(tree->bt_root, where)); + return (zfs_btree_first_helper(tree->bt_root, where)); } /* @@ -1101,15 +1101,15 @@ btree_first(btree_t *tree, btree_index_t *where) * put its location in where if non-null. 
*/ static void * -btree_last_helper(btree_t *btree, btree_hdr_t *hdr, btree_index_t *where) +zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) { - btree_hdr_t *node; + zfs_btree_hdr_t *node; for (node = hdr; node->bth_core; node = - ((btree_core_t *)node)->btc_children[node->bth_count]) + ((zfs_btree_core_t *)node)->btc_children[node->bth_count]) ; - btree_leaf_t *leaf = (btree_leaf_t *)node; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = node->bth_count - 1; @@ -1123,25 +1123,25 @@ btree_last_helper(btree_t *btree, btree_hdr_t *hdr, btree_index_t *where) * non-null. */ void * -btree_last(btree_t *tree, btree_index_t *where) +zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where) { if (tree->bt_height == -1) { ASSERT0(tree->bt_num_elems); return (NULL); } - return (btree_last_helper(tree, tree->bt_root, where)); + return (zfs_btree_last_helper(tree, tree->bt_root, where)); } /* * This function contains the logic to find the next node in the tree. A * helper function is used because there are multiple internal consumemrs of - * this logic. The done_func is used by btree_destroy_nodes to clean up each + * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each * node after we've finished with it. */ static void * -btree_next_helper(btree_t *tree, const btree_index_t *idx, - btree_index_t *out_idx, - void (*done_func)(btree_t *, btree_hdr_t *)) +zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx, + void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *)) { if (idx->bti_node == NULL) { ASSERT3S(tree->bt_height, ==, -1); @@ -1159,7 +1159,7 @@ btree_next_helper(btree_t *tree, const btree_index_t *idx, * of its parent. Once we do, the next element is the * separator after our ancestor in its parent. */ - btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; uint64_t new_off = offset + (idx->bti_before ? 0 : 1); if (leaf->btl_hdr.bth_count > new_off) { out_idx->bti_node = &leaf->btl_hdr; @@ -1168,12 +1168,12 @@ btree_next_helper(btree_t *tree, const btree_index_t *idx, return (leaf->btl_elems + new_off * tree->bt_elem_size); } - btree_hdr_t *prev = &leaf->btl_hdr; - for (btree_core_t *node = leaf->btl_hdr.bth_parent; + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { - btree_hdr_t *hdr = &node->btc_hdr; + zfs_btree_hdr_t *hdr = &node->btc_hdr; ASSERT(hdr->bth_core); - uint64_t i = btree_find_parent_idx(tree, prev); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); if (done_func != NULL) done_func(tree, prev); if (i == hdr->bth_count) { @@ -1196,7 +1196,7 @@ btree_next_helper(btree_t *tree, const btree_index_t *idx, /* If we were before an element in a core node, return that element. */ ASSERT(idx->bti_node->bth_core); - btree_core_t *node = (btree_core_t *)idx->bti_node; + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; if (idx->bti_before) { out_idx->bti_before = B_FALSE; return (node->btc_elems + offset * tree->bt_elem_size); @@ -1206,8 +1206,8 @@ btree_next_helper(btree_t *tree, const btree_index_t *idx, * The next element from one in a core node is the first element in * the subtree just to the right of the separator. 
*/ - btree_hdr_t *child = node->btc_children[offset + 1]; - return (btree_first_helper(child, out_idx)); + zfs_btree_hdr_t *child = node->btc_children[offset + 1]; + return (zfs_btree_first_helper(child, out_idx)); } /* @@ -1215,9 +1215,9 @@ btree_next_helper(btree_t *tree, const btree_index_t *idx, * passed for idx and out_idx. */ void * -btree_next(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) +zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx) { - return (btree_next_helper(tree, idx, out_idx, NULL)); + return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); } /* @@ -1225,7 +1225,7 @@ btree_next(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) * passed for idx and out_idx. */ void * -btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) +zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx) { if (idx->bti_node == NULL) { ASSERT3S(tree->bt_height, ==, -1); @@ -1243,7 +1243,7 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) * first child. Once we do, the previous element is the separator * after our previous ancestor. */ - btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; if (offset != 0) { out_idx->bti_node = &leaf->btl_hdr; out_idx->bti_offset = offset - 1; @@ -1251,12 +1251,12 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) return (leaf->btl_elems + (offset - 1) * tree->bt_elem_size); } - btree_hdr_t *prev = &leaf->btl_hdr; - for (btree_core_t *node = leaf->btl_hdr.bth_parent; + zfs_btree_hdr_t *prev = &leaf->btl_hdr; + for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent; node != NULL; node = node->btc_hdr.bth_parent) { - btree_hdr_t *hdr = &node->btc_hdr; + zfs_btree_hdr_t *hdr = &node->btc_hdr; ASSERT(hdr->bth_core); - uint64_t i = btree_find_parent_idx(tree, prev); + uint64_t i = zfs_btree_find_parent_idx(tree, prev); if (i == 0) { prev = hdr; continue; @@ -1278,9 +1278,9 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) * the subtree just to the left of the separator. */ ASSERT(idx->bti_node->bth_core); - btree_core_t *node = (btree_core_t *)idx->bti_node; - btree_hdr_t *child = node->btc_children[offset]; - return (btree_last_helper(tree, child, out_idx)); + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; + zfs_btree_hdr_t *child = node->btc_children[offset]; + return (zfs_btree_last_helper(tree, child, out_idx)); } /* @@ -1291,36 +1291,36 @@ btree_prev(btree_t *tree, const btree_index_t *idx, btree_index_t *out_idx) * elements that could be in the tree. */ void * -btree_get(btree_t *tree, btree_index_t *idx) +zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx) { ASSERT(!idx->bti_before); if (!idx->bti_node->bth_core) { - btree_leaf_t *leaf = (btree_leaf_t *)idx->bti_node; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size); } ASSERT(idx->bti_node->bth_core); - btree_core_t *node = (btree_core_t *)idx->bti_node; + zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node; return (node->btc_elems + idx->bti_offset * tree->bt_elem_size); } /* Add the given value to the tree. Must not already be in the tree. 
*/ void -btree_add(btree_t *tree, const void *node) +zfs_btree_add(zfs_btree_t *tree, const void *node) { - btree_index_t where = {0}; - VERIFY3P(btree_find(tree, node, &where), ==, NULL); - btree_insert(tree, node, &where); + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); + zfs_btree_insert(tree, node, &where); } /* Helper function to free a tree node. */ static void -btree_node_destroy(btree_t *tree, btree_hdr_t *node) +zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) { tree->bt_num_nodes--; if (!node->bth_core) { - kmem_cache_free(btree_leaf_cache, node); + kmem_cache_free(zfs_btree_leaf_cache, node); } else { - kmem_free(node, sizeof (btree_core_t) + + kmem_free(node, sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * tree->bt_elem_size); } } @@ -1331,11 +1331,11 @@ btree_node_destroy(btree_t *tree, btree_hdr_t *node) * cannot be accessed. */ static void -btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) +zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_hdr_t *rm_hdr) { size_t size = tree->bt_elem_size; uint64_t min_count = BTREE_CORE_ELEMS / 2; - btree_hdr_t *hdr = &node->btc_hdr; + zfs_btree_hdr_t *hdr = &node->btc_hdr; /* * If the node is the root node and rm_hdr is one of two children, * promote the other child to the root. @@ -1346,7 +1346,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) ASSERT3P(node->btc_children[1], ==, rm_hdr); tree->bt_root = node->btc_children[0]; node->btc_children[0]->bth_parent = NULL; - btree_node_destroy(tree, hdr); + zfs_btree_node_destroy(tree, hdr); tree->bt_height--; return; } @@ -1371,7 +1371,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) */ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, BSS_PARALLELOGRAM); - btree_poison_node_at(tree, hdr, hdr->bth_count); + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); return; } @@ -1391,15 +1391,15 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * parent). This isn't critical functionality, but may be worth * implementing in the future for completeness' sake. */ - btree_core_t *parent = hdr->bth_parent; - uint64_t parent_idx = btree_find_parent_idx(tree, hdr); + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); - btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ ASSERT(l_hdr->bth_core); - btree_core_t *neighbor = (btree_core_t *)l_hdr; + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; /* * Start by shifting the elements and children in the current @@ -1416,7 +1416,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) bcopy(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. 
*/ - btree_hdr_t **take_child = neighbor->btc_children + + zfs_btree_hdr_t **take_child = neighbor->btc_children + l_hdr->bth_count; bcopy(take_child, node->btc_children, sizeof (*take_child)); node->btc_children[0]->bth_parent = node; @@ -1427,16 +1427,16 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) bcopy(take_elem, separator, size); l_hdr->bth_count--; hdr->bth_count++; - btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); return; } - btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ ASSERT(r_hdr->bth_core); - btree_core_t *neighbor = (btree_core_t *)r_hdr; + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; /* * Shift elements in node left by one spot to overwrite rm_hdr @@ -1456,7 +1456,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * Move the first child of neighbor to the last child spot in * node. */ - btree_hdr_t **take_child = neighbor->btc_children; + zfs_btree_hdr_t **take_child = neighbor->btc_children; bcopy(take_child, node->btc_children + (hdr->bth_count + 1), sizeof (*take_child)); node->btc_children[hdr->bth_count + 1]->bth_parent = node; @@ -1473,7 +1473,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) */ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, BSS_TRAPEZOID); - btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); return; } @@ -1487,15 +1487,15 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) * right merging node and the separator. This may cause further * merging. */ - btree_hdr_t *new_rm_hdr; + zfs_btree_hdr_t *new_rm_hdr; if (l_hdr != NULL) { ASSERT(l_hdr->bth_core); - btree_core_t *left = (btree_core_t *)l_hdr; + zfs_btree_core_t *left = (zfs_btree_core_t *)l_hdr; - if (btree_verify_intensity >= 5) { + if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < hdr->bth_count + 1; i++) { - btree_verify_poison_at(tree, l_hdr, + zfs_btree_verify_poison_at(tree, l_hdr, l_hdr->bth_count + i); } } @@ -1512,7 +1512,7 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) left, l_hdr->bth_count + idx, BSS_PARALLELOGRAM); /* Reparent all our children to point to the left node. 
*/ - btree_hdr_t **new_start = left->btc_children + + zfs_btree_hdr_t **new_start = left->btc_children + l_hdr->bth_count + 1; for (int i = 0; i < hdr->bth_count + 1; i++) new_start[i]->bth_parent = left; @@ -1528,11 +1528,11 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) } else { ASSERT3P(r_hdr, !=, NULL); ASSERT(r_hdr->bth_core); - btree_core_t *right = (btree_core_t *)r_hdr; + zfs_btree_core_t *right = (zfs_btree_core_t *)r_hdr; - if (btree_verify_intensity >= 5) { + if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < r_hdr->bth_count; i++) { - btree_verify_poison_at(tree, hdr, + zfs_btree_verify_poison_at(tree, hdr, hdr->bth_count + i + 1); } } @@ -1574,18 +1574,18 @@ btree_remove_from_node(btree_t *tree, btree_core_t *node, btree_hdr_t *rm_hdr) } new_rm_hdr->bth_count = 0; - btree_node_destroy(tree, new_rm_hdr); - btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); + zfs_btree_remove_from_node(tree, parent, new_rm_hdr); } /* Remove the element at the specific location. */ void -btree_remove_from(btree_t *tree, btree_index_t *where) +zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) { size_t size = tree->bt_elem_size; - btree_hdr_t *hdr = where->bti_node; + zfs_btree_hdr_t *hdr = where->bti_node; uint64_t idx = where->bti_offset; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (btree_hdr_t)) / + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / size, 2); ASSERT(!where->bti_before); @@ -1596,11 +1596,11 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * we're planning to remove and find it again after * bulk_finish. */ - uint8_t *value = btree_get(tree, where); + uint8_t *value = zfs_btree_get(tree, where); uint8_t *tmp = kmem_alloc(size, KM_SLEEP); bcopy(value, tmp, size); - btree_bulk_finish(tree); - VERIFY3P(btree_find(tree, tmp, where), !=, NULL); + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); kmem_free(tmp, size); hdr = where->bti_node; idx = where->bti_offset; @@ -1614,9 +1614,9 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * downwards. */ if (hdr->bth_core) { - btree_core_t *node = (btree_core_t *)hdr; - btree_hdr_t *left_subtree = node->btc_children[idx]; - void *new_value = btree_last_helper(tree, left_subtree, where); + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; + void *new_value = zfs_btree_last_helper(tree, left_subtree, where); ASSERT3P(new_value, !=, NULL); bcopy(new_value, node->btc_elems + idx * size, size); @@ -1632,7 +1632,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * needed. */ ASSERT(!hdr->bth_core); - btree_leaf_t *leaf = (btree_leaf_t *)hdr; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; ASSERT3U(hdr->bth_count, >, 0); hdr->bth_count--; @@ -1649,12 +1649,12 @@ btree_remove_from(btree_t *tree, btree_index_t *where) if (hdr->bth_count == 0) { tree->bt_root = NULL; tree->bt_height--; - btree_node_destroy(tree, &leaf->btl_hdr); + zfs_btree_node_destroy(tree, &leaf->btl_hdr); } } if (tree->bt_root != NULL) - btree_poison_node_at(tree, hdr, hdr->bth_count); - btree_verify(tree); + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + zfs_btree_verify(tree); return; } ASSERT3U(hdr->bth_count, ==, min_count - 1); @@ -1672,10 +1672,10 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * that isn't a sibling. 
This isn't critical functionality, but may be * worth implementing in the future for completeness' sake. */ - btree_core_t *parent = hdr->bth_parent; - uint64_t parent_idx = btree_find_parent_idx(tree, hdr); + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); - btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ @@ -1688,7 +1688,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) bt_shift_leaf_right(tree, leaf, 0, idx); uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - uint8_t *take_elem = ((btree_leaf_t *)l_hdr)->btl_elems + + uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + (l_hdr->bth_count - 1) * size; /* Move the separator to our first spot. */ bcopy(separator, leaf->btl_elems, size); @@ -1699,18 +1699,18 @@ btree_remove_from(btree_t *tree, btree_index_t *where) /* Update the bookkeeping. */ l_hdr->bth_count--; hdr->bth_count++; - btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); - btree_verify(tree); + zfs_btree_verify(tree); return; } - btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ ASSERT(!r_hdr->bth_core); - btree_leaf_t *neighbor = (btree_leaf_t *)r_hdr; + zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; /* * Move our elements after the element being removed forwards @@ -1720,7 +1720,7 @@ btree_remove_from(btree_t *tree, btree_index_t *where) bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); uint8_t *separator = parent->btc_elems + parent_idx * size; - uint8_t *take_elem = ((btree_leaf_t *)r_hdr)->btl_elems; + uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ bcopy(separator, leaf->btl_elems + hdr->bth_count * size, size); @@ -1736,8 +1736,8 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * stolen element. */ bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count); - btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); - btree_verify(tree); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + zfs_btree_verify(tree); return; } @@ -1751,15 +1751,15 @@ btree_remove_from(btree_t *tree, btree_index_t *where) * right merging node and the separator. This may cause further * merging. 
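To make the merge concrete with purely illustrative keys: if the left sibling leaf holds {10, 20}, the separator between the two children in the parent is 30, and the node that just dropped below its minimum holds {40}, the merge leaves a single left node {10, 20, 30, 40}. The emptied right node is destroyed, and the separator 30 (together with the child pointer to the destroyed node) must then be removed from the parent, which is what can trigger another round of borrowing or merging one level up.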
*/ - btree_hdr_t *rm_hdr; + zfs_btree_hdr_t *rm_hdr; if (l_hdr != NULL) { ASSERT(!l_hdr->bth_core); - btree_leaf_t *left = (btree_leaf_t *)l_hdr; + zfs_btree_leaf_t *left = (zfs_btree_leaf_t *)l_hdr; - if (btree_verify_intensity >= 5) { + if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < hdr->bth_count + 1; i++) { - btree_verify_poison_at(tree, l_hdr, + zfs_btree_verify_poison_at(tree, l_hdr, l_hdr->bth_count + i); } } @@ -1785,13 +1785,13 @@ btree_remove_from(btree_t *tree, btree_index_t *where) } else { ASSERT3P(r_hdr, !=, NULL); ASSERT(!r_hdr->bth_core); - if (btree_verify_intensity >= 5) { + if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < r_hdr->bth_count; i++) { - btree_verify_poison_at(tree, hdr, + zfs_btree_verify_poison_at(tree, hdr, hdr->bth_count + i + 1); } } - btree_leaf_t *right = (btree_leaf_t *)r_hdr; + zfs_btree_leaf_t *right = (zfs_btree_leaf_t *)r_hdr; /* * Move our elements left to overwrite the element being @@ -1814,24 +1814,24 @@ btree_remove_from(btree_t *tree, btree_index_t *where) rm_hdr = r_hdr; } rm_hdr->bth_count = 0; - btree_node_destroy(tree, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. */ - btree_remove_from_node(tree, parent, rm_hdr); - btree_verify(tree); + zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_verify(tree); } /* Remove the given value from the tree. */ void -btree_remove(btree_t *tree, const void *value) +zfs_btree_remove(zfs_btree_t *tree, const void *value) { - btree_index_t where = {0}; - VERIFY3P(btree_find(tree, value, &where), !=, NULL); - btree_remove_from(tree, &where); + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL); + zfs_btree_remove_from(tree, &where); } /* Return the number of elements in the tree. */ ulong_t -btree_numnodes(btree_t *tree) +zfs_btree_numnodes(zfs_btree_t *tree) { return (tree->bt_num_elems); } @@ -1842,30 +1842,30 @@ btree_numnodes(btree_t *tree) * needs to do. This is more efficient than just removing the first element * over and over, because it removes all rebalancing. Once the destroy_nodes() * function has been called, no other btree operations are valid until it - * returns NULL, which point the only valid operation is btree_destroy(). + * returns NULL, which point the only valid operation is zfs_btree_destroy(). 
* * example: * - * btree_index_t *cookie = NULL; + * zfs_btree_index_t *cookie = NULL; * my_data_t *node; * - * while ((node = btree_destroy_nodes(tree, &cookie)) != NULL) + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) * free(node->ptr); - * btree_destroy(tree); + * zfs_btree_destroy(tree); * */ void * -btree_destroy_nodes(btree_t *tree, btree_index_t **cookie) +zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) { if (*cookie == NULL) { if (tree->bt_height == -1) return (NULL); *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); - return (btree_first(tree, *cookie)); + return (zfs_btree_first(tree, *cookie)); } - void *rval = btree_next_helper(tree, *cookie, *cookie, - btree_node_destroy); + void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, + zfs_btree_node_destroy); if (rval == NULL) { tree->bt_root = NULL; tree->bt_height = -1; @@ -1877,27 +1877,27 @@ btree_destroy_nodes(btree_t *tree, btree_index_t **cookie) } static void -btree_clear_helper(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (hdr->bth_core) { - btree_core_t *btc = (btree_core_t *)hdr; + zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; for (int i = 0; i <= hdr->bth_count; i++) { - btree_clear_helper(tree, btc->btc_children[i]); + zfs_btree_clear_helper(tree, btc->btc_children[i]); } } - btree_node_destroy(tree, hdr); + zfs_btree_node_destroy(tree, hdr); } void -btree_clear(btree_t *tree) +zfs_btree_clear(zfs_btree_t *tree) { if (tree->bt_root == NULL) { ASSERT0(tree->bt_num_elems); return; } - btree_clear_helper(tree, tree->bt_root); + zfs_btree_clear_helper(tree, tree->bt_root); tree->bt_num_elems = 0; tree->bt_root = NULL; tree->bt_num_nodes = 0; @@ -1906,7 +1906,7 @@ btree_clear(btree_t *tree) } void -btree_destroy(btree_t *tree) +zfs_btree_destroy(zfs_btree_t *tree) { ASSERT0(tree->bt_num_elems); ASSERT3P(tree->bt_root, ==, NULL); @@ -1914,28 +1914,28 @@ btree_destroy(btree_t *tree) /* Verify that every child of this node has the correct parent pointer. */ static void -btree_verify_pointers_helper(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (!hdr->bth_core) return; - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (int i = 0; i <= hdr->bth_count; i++) { VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); - btree_verify_pointers_helper(tree, node->btc_children[i]); + zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); } } /* Verify that every node has the correct parent pointer. */ static void -btree_verify_pointers(btree_t *tree) +zfs_btree_verify_pointers(zfs_btree_t *tree) { if (tree->bt_height == -1) { VERIFY3P(tree->bt_root, ==, NULL); return; } VERIFY3P(tree->bt_root->bth_parent, ==, NULL); - btree_verify_pointers_helper(tree, tree->bt_root); + zfs_btree_verify_pointers_helper(tree, tree->bt_root); } /* @@ -1943,24 +1943,24 @@ btree_verify_pointers(btree_t *tree) * invariants, and return the total count in the subtree rooted in this node. 
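Most of the surrounding hunks are mechanical btree_* to zfs_btree_* renames, so a consumer-level view of the renamed interface may be useful. The sketch below is not part of the patch: the element type and comparator are invented for illustration, and it only uses calls whose prototypes or definitions appear in this series.

#include <sys/btree.h>

static int
example_compare(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a;
	uint64_t y = *(const uint64_t *)b;

	return (x < y ? -1 : (x > y ? 1 : 0));
}

static uint64_t
example_usage(void)
{
	zfs_btree_t t;
	zfs_btree_index_t where;
	uint64_t sum = 0;

	zfs_btree_create(&t, example_compare, sizeof (uint64_t));

	for (uint64_t v = 1; v <= 5; v++) {
		/* Elements are copied into the tree by value. */
		if (zfs_btree_find(&t, &v, &where) == NULL)
			zfs_btree_insert(&t, &v, &where);
	}

	/* In-order walk: first/next, as used by space_map_write_impl(). */
	for (uint64_t *vp = zfs_btree_first(&t, &where); vp != NULL;
	    vp = zfs_btree_next(&t, &where, &where))
		sum += *vp;

	uint64_t four = 4;
	zfs_btree_remove(&t, &four);	/* value must be present, or this VERIFYs */

	zfs_btree_clear(&t);		/* tree must be empty before destroy */
	zfs_btree_destroy(&t);

	return (sum);			/* 1 + 2 + ... + 5 = 15 */
}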
*/ static uint64_t -btree_verify_counts_helper(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (!hdr->bth_core) { if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - - sizeof (btree_hdr_t)) / tree->bt_elem_size, 2); + sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); VERIFY3U(hdr->bth_count, >=, capacity / 2); } return (hdr->bth_count); } else { - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = hdr->bth_count; if (tree->bt_root != hdr && tree->bt_bulk == NULL) VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2); for (int i = 0; i <= hdr->bth_count; i++) { - ret += btree_verify_counts_helper(tree, + ret += zfs_btree_verify_counts_helper(tree, node->btc_children[i]); } @@ -1973,13 +1973,13 @@ btree_verify_counts_helper(btree_t *tree, btree_hdr_t *hdr) * elements is correct. */ static void -btree_verify_counts(btree_t *tree) +zfs_btree_verify_counts(zfs_btree_t *tree) { EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); if (tree->bt_height == -1) { return; } - VERIFY3P(btree_verify_counts_helper(tree, tree->bt_root), ==, + VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, tree->bt_num_elems); } @@ -1988,7 +1988,7 @@ btree_verify_counts(btree_t *tree) * the number of nodes under this node, to help verify bt_num_nodes. */ static uint64_t -btree_verify_height_helper(btree_t *tree, btree_hdr_t *hdr, int64_t height) +zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, int64_t height) { if (!hdr->bth_core) { VERIFY0(height); @@ -1996,10 +1996,10 @@ btree_verify_height_helper(btree_t *tree, btree_hdr_t *hdr, int64_t height) } VERIFY(hdr->bth_core); - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = 1; for (int i = 0; i <= hdr->bth_count; i++) { - ret += btree_verify_height_helper(tree, node->btc_children[i], + ret += zfs_btree_verify_height_helper(tree, node->btc_children[i], height - 1); } return (ret); @@ -2010,14 +2010,14 @@ btree_verify_height_helper(btree_t *tree, btree_hdr_t *hdr, int64_t height) * bt_height in the tree is correct. */ static void -btree_verify_height(btree_t *tree) +zfs_btree_verify_height(zfs_btree_t *tree) { EQUIV(tree->bt_height == -1, tree->bt_root == NULL); if (tree->bt_height == -1) { return; } - VERIFY3U(btree_verify_height_helper(tree, tree->bt_root, + VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, tree->bt_height), ==, tree->bt_num_nodes); } @@ -2027,11 +2027,11 @@ btree_verify_height(btree_t *tree) * that the children also satisfy this requirement. 
*/ static void -btree_verify_order_helper(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; if (!hdr->bth_core) { - btree_leaf_t *leaf = (btree_leaf_t *)hdr; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (int i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) * size, leaf->btl_elems + i * size), ==, -1); @@ -2039,22 +2039,22 @@ btree_verify_order_helper(btree_t *tree, btree_hdr_t *hdr) return; } - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (int i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, node->btc_elems + i * size), ==, -1); } for (int i = 0; i < hdr->bth_count; i++) { uint8_t *left_child_last = NULL; - btree_hdr_t *left_child_hdr = node->btc_children[i]; + zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; if (left_child_hdr->bth_core) { - btree_core_t *left_child = - (btree_core_t *)left_child_hdr; + zfs_btree_core_t *left_child = + (zfs_btree_core_t *)left_child_hdr; left_child_last = left_child->btc_elems + (left_child_hdr->bth_count - 1) * size; } else { - btree_leaf_t *left_child = - (btree_leaf_t *)left_child_hdr; + zfs_btree_leaf_t *left_child = + (zfs_btree_leaf_t *)left_child_hdr; left_child_last = left_child->btl_elems + (left_child_hdr->bth_count - 1) * size; } @@ -2068,14 +2068,14 @@ btree_verify_order_helper(btree_t *tree, btree_hdr_t *hdr) } uint8_t *right_child_first = NULL; - btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; + zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; if (right_child_hdr->bth_core) { - btree_core_t *right_child = - (btree_core_t *)right_child_hdr; + zfs_btree_core_t *right_child = + (zfs_btree_core_t *)right_child_hdr; right_child_first = right_child->btc_elems; } else { - btree_leaf_t *right_child = - (btree_leaf_t *)right_child_hdr; + zfs_btree_leaf_t *right_child = + (zfs_btree_leaf_t *)right_child_hdr; right_child_first = right_child->btl_elems; } if (tree->bt_compar(node->btc_elems + i * size, @@ -2088,37 +2088,37 @@ btree_verify_order_helper(btree_t *tree, btree_hdr_t *hdr) } } for (int i = 0; i <= hdr->bth_count; i++) { - btree_verify_order_helper(tree, node->btc_children[i]); + zfs_btree_verify_order_helper(tree, node->btc_children[i]); } } /* Check that all elements in the tree are in sorted order. */ static void -btree_verify_order(btree_t *tree) +zfs_btree_verify_order(zfs_btree_t *tree) { EQUIV(tree->bt_height == -1, tree->bt_root == NULL); if (tree->bt_height == -1) { return; } - btree_verify_order_helper(tree, tree->bt_root); + zfs_btree_verify_order_helper(tree, tree->bt_root); } #ifdef ZFS_DEBUG /* Check that all unused memory is poisoned correctly. 
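For orientation, zfs_btree_verify() below layers these checks by cost: intensity 1 verifies only the height and node count, 2 adds the parent-pointer check, 3 adds the per-node element counts, 4 adds the sort-order check, and 5 or higher also verifies the poison bytes described here. A minimal, hedged sketch of opting into the full set from debugging code (the extern declaration is an assumption; it is not shown in this excerpt):

extern int zfs_btree_verify_intensity;	/* assumed to be an int tunable */

static void
example_enable_full_btree_checks(void)
{
	/* 1 = height, 2 = +pointers, 3 = +counts, 4 = +order, >= 5 = +poison */
	zfs_btree_verify_intensity = 5;
}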
*/ static void -btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) +zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; if (!hdr->bth_core) { - btree_leaf_t *leaf = (btree_leaf_t *)hdr; + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; uint8_t val = 0x0f; for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - - sizeof (btree_hdr_t); i++) { + sizeof (zfs_btree_hdr_t); i++) { VERIFY3U(leaf->btl_elems[i], ==, val); } } else { - btree_core_t *node = (btree_core_t *)hdr; + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint8_t val = 0x0f; for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; i++) { @@ -2127,11 +2127,11 @@ btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { VERIFY3P(node->btc_children[i], ==, - (btree_hdr_t *)BTREE_POISON); + (zfs_btree_hdr_t *)BTREE_POISON); } for (int i = 0; i <= hdr->bth_count; i++) { - btree_verify_poison_helper(tree, node->btc_children[i]); + zfs_btree_verify_poison_helper(tree, node->btc_children[i]); } } } @@ -2139,32 +2139,32 @@ btree_verify_poison_helper(btree_t *tree, btree_hdr_t *hdr) /* Check that unused memory in the tree is still poisoned. */ static void -btree_verify_poison(btree_t *tree) +zfs_btree_verify_poison(zfs_btree_t *tree) { #ifdef ZFS_DEBUG if (tree->bt_height == -1) return; - btree_verify_poison_helper(tree, tree->bt_root); + zfs_btree_verify_poison_helper(tree, tree->bt_root); #endif } void -btree_verify(btree_t *tree) +zfs_btree_verify(zfs_btree_t *tree) { - if (btree_verify_intensity == 0) + if (zfs_btree_verify_intensity == 0) return; - btree_verify_height(tree); - if (btree_verify_intensity == 1) + zfs_btree_verify_height(tree); + if (zfs_btree_verify_intensity == 1) return; - btree_verify_pointers(tree); - if (btree_verify_intensity == 2) + zfs_btree_verify_pointers(tree); + if (zfs_btree_verify_intensity == 2) return; - btree_verify_counts(tree); - if (btree_verify_intensity == 3) + zfs_btree_verify_counts(tree); + if (zfs_btree_verify_intensity == 3) return; - btree_verify_order(tree); + zfs_btree_verify_order(tree); - if (btree_verify_intensity == 4) + if (zfs_btree_verify_intensity == 4) return; - btree_verify_poison(tree); + zfs_btree_verify_poison(tree); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 997e0ea82863..255d521750c1 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -279,7 +279,7 @@ struct dsl_scan_io_queue { /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; - btree_t q_exts_by_size; + zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; @@ -646,7 +646,7 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); - ASSERT3P(btree_first(&q->q_exts_by_size, NULL), ==, + ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); @@ -1243,7 +1243,7 @@ dsl_scan_should_clear(dsl_scan_t *scn) queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* # extents in exts_by_size = # in exts_by_addr */ - mused += btree_numnodes(&queue->q_exts_by_size) * + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * sizeof (range_seg_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); @@ -2924,7 +2924,7 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t 
*queue) if (zfs_scan_issue_strategy == 1) { return (range_tree_first(queue->q_exts_by_addr)); } else if (zfs_scan_issue_strategy == 2) { - return (btree_first(&queue->q_exts_by_size, NULL)); + return (zfs_btree_first(&queue->q_exts_by_size, NULL)); } } @@ -2940,7 +2940,7 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) if (scn->scn_checkpointing) { return (range_tree_first(queue->q_exts_by_addr)); } else if (scn->scn_clearing) { - return (btree_first(&queue->q_exts_by_size, NULL)); + return (zfs_btree_first(&queue->q_exts_by_size, NULL)); } else { return (NULL); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 5767ea86053d..de01782bd076 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1332,7 +1332,7 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) return (TREE_CMP(r1->rs_start, r2->rs_start)); } typedef struct metaslab_rt_arg { - btree_t *mra_bt; + zfs_btree_t *mra_bt; uint32_t mra_floor_shift; } metaslab_rt_arg_t; @@ -1360,7 +1360,7 @@ metaslab_size_tree_full_load(range_tree_t *rt) #ifdef _METASLAB_TRACING METASLABSTAT_BUMP(metaslabstat_reload_tree); #endif - ASSERT0(btree_numnodes(mrap->mra_bt)); + ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; arg.rt = rt; @@ -1377,7 +1377,7 @@ static void metaslab_rt_create(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; - btree_t *size_tree = mrap->mra_bt; + zfs_btree_t *size_tree = mrap->mra_bt; size_t size; int (*compare) (const void *, const void *); @@ -1393,7 +1393,7 @@ metaslab_rt_create(range_tree_t *rt, void *arg) default: panic("Invalid range seg type %d", rt->rt_type); } - btree_create(size_tree, compare, size); + zfs_btree_create(size_tree, compare, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } @@ -1402,9 +1402,9 @@ static void metaslab_rt_destroy(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; - btree_t *size_tree = mrap->mra_bt; + zfs_btree_t *size_tree = mrap->mra_bt; - btree_destroy(size_tree); + zfs_btree_destroy(size_tree); kmem_free(mrap, sizeof (*mrap)); } @@ -1413,13 +1413,13 @@ static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; - btree_t *size_tree = mrap->mra_bt; + zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << mrap->mra_floor_shift)) return; - btree_add(size_tree, rs); + zfs_btree_add(size_tree, rs); } /* ARGSUSED */ @@ -1427,13 +1427,13 @@ static void metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; - btree_t *size_tree = mrap->mra_bt; + zfs_btree_t *size_tree = mrap->mra_bt; if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << mrap->mra_floor_shift)) return; - btree_remove(size_tree, rs); + zfs_btree_remove(size_tree, rs); } /* ARGSUSED */ @@ -1441,9 +1441,9 @@ static void metaslab_rt_vacate(range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; - btree_t *size_tree = mrap->mra_bt; - btree_clear(size_tree); - btree_destroy(size_tree); + zfs_btree_t *size_tree = mrap->mra_bt; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); metaslab_rt_create(rt, arg); } @@ -1468,15 +1468,15 @@ static range_tree_ops_t metaslab_rt_ops = { uint64_t metaslab_largest_allocatable(metaslab_t *msp) { - btree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; range_seg_t *rs; if (t == NULL) return (0); - if (btree_numnodes(t) == 0) + if (zfs_btree_numnodes(t) == 0) 
metaslab_size_tree_full_load(msp->ms_allocatable); - rs = btree_last(t, NULL); + rs = zfs_btree_last(t, NULL); if (rs == NULL) return (0); @@ -1496,9 +1496,9 @@ metaslab_largest_unflushed_free(metaslab_t *msp) if (msp->ms_unflushed_frees == NULL) return (0); - if (btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) + if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); - range_seg_t *rs = btree_last(&msp->ms_unflushed_frees_by_size, NULL); + range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); @@ -1550,8 +1550,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp) } static range_seg_t * -metaslab_block_find(btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, - btree_index_t *where) +metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, + zfs_btree_index_t *where) { range_seg_t *rs; range_seg_max_t rsearch; @@ -1559,9 +1559,9 @@ metaslab_block_find(btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, rs_set_start(&rsearch, rt, start); rs_set_end(&rsearch, rt, start + size); - rs = btree_find(t, &rsearch, where); + rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { - rs = btree_next(t, where, where); + rs = zfs_btree_next(t, where, where); } return (rs); @@ -1580,8 +1580,8 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, { if (*cursor == 0) *cursor = rt->rt_start; - btree_t *bt = &rt->rt_root; - btree_index_t where; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); uint64_t first_found; int count_searched = 0; @@ -1596,7 +1596,7 @@ metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, *cursor = offset + size; return (offset); } - rs = btree_next(bt, &where, &where); + rs = zfs_btree_next(bt, &where, &where); count_searched++; } @@ -1658,13 +1658,13 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) if (offset == -1) { range_seg_t *rs; - if (btree_numnodes(&msp->ms_allocatable_by_size) == 0) + if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); if (metaslab_df_use_largest_segment) { /* use largest free segment */ - rs = btree_last(&msp->ms_allocatable_by_size, NULL); + rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { - btree_index_t where; + zfs_btree_index_t where; /* use segment of this size, or next largest */ #ifdef _METASLAB_TRACING metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; @@ -1707,7 +1707,7 @@ static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { range_tree_t *rt = msp->ms_allocatable; - btree_t *t = &msp->ms_allocatable_by_size; + zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; @@ -1719,9 +1719,9 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) if ((*cursor + size) > *cursor_end) { range_seg_t *rs; - if (btree_numnodes(t) == 0) + if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); - rs = btree_last(t, NULL); + rs = zfs_btree_last(t, NULL); if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) return (-1ULL); @@ -1762,9 +1762,9 @@ uint64_t metaslab_ndf_clump_shift = 4; static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - btree_t *t = &msp->ms_allocatable->rt_root; + zfs_btree_t *t = &msp->ms_allocatable->rt_root; range_tree_t 
*rt = msp->ms_allocatable; - btree_index_t where; + zfs_btree_index_t where; range_seg_t *rs; range_seg_max_t rsearch; uint64_t hbit = highbit64(size); @@ -1779,7 +1779,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rs_set_start(&rsearch, rt, *cursor); rs_set_end(&rsearch, rt, *cursor + size); - rs = btree_find(t, &rsearch, &where); + rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { t = &msp->ms_allocatable_by_size; @@ -1787,9 +1787,9 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift))); - rs = btree_find(t, &rsearch, &where); + rs = zfs_btree_find(t, &rsearch, &where); if (rs == NULL) - rs = btree_next(t, &where, &where); + rs = zfs_btree_next(t, &where, &where); ASSERT(rs != NULL); } @@ -2141,8 +2141,8 @@ metaslab_potentially_evict(metaslab_class_t *mc) { #ifdef _KERNEL uint64_t allmem = arc_all_memory(); - uint64_t inuse = btree_leaf_cache->skc_obj_total; - uint64_t size = btree_leaf_cache->skc_obj_size; + uint64_t inuse = zfs_btree_leaf_cache->skc_obj_total; + uint64_t size = zfs_btree_leaf_cache->skc_obj_size; int tries = 0; for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; @@ -2179,7 +2179,7 @@ metaslab_potentially_evict(metaslab_class_t *mc) */ if (msp->ms_loading) { msp = next_msp; - inuse = btree_leaf_cache->skc_obj_total; + inuse = zfs_btree_leaf_cache->skc_obj_total; continue; } /* @@ -2201,7 +2201,7 @@ metaslab_potentially_evict(metaslab_class_t *mc) } mutex_exit(&msp->ms_lock); msp = next_msp; - inuse = btree_leaf_cache->skc_obj_total; + inuse = zfs_btree_leaf_cache->skc_obj_total; } } #endif diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index df01e4dd9c31..3470dae1fda5 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -99,12 +99,12 @@ void range_tree_stat_verify(range_tree_t *rt) { range_seg_t *rs; - btree_index_t where; + zfs_btree_index_t where; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; - for (rs = btree_first(&rt->rt_root, &where); rs != NULL; - rs = btree_next(&rt->rt_root, &where, &where)) { + for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); int idx = highbit64(size) - 1; @@ -188,7 +188,7 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) range_tree_t * range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, - int (*btree_compare) (const void *, const void *), + int (*zfs_btree_compare) (const void *, const void *), uint64_t gap) { range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); @@ -213,7 +213,7 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, default: panic("Invalid range seg type %d", type); } - btree_create(&rt->rt_root, compare, size); + zfs_btree_create(&rt->rt_root, compare, size); rt->rt_ops = ops; rt->rt_gap = gap; @@ -221,7 +221,7 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, rt->rt_type = type; rt->rt_start = start; rt->rt_shift = shift; - rt->rt_btree_compare = btree_compare; + rt->rt_btree_compare = zfs_btree_compare; if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); @@ -244,7 +244,7 @@ range_tree_destroy(range_tree_t *rt) if (rt->rt_ops != NULL && 
rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); - btree_destroy(&rt->rt_root); + zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } @@ -266,7 +266,7 @@ static void range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; - btree_index_t where; + zfs_btree_index_t where; range_seg_t *rs_before, *rs_after, *rs; range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; @@ -279,7 +279,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) rs_set_start(&rsearch, rt, start); rs_set_end(&rsearch, rt, end); - rs = btree_find(&rt->rt_root, &rsearch, &where); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* * If this is a gap-supporting range tree, it is possible that we @@ -299,7 +299,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) return; } - btree_remove(&rt->rt_root, rs); + zfs_btree_remove(&rt->rt_root, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); @@ -322,9 +322,9 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * If gap != 0, we might need to merge with our neighbors even if we * aren't directly touching. */ - btree_index_t where_before, where_after; - rs_before = btree_prev(&rt->rt_root, &where, &where_before); - rs_after = btree_next(&rt->rt_root, &where, &where_after); + zfs_btree_index_t where_before, where_after; + rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); + rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= start - gap); @@ -349,13 +349,13 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) uint64_t before_start = rs_get_start_raw(rs_before, rt); uint64_t before_fill = rs_get_fill(rs_before, rt); uint64_t after_fill = rs_get_fill(rs_after, rt); - btree_remove_from(&rt->rt_root, &where_before); + zfs_btree_remove_from(&rt->rt_root, &where_before); /* * We have to re-find the node because our old reference is * invalid as soon as we do any mutating btree operations. 
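The add path above can be summarized with a small, purely illustrative sequence against a plain 64-bit range tree (constructor arguments mirror the range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0) calls elsewhere in this patch); segments that touch an existing neighbor are coalesced rather than stored separately:

static void
example_range_tree_merge(void)
{
	range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);

	range_tree_add(rt, 0, 10);	/* tree: [0, 10)           */
	range_tree_add(rt, 20, 10);	/* tree: [0, 10)  [20, 30) */
	range_tree_add(rt, 10, 10);	/* touches both neighbors  */

	/* The three additions were merged into a single segment [0, 30). */
	ASSERT3U(range_tree_numsegs(rt), ==, 1);
	ASSERT3U(range_tree_space(rt), ==, 30);

	range_tree_vacate(rt, NULL, NULL);	/* empty it before destroy */
	range_tree_destroy(rt);
}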
*/ - rs_after = btree_find(&rt->rt_root, &tmp, &where_after); + rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); rs_set_start_raw(rs_after, rt, before_start); rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; @@ -385,7 +385,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) rs_set_start(rs, rt, start); rs_set_end(rs, rt, end); rs_set_fill(rs, rt, fill); - btree_insert(&rt->rt_root, rs, &where); + zfs_btree_insert(&rt->rt_root, rs, &where); } if (gap != 0) { @@ -413,7 +413,7 @@ static void range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { - btree_index_t where; + zfs_btree_index_t where; range_seg_t *rs; range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; @@ -426,7 +426,7 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rs_set_start(&rsearch, rt, start); rs_set_end(&rsearch, rt, end); - rs = btree_find(&rt->rt_root, &rsearch, &where); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ if (rs == NULL) { @@ -487,10 +487,10 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rs_set_end(rs, rt, start); rs_copy(rs, &rs_tmp, rt); - if (btree_next(&rt->rt_root, &where, &where) != NULL) - btree_insert(&rt->rt_root, &newseg, &where); + if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) + zfs_btree_insert(&rt->rt_root, &newseg, &where); else - btree_add(&rt->rt_root, &newseg); + zfs_btree_add(&rt->rt_root, &newseg); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); @@ -503,7 +503,7 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rs_set_start(rs, rt, end); rs_copy(rs, &rs_tmp, rt); } else { - btree_remove_from(&rt->rt_root, &where); + zfs_btree_remove_from(&rt->rt_root, &where); rs = NULL; } @@ -566,7 +566,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) rs_set_start(&rsearch, rt, start); rs_set_end(&rsearch, rt, end); - return (btree_find(&rt->rt_root, &rsearch, NULL)); + return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } static range_seg_t * @@ -613,15 +613,15 @@ range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, rs_set_start(&rsearch, rt, start); rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); - btree_index_t where; - range_seg_t *rs = btree_find(&rt->rt_root, &rsearch, &where); + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); if (rs != NULL) { *ostart = start; *osize = MIN(size, rs_get_end(rs, rt) - start); return (B_TRUE); } - rs = btree_next(&rt->rt_root, &where, &where); + rs = zfs_btree_next(&rt->rt_root, &where, &where); if (rs == NULL || rs_get_start(rs, rt) > start + size) return (B_FALSE); @@ -659,7 +659,7 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) range_tree_t *rt; ASSERT0(range_tree_space(*rtdst)); - ASSERT0(btree_numnodes(&(*rtdst)->rt_root)); + ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; *rtsrc = *rtdst; @@ -674,15 +674,15 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) if (func != NULL) { range_seg_t *rs; - btree_index_t *cookie = NULL; + zfs_btree_index_t *cookie = NULL; - while ((rs = btree_destroy_nodes(&rt->rt_root, &cookie)) != + while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - rs_get_start(rs, rt)); } } else { - 
btree_clear(&rt->rt_root); + zfs_btree_clear(&rt->rt_root); } bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); @@ -692,9 +692,9 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { - btree_index_t where; - for (range_seg_t *rs = btree_first(&rt->rt_root, &where); rs != NULL; - rs = btree_next(&rt->rt_root, &where, &where)) { + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - rs_get_start(rs, rt)); } @@ -703,7 +703,7 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) range_seg_t * range_tree_first(range_tree_t *rt) { - return (btree_first(&rt->rt_root, NULL)); + return (zfs_btree_first(&rt->rt_root, NULL)); } uint64_t @@ -715,7 +715,7 @@ range_tree_space(range_tree_t *rt) uint64_t range_tree_numsegs(range_tree_t *rt) { - return ((rt == NULL) ? 0 : btree_numnodes(&rt->rt_root)); + return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); } boolean_t @@ -729,7 +729,7 @@ range_tree_is_empty(range_tree_t *rt) void rt_btree_create(range_tree_t *rt, void *arg) { - btree_t *size_tree = arg; + zfs_btree_t *size_tree = arg; size_t size; switch (rt->rt_type) { @@ -745,44 +745,44 @@ rt_btree_create(range_tree_t *rt, void *arg) default: panic("Invalid range seg type %d", rt->rt_type); } - btree_create(size_tree, rt->rt_btree_compare, size); + zfs_btree_create(size_tree, rt->rt_btree_compare, size); } /* ARGSUSED */ void rt_btree_destroy(range_tree_t *rt, void *arg) { - btree_t *size_tree = arg; - ASSERT0(btree_numnodes(size_tree)); + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); - btree_destroy(size_tree); + zfs_btree_destroy(size_tree); } /* ARGSUSED */ void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) { - btree_t *size_tree = arg; + zfs_btree_t *size_tree = arg; - btree_add(size_tree, rs); + zfs_btree_add(size_tree, rs); } /* ARGSUSED */ void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { - btree_t *size_tree = arg; + zfs_btree_t *size_tree = arg; - btree_remove(size_tree, rs); + zfs_btree_remove(size_tree, rs); } /* ARGSUSED */ void rt_btree_vacate(range_tree_t *rt, void *arg) { - btree_t *size_tree = arg; - btree_clear(size_tree); - btree_destroy(size_tree); + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); rt_btree_create(rt, arg); } @@ -803,17 +803,17 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_t *removefrom, range_tree_t *addto) { - btree_index_t where; + zfs_btree_index_t where; range_seg_max_t starting_rs; rs_set_start(&starting_rs, removefrom, start); rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, removefrom) + 1); - range_seg_t *curr = btree_find(&removefrom->rt_root, + range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) - curr = btree_next(&removefrom->rt_root, &where, &where); + curr = zfs_btree_next(&removefrom->rt_root, &where, &where); range_seg_t *next; for (; curr != NULL; curr = next) { @@ -842,7 +842,7 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, range_tree_add(addto, start, overlap_start - start); start = overlap_end; - next = btree_find(&removefrom->rt_root, &rs, &where); + next = zfs_btree_find(&removefrom->rt_root, &rs, &where); /* * If we find something here, we only 
removed part of the * curr segment. Either there's some left at the end @@ -858,7 +858,7 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, removefrom)); } - next = btree_next(&removefrom->rt_root, &where, &where); + next = zfs_btree_next(&removefrom->rt_root, &where, &where); } VERIFY3P(curr, ==, NULL); @@ -878,9 +878,9 @@ void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, range_tree_t *addto) { - btree_index_t where; - for (range_seg_t *rs = btree_first(&rt->rt_root, &where); rs; - rs = btree_next(&rt->rt_root, &where, &where)) { + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { range_tree_remove_xor_add_segment(rs_get_start(rs, rt), rs_get_end(rs, rt), removefrom, addto); } @@ -889,14 +889,14 @@ range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, uint64_t range_tree_min(range_tree_t *rt) { - range_seg_t *rs = btree_first(&rt->rt_root, NULL); + range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); return (rs != NULL ? rs_get_start(rs, rt) : 0); } uint64_t range_tree_max(range_tree_t *rt) { - range_seg_t *rs = btree_last(&rt->rt_root, NULL); + range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); return (rs != NULL ? rs_get_end(rs, rt) : 0); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index e7fe0a8f6fb5..78bf110b40e6 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2319,7 +2319,7 @@ spa_init(int mode) fm_init(); zfs_refcount_init(); unique_init(); - btree_init(); + zfs_btree_init(); metaslab_stat_init(); ddt_init(); zio_init(); @@ -2355,7 +2355,7 @@ spa_fini(void) zio_fini(); ddt_fini(); metaslab_stat_fini(); - btree_fini(); + zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 99bac9d4bd9c..eb2c36942543 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -674,10 +674,10 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, dmu_buf_will_dirty(db, tx); - btree_t *t = &rt->rt_root; - btree_index_t where; - for (range_seg_t *rs = btree_first(t, &where); rs != NULL; - rs = btree_next(t, &where, &where)) { + zfs_btree_t *t = &rt->rt_root; + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; + rs = zfs_btree_next(t, &where, &where)) { uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> sm->sm_shift; uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> @@ -754,7 +754,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - uint64_t nodes = btree_numnodes(&rt->rt_root); + uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); uint64_t rt_space = range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); @@ -763,7 +763,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * Ensure that the space_map's accounting wasn't changed * while we were in the middle of writing it out. 
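Returning to range_tree_remove_xor_add_segment() in the range_tree.c hunk above, its effect is easiest to see on concrete intervals; the sketch below is illustrative only:

static void
example_remove_xor_add_segment(range_tree_t *removefrom, range_tree_t *addto)
{
	/* Suppose removefrom holds [0, 15) and [20, 25), and addto is empty. */
	range_tree_remove_xor_add_segment(10, 30, removefrom, addto);
	/*
	 * The overlapping pieces [10, 15) and [20, 25) were removed from
	 * removefrom, leaving it with [0, 10); the uncovered pieces
	 * [15, 20) and [25, 30) were added to addto.
	 */
}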
*/ - VERIFY3U(nodes, ==, btree_numnodes(&rt->rt_root)); + VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); } diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index ef9b0473bc15..080fc6646512 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -109,10 +109,10 @@ space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) { - btree_index_t where; + zfs_btree_index_t where; - for (range_seg_t *rs = btree_first(&rt->rt_root, &where); rs; rs = - btree_next(&rt->rt_root, &where, &where)) { + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = + zfs_btree_next(&rt->rt_root, &where, &where)) { space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, rt), refcnt); } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index fcfb4effe6de..a27ab3c25919 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -292,11 +292,11 @@ static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { range_tree_t *rt = vd->vdev_initialize_tree; - btree_t *bt = &rt->rt_root; - btree_index_t where; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; - for (range_seg_t *rs = btree_first(bt, &where); rs != NULL; - rs = btree_next(bt, &where, &where)) { + for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; + rs = zfs_btree_next(bt, &where, &where)) { uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ @@ -367,11 +367,11 @@ vdev_initialize_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - btree_index_t where; + zfs_btree_index_t where; range_tree_t *rt = msp->ms_allocatable; for (range_seg_t *rs = - btree_first(&rt->rt_root, &where); rs; - rs = btree_next(&rt->rt_root, &where, + zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index a30f521cd8d6..e844949b765d 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -969,11 +969,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * additional split blocks. 
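The lookup that follows leans on a useful idiom: when zfs_btree_find() has no exact match it still fills in the index with the would-be insertion point, so zfs_btree_prev() from that index yields the greatest element sorting before the search key. A generic, hedged sketch of that "floor" lookup (not code from this patch):

/* Return the greatest element that compares <= *val, or NULL. */
static void *
example_btree_floor(zfs_btree_t *t, const void *val, zfs_btree_index_t *where)
{
	void *ret = zfs_btree_find(t, val, where);

	if (ret != NULL)
		return (ret);			/* exact match */
	return (zfs_btree_prev(t, where, where));
}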
*/ range_seg_max_t search; - btree_index_t where; + zfs_btree_index_t where; rs_set_start(&search, segs, start + maxalloc); rs_set_end(&search, segs, start + maxalloc); - (void) btree_find(&segs->rt_root, &search, &where); - range_seg_t *rs = btree_prev(&segs->rt_root, &where, &where); + (void) zfs_btree_find(&segs->rt_root, &search, &where); + range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where); if (rs != NULL) { size = rs_get_end(rs, segs) - start; } else { @@ -1011,11 +1011,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); - btree_index_t where; - range_seg_t *rs = btree_first(&segs->rt_root, &where); + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); ASSERT3U(rs_get_start(rs, segs), ==, start); uint64_t prev_seg_end = rs_get_end(rs, segs); - while ((rs = btree_next(&segs->rt_root, &where, &where)) != NULL) { + while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { if (rs_get_start(rs, segs) >= start + size) { break; } else { @@ -1484,7 +1484,7 @@ spa_vdev_remove_thread(void *arg) vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", - btree_numnodes(&svr->svr_allocd_segs->rt_root), + zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root), msp->ms_id); while (!svr->svr_thread_exit && diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 21ba593c6678..c7c429cbd5e3 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -523,8 +523,8 @@ static int vdev_trim_ranges(trim_args_t *ta) { vdev_t *vd = ta->trim_vdev; - btree_t *t = &ta->trim_tree->rt_root; - btree_index_t idx; + zfs_btree_t *t = &ta->trim_tree->rt_root; + zfs_btree_index_t idx; uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; @@ -532,8 +532,8 @@ vdev_trim_ranges(trim_args_t *ta) ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; - for (range_seg_t *rs = btree_first(t, &idx); rs != NULL; - rs = btree_next(t, &idx, &idx)) { + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, ta->trim_tree); @@ -614,10 +614,10 @@ vdev_trim_calculate_progress(vdev_t *vd) VERIFY0(metaslab_load(msp)); range_tree_t *rt = msp->ms_allocatable; - btree_t *bt = &rt->rt_root; - btree_index_t idx; - for (range_seg_t *rs = btree_first(bt, &idx); - rs != NULL; rs = btree_next(bt, &idx, &idx)) { + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t idx; + for (range_seg_t *rs = zfs_btree_first(bt, &idx); + rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); From 7224043a8dc5f4760f334506dfbbf79ba758db7c Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 6 Sep 2019 11:19:57 -0700 Subject: [PATCH 14/18] Fix sorted scan bug Signed-off-by: Paul Dagnelie --- include/sys/btree.h | 19 ++++--- include/sys/range_tree.h | 3 +- module/zfs/btree.c | 109 ++++++++++++++++++++++---------------- module/zfs/dsl_scan.c | 19 +++++-- module/zfs/metaslab.c | 7 +-- module/zfs/range_tree.c | 6 +-- module/zfs/vdev_removal.c | 3 +- 7 files changed, 101 insertions(+), 65 deletions(-) diff --git a/include/sys/btree.h b/include/sys/btree.h index 4298496eb597..4e8a282b64c2 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -125,7 +125,8 @@ void 
zfs_btree_fini(void); * -1 for <, 0 for ==, and +1 for > * size - the value of sizeof(struct my_type) */ -void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), size_t); +void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), + size_t); /* * Find a node with a matching value in the tree. Returns the matching node @@ -133,7 +134,8 @@ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), size_ * "where" for use with zfs_btree_insert() or zfs_btree_nearest(). * * node - node that has the value being looked for - * where - position for use with zfs_btree_nearest() or zfs_btree_insert(), may be NULL + * where - position for use with zfs_btree_nearest() or zfs_btree_insert(), + * may be NULL */ void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); @@ -155,8 +157,10 @@ void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); /* * Return the next or previous valued node in the tree. */ -void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, zfs_btree_index_t *); -void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, zfs_btree_index_t *); +void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); +void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, + zfs_btree_index_t *); /* * Get a value from a tree and an index. @@ -192,10 +196,11 @@ ulong_t zfs_btree_numnodes(zfs_btree_t *); * removed from the tree and may be free()'d. Returns NULL when the tree is * empty. * - * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it and - * finally zfs_btree_destroy(). No other B-Tree routines will be valid. + * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it + * and finally zfs_btree_destroy(). No other B-Tree routines will be valid. * - * cookie - an index used to save state between calls to zfs_btree_destroy_nodes() + * cookie - an index used to save state between calls to + * zfs_btree_destroy_nodes() * * EXAMPLE: * zfs_btree_t *tree; diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 069128a3d44f..d80faa933e30 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -53,7 +53,7 @@ typedef enum range_seg_type { * must provide external locking if required. 
*/ typedef struct range_tree { - zfs_btree_t rt_root; /* offset-ordered segment b-tree */ + zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ range_seg_type_t rt_type; /* type of range_seg_t in use */ /* @@ -285,6 +285,7 @@ range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize); void range_tree_verify_not_present(range_tree_t *rt, diff --git a/module/zfs/btree.c b/module/zfs/btree.c index b16055589e85..ddd308c29460 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -73,8 +73,8 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) if (!hdr->bth_core) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f, - BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - hdr->bth_count * - size); + BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) - + hdr->bth_count * size); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { @@ -88,7 +88,8 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) } static inline void -zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint64_t offset) +zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; @@ -106,7 +107,8 @@ zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint64_t offse } static inline void -zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint64_t offset) +zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + uint64_t offset) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; @@ -245,8 +247,9 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = zfs_btree_find_in_buf(tree, last_leaf->btl_elems, - last_leaf->btl_hdr.bth_count, value, &idx); + void *d = zfs_btree_find_in_buf(tree, + last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, + value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)last_leaf; @@ -285,8 +288,8 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * The value is in this leaf, or it would be if it were in the * tree. Find its proper location and return it. */ - zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : - (zfs_btree_leaf_t *)node); + zfs_btree_leaf_t *leaf = (depth == 0 ? + (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems, leaf->btl_hdr.bth_count, value, &idx); @@ -372,7 +375,8 @@ bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); - zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); + zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : + c_start + off); uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 
1 : 0); bcopy(c_start, c_out, c_count * sizeof (*c_start)); } @@ -510,8 +514,9 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, if (parent == NULL) { ASSERT3P(old_node, ==, tree->bt_root); tree->bt_num_nodes++; - zfs_btree_core_t *new_root = kmem_alloc(sizeof (zfs_btree_core_t) + - BTREE_CORE_ELEMS * size, KM_SLEEP); + zfs_btree_core_t *new_root = + kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * + size, KM_SLEEP); zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; new_root_hdr->bth_parent = NULL; new_root_hdr->bth_core = B_TRUE; @@ -534,8 +539,8 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, */ zfs_btree_index_t idx; ASSERT(par_hdr->bth_core); - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, - buf, &idx), ==, NULL); + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint64_t offset = idx.bti_offset; ASSERT3U(offset, <=, par_hdr->bth_count); @@ -648,7 +653,8 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, uint8_t *e_out = new_parent->btc_elems + e_count * size; bcopy(buf, e_out, size); - zfs_btree_hdr_t **c_out = new_parent->btc_children + e_count + 1; + zfs_btree_hdr_t **c_out = new_parent->btc_children + e_count + + 1; *c_out = new_node; ASSERT3P(*(c_out - 1), ==, old_node); @@ -685,20 +691,20 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting. */ - zfs_btree_insert_into_parent(tree, &parent->btc_hdr, &new_parent->btc_hdr, - buf); + zfs_btree_insert_into_parent(tree, &parent->btc_hdr, + &new_parent->btc_hdr, buf); } /* Helper function for inserting a new value into leaf at the given index. */ static void -zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, const void *value, - uint64_t idx) +zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + const void *value, uint64_t idx) { uint64_t size = tree->bt_elem_size; uint8_t *start = leaf->btl_elems + (idx * size); uint64_t count = leaf->btl_hdr.bth_count - idx; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / - size, 2); + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); /* * If the leaf isn't full, shift the elements after idx and insert @@ -734,7 +740,8 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, const void uint64_t keep_count = capacity - move_count; ASSERT3U(capacity - move_count, >=, 2); tree->bt_num_nodes++; - zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP); + zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; new_hdr->bth_parent = leaf->btl_hdr.bth_parent; new_hdr->bth_core = B_FALSE; @@ -801,7 +808,8 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, const void * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting, bur only of core nodes. 
*/ - zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, buf); + zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr, + buf); kmem_free(buf, size); } @@ -842,8 +850,8 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) zfs_btree_hdr_t *hdr = &leaf->btl_hdr; zfs_btree_core_t *parent = hdr->bth_parent; uint64_t size = tree->bt_elem_size; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / - size, 2); + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); /* * The invariant doesn't apply to the root node, if that's the only @@ -943,8 +951,8 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) */ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); - zfs_btree_core_t *l_neighbor = (zfs_btree_core_t *)parent->btc_children[ - parent_idx - 1]; + zfs_btree_core_t *l_neighbor = + (zfs_btree_core_t *)parent->btc_children[ parent_idx - 1]; uint64_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, capacity / 2); @@ -1000,7 +1008,8 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * Insert value into tree at the location specified by where. */ void -zfs_btree_insert(zfs_btree_t *tree, const void *value, const zfs_btree_index_t *where) +zfs_btree_insert(zfs_btree_t *tree, const void *value, + const zfs_btree_index_t *where) { zfs_btree_index_t idx = {0}; @@ -1043,8 +1052,9 @@ zfs_btree_insert(zfs_btree_t *tree, const void *value, const zfs_btree_index_t * * If we're inserting into a leaf, go directly to the helper * function. */ - zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)where->bti_node, - value, where->bti_offset); + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)where->bti_node, value, + where->bti_offset); } else { /* * If we're inserting into a core node, we can't just shift @@ -1075,8 +1085,8 @@ zfs_btree_insert(zfs_btree_t *tree, const void *value, const zfs_btree_index_t * VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); ASSERT0(new_idx.bti_offset); ASSERT(!new_idx.bti_node->bth_core); - zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)new_idx.bti_node, - buf, 0); + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); kmem_free(buf, size); } zfs_btree_verify(tree); @@ -1101,7 +1111,8 @@ zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where) * put its location in where if non-null. */ static void * -zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr, + zfs_btree_index_t *where) { zfs_btree_hdr_t *node; @@ -1215,7 +1226,8 @@ zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx, * passed for idx and out_idx. */ void * -zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx) +zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) { return (zfs_btree_next_helper(tree, idx, out_idx, NULL)); } @@ -1225,7 +1237,8 @@ zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_ * passed for idx and out_idx. 
*/ void * -zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_t *out_idx) +zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, + zfs_btree_index_t *out_idx) { if (idx->bti_node == NULL) { ASSERT3S(tree->bt_height, ==, -1); @@ -1237,11 +1250,11 @@ zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx, zfs_btree_index_ /* * When finding the previous element of an element in a leaf, * there are two cases. If the element isn't the first one in - * the leaf, in which case we just return the previous element in - * the leaf. Otherwise, we need to traverse up our parents + * the leaf, in which case we just return the previous element + * in the leaf. Otherwise, we need to traverse up our parents * until we find one where our previous ancestor isn't the - * first child. Once we do, the previous element is the separator - * after our previous ancestor. + * first child. Once we do, the previous element is the + * separator after our previous ancestor. */ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node; if (offset != 0) { @@ -1331,7 +1344,8 @@ zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) * cannot be accessed. */ static void -zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_hdr_t *rm_hdr) +zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, + zfs_btree_hdr_t *rm_hdr) { size_t size = tree->bt_elem_size; uint64_t min_count = BTREE_CORE_ELEMS / 2; @@ -1585,8 +1599,8 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = where->bti_node; uint64_t idx = where->bti_offset; - uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / - size, 2); + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); ASSERT(!where->bti_before); if (tree->bt_bulk != NULL) { @@ -1616,7 +1630,8 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) if (hdr->bth_core) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; - void *new_value = zfs_btree_last_helper(tree, left_subtree, where); + void *new_value = zfs_btree_last_helper(tree, left_subtree, + where); ASSERT3P(new_value, !=, NULL); bcopy(new_value, node->btc_elems + idx * size, size); @@ -1988,7 +2003,8 @@ zfs_btree_verify_counts(zfs_btree_t *tree) * the number of nodes under this node, to help verify bt_num_nodes. 
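The height/count verification walks the expected height down to zero at every leaf and totals the nodes under each subtree. A sketch of that check for a generic n-ary tree (node_t and its fields are illustrative):

#include <assert.h>
#include <stdint.h>

typedef struct node {
	int		n_is_leaf;
	int		n_nchildren;	/* only meaningful for interior nodes */
	struct node	*n_children[8];
} node_t;

/*
 * Sketch: verify that every leaf sits at the same depth and return the
 * total number of nodes in the subtree, analogous to the helper above.
 */
static uint64_t
count_and_check_height(const node_t *node, int64_t height)
{
	if (node->n_is_leaf) {
		assert(height == 0);
		return (1);
	}

	uint64_t nodes = 1;
	for (int i = 0; i < node->n_nchildren; i++)
		nodes += count_and_check_height(node->n_children[i], height - 1);
	return (nodes);
}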
*/ static uint64_t -zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, int64_t height) +zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, + int64_t height) { if (!hdr->bth_core) { VERIFY0(height); @@ -1999,8 +2015,8 @@ zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, int64_t zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = 1; for (int i = 0; i <= hdr->bth_count; i++) { - ret += zfs_btree_verify_height_helper(tree, node->btc_children[i], - height - 1); + ret += zfs_btree_verify_height_helper(tree, + node->btc_children[i], height - 1); } return (ret); } @@ -2131,7 +2147,8 @@ zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) } for (int i = 0; i <= hdr->bth_count; i++) { - zfs_btree_verify_poison_helper(tree, node->btc_children[i]); + zfs_btree_verify_poison_helper(tree, + node->btc_children[i]); } } } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 255d521750c1..75e19b88e509 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2915,6 +2915,7 @@ static range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; + range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); @@ -2922,9 +2923,14 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) /* handle tunable overrides */ if (scn->scn_checkpointing || scn->scn_clearing) { if (zfs_scan_issue_strategy == 1) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (zfs_scan_issue_strategy == 2) { - return (zfs_btree_first(&queue->q_exts_by_size, NULL)); + range_seg_t *size_rs = + zfs_btree_first(&queue->q_exts_by_size, NULL); + range_seg_t *addr_rs = range_tree_find(rt, + rs_get_start(rt, size_rs), rs_get_end(rt, size_rs)); + ASSERT3P(addr_rs, !=, NULL); + return (addr_rs); } } @@ -2938,9 +2944,14 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) * In this case, we instead switch to issuing extents in LBA order. 
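The scan-queue change above follows a two-index pattern: the size-ordered btree only decides which extent to issue next, and the authoritative segment is re-found by (start, size) in the address-ordered range tree before being returned. A sketch of that pattern with illustrative types and a linear search standing in for the real range tree:

#include <stddef.h>
#include <stdint.h>

typedef struct seg {
	uint64_t s_start;
	uint64_t s_size;
} seg_t;

/* Toy address-ordered lookup: find the segment containing [start, start+size). */
static seg_t *
addr_find(seg_t *addr_segs, size_t nsegs, uint64_t start, uint64_t size)
{
	for (size_t i = 0; i < nsegs; i++) {
		if (addr_segs[i].s_start <= start &&
		    start + size <= addr_segs[i].s_start + addr_segs[i].s_size)
			return (&addr_segs[i]);
	}
	return (NULL);
}

/*
 * Sketch: given the segment picked out of a size-ordered index, re-find
 * the canonical copy in the address-ordered structure before issuing it.
 */
static seg_t *
fetch_ext_sketch(seg_t *addr_segs, size_t nsegs, const seg_t *by_size_first)
{
	if (by_size_first == NULL)
		return (NULL);
	return (addr_find(addr_segs, nsegs, by_size_first->s_start,
	    by_size_first->s_size));
}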
*/ if (scn->scn_checkpointing) { - return (range_tree_first(queue->q_exts_by_addr)); + return (range_tree_first(rt)); } else if (scn->scn_clearing) { - return (zfs_btree_first(&queue->q_exts_by_size, NULL)); + range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, + NULL); + range_seg_t *addr_rs = range_tree_find(rt, + rs_get_start(rt, size_rs), rs_get_end(rt, size_rs)); + ASSERT3P(addr_rs, !=, NULL); + return (addr_rs); } else { return (NULL); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index de01782bd076..7b8ac91b164a 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1498,7 +1498,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp) if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); - range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); + range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, + NULL); if (rs == NULL) return (0); @@ -1550,8 +1551,8 @@ metaslab_largest_unflushed_free(metaslab_t *msp) } static range_seg_t * -metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, uint64_t size, - zfs_btree_index_t *where) +metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, + uint64_t size, zfs_btree_index_t *where) { range_seg_t *rs; range_seg_max_t rsearch; diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 3470dae1fda5..0b369a43820f 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -569,7 +569,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } -static range_seg_t * +range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { if (rt->rt_type == RANGE_SEG64) @@ -693,8 +693,8 @@ void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) { zfs_btree_index_t where; - for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; - rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); + rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - rs_get_start(rs, rt)); } diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index e844949b765d..a4fac1cc590d 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -973,7 +973,8 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, rs_set_start(&search, segs, start + maxalloc); rs_set_end(&search, segs, start + maxalloc); (void) zfs_btree_find(&segs->rt_root, &search, &where); - range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where); + range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, + &where); if (rs != NULL) { size = rs_get_end(rs, segs) - start; } else { From f5fdfdabc549cabbe8c590c8334b3d891551dff6 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 23 Sep 2019 12:50:19 -0700 Subject: [PATCH 15/18] Reduce min size by 1, simplify merge and split logic Signed-off-by: Paul Dagnelie --- include/sys/btree.h | 8 +- module/zfs/btree.c | 509 +++++++++++++++++++------------------------- 2 files changed, 223 insertions(+), 294 deletions(-) diff --git a/include/sys/btree.h b/include/sys/btree.h index 4e8a282b64c2..1673b521750e 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -51,10 +51,10 @@ extern "C" { * of node it is in. * * The tree's height is the same throughout, unlike many other forms of search - * tree. 
Each node (except for the root) must be between 50% and 100% full of - * elements (and children) at all times. Any operation that would put the node - * outside of that range results in a rebalancing operation (stealing, - * merging, or splitting). + * tree. Each node (except for the root) must be between half minus one and + * completely full of elements (and children) at all times. Any operation that + * would put the node outside of that range results in a rebalancing operation + * (taking, merging, or splitting). * * This tree was implemented using descriptions from Wikipedia's articles on * B-Trees and B+ Trees. diff --git a/module/zfs/btree.c b/module/zfs/btree.c index ddd308c29460..cb4d669ac70d 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -494,6 +494,31 @@ zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) return (&leaf->btl_elems[0]); } +/* Insert an element and a child into a core node at the given offset. */ +static void +zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, + uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) +{ + uint64_t size = tree->bt_elem_size; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + ASSERT3P(par_hdr, ==, new_node->bth_parent); + ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, par_hdr, + par_hdr->bth_count); + } + /* Shift existing elements and children */ + uint64_t count = par_hdr->bth_count - offset; + bt_shift_core_right(tree, parent, offset, count, + BSS_PARALLELOGRAM); + + /* Insert new values */ + parent->btc_children[offset + 1] = new_node; + bcopy(buf, parent->btc_elems + offset * size, size); + par_hdr->bth_count++; +} + /* * Insert new_node into the parent of old_node directly after old_node, with * buf as the dividing element between the two. @@ -551,20 +576,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * and return. */ if (par_hdr->bth_count != BTREE_CORE_ELEMS) { - if (zfs_btree_verify_intensity >= 5) { - zfs_btree_verify_poison_at(tree, par_hdr, - par_hdr->bth_count); - } - - /* Shift existing elements and children */ - uint64_t count = par_hdr->bth_count - offset; - bt_shift_core_right(tree, parent, offset, count, - BSS_PARALLELOGRAM); - - /* Insert new values */ - parent->btc_children[offset + 1] = new_node; - bcopy(buf, parent->btc_elems + offset * size, size); - par_hdr->bth_count++; + zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); return; } @@ -582,10 +594,13 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, * instead move only about a quarter of the elements (and children) to * the new node. Since the average state after a long time is a 3/4 * full node, shortcutting directly to that state improves efficiency. + * + * We do this in two stages: first we split into two nodes, and then we + * reuse our existing logic to insert the new element and child. */ - uint64_t move_count = MAX(BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? - 2 : 4), 2); - uint64_t keep_count = BTREE_CORE_ELEMS - move_count; + uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? 
+ 2 : 4)) - 1, 2); + uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); tree->bt_num_nodes++; zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + @@ -598,87 +613,47 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, par_hdr->bth_count = keep_count; - /* - * The three cases to consider are that the element in buf should be - * in the existing node (with lower values), the new node (with higher - * values), or that it should separate the two nodes. - */ - if (offset < keep_count) { - /* - * Copy the back part of the elements and children to the new - * leaf. - */ - bt_transfer_core(tree, parent, keep_count, move_count, - new_parent, 0, BSS_TRAPEZOID); - - /* Store the new separator in a buffer. */ - uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); - bcopy(parent->btc_elems + (keep_count - 1) * size, tmp_buf, - size); + bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, + 0, BSS_TRAPEZOID); - /* - * Shift the remaining elements and children in the front half - * to handle the new value. - */ - bt_shift_core_right(tree, parent, offset, keep_count - 1 - - offset, BSS_PARALLELOGRAM); - - uint8_t *e_start = parent->btc_elems + offset * size; - bcopy(buf, e_start, size); + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + bcopy(parent->btc_elems + keep_count * size, tmp_buf, + size); + zfs_btree_poison_node(tree, par_hdr); - zfs_btree_hdr_t **c_start = parent->btc_children + (offset + 1); - *c_start = new_node; - ASSERT3P(*(c_start - 1), ==, old_node); + if (offset < keep_count) { + /* Insert the new node into the left half */ + zfs_btree_insert_core_impl(tree, parent, offset, new_node, + buf); /* * Move the new separator to the existing buffer. */ bcopy(tmp_buf, buf, size); - kmem_free(tmp_buf, size); } else if (offset > keep_count) { - /* Store the new separator in a buffer. */ - uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); - uint8_t *e_start = parent->btc_elems + keep_count * size; - bcopy(e_start, tmp_buf, size); - - /* - * Of the elements and children in the back half, move those - * before offset to the new leaf. - */ - uint64_t e_count = offset - keep_count - 1; - bt_transfer_core(tree, parent, keep_count + 1, - e_count, new_parent, 0, BSS_TRAPEZOID); - - /* Add the new value to the new leaf. */ - uint8_t *e_out = new_parent->btc_elems + e_count * size; - bcopy(buf, e_out, size); - - zfs_btree_hdr_t **c_out = new_parent->btc_children + e_count + - 1; - *c_out = new_node; - ASSERT3P(*(c_out - 1), ==, old_node); + /* Insert the new node into the right half */ + new_node->bth_parent = new_parent; + zfs_btree_insert_core_impl(tree, new_parent, + offset - keep_count - 1, new_node, buf); /* * Move the new separator to the existing buffer. */ bcopy(tmp_buf, buf, size); - kmem_free(tmp_buf, size); - - /* Move the rest of the back half to the new leaf. */ - bt_transfer_core(tree, parent, offset, - BTREE_CORE_ELEMS - offset, new_parent, e_count + 1, - BSS_PARALLELOGRAM); } else { /* - * The new value is the new separator, no change. - * - * Copy the back part of the elements and children to the new - * leaf. + * Move the new separator into the right half, and replace it + * with buf. We also need to shift back the elements in the + * right half to accomodate new_node. 
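The simplified flow above splits first and then reuses the ordinary insert helper. A sketch of the same three cases on plain packed arrays (illustrative names; the split point is taken as count/2 here, whereas the patch biases it during bulk load):

#include <stdint.h>
#include <string.h>

static void
array_insert(uint8_t *elems, uint64_t *countp, size_t size, uint64_t idx,
    const void *val)
{
	uint8_t *at = elems + idx * size;
	memmove(at + size, at, (*countp - idx) * size);
	memcpy(at, val, size);
	(*countp)++;
}

/*
 * Sketch: split a full array around a separator, then insert the new
 * value into whichever half it belongs to.  idx == keep is the case
 * where the new value itself becomes the separator.
 */
static void
split_then_insert(uint8_t *left, uint64_t *lcountp, uint8_t *right,
    uint64_t *rcountp, uint8_t *sep, size_t size, uint64_t idx,
    const void *val)
{
	uint64_t count = *lcountp;		/* node is full */
	uint64_t keep = count / 2;
	uint64_t move = count - keep - 1;	/* one element becomes the separator */

	memcpy(right, left + (keep + 1) * size, move * size);
	memcpy(sep, left + keep * size, size);
	*lcountp = keep;
	*rcountp = move;

	if (idx < keep) {
		array_insert(left, lcountp, size, idx, val);
	} else if (idx > keep) {
		array_insert(right, rcountp, size, idx - keep - 1, val);
	} else {
		/*
		 * The new value lands on the split point: it becomes the
		 * new separator and the old separator joins the right half.
		 */
		memmove(right + size, right, move * size);
		memcpy(right, sep, size);
		memcpy(sep, val, size);
		(*rcountp)++;
	}
}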
*/ - bt_transfer_core(tree, parent, keep_count, move_count, - new_parent, 0, BSS_PARALLELOGRAM); + bt_shift_core_right(tree, new_parent, 0, move_count, + BSS_TRAPEZOID); new_parent->btc_children[0] = new_node; + bcopy(tmp_buf, new_parent->btc_elems, size); + new_par_hdr->bth_count++; } + kmem_free(tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) @@ -695,14 +670,35 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, &new_parent->btc_hdr, buf); } +/* Insert an element into a leaf node at the given offset. */ +static void +zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + uint64_t idx, const void *value) +{ + uint64_t size = tree->bt_elem_size; + uint8_t *start = leaf->btl_elems + (idx * size); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + ASSERTV(uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2)); + uint64_t count = leaf->btl_hdr.bth_count - idx; + ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, + leaf->btl_hdr.bth_count); + } + + bt_shift_leaf_right(tree, leaf, idx, count); + bcopy(value, start, size); + hdr->bth_count++; +} + /* Helper function for inserting a new value into leaf at the given index. */ static void zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, const void *value, uint64_t idx) { uint64_t size = tree->bt_elem_size; - uint8_t *start = leaf->btl_elems + (idx * size); - uint64_t count = leaf->btl_hdr.bth_count - idx; uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / size, 2); @@ -711,13 +707,7 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, * value. */ if (leaf->btl_hdr.bth_count != capacity) { - if (zfs_btree_verify_intensity >= 5) { - zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, - leaf->btl_hdr.bth_count); - } - leaf->btl_hdr.bth_count++; - bt_shift_leaf_right(tree, leaf, idx, count); - bcopy(value, start, size); + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); return; } @@ -735,9 +725,9 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, * In either case, we're left with one extra element. The leftover * element will become the new dividing element between the two nodes. */ - uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4), - 2); - uint64_t keep_count = capacity - move_count; + uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - + 1, 2); + uint64_t keep_count = capacity - move_count - 1; ASSERT3U(capacity - move_count, >=, 2); tree->bt_num_nodes++; zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, @@ -753,57 +743,33 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) tree->bt_bulk = new_leaf; + /* Copy the back part to the new leaf. */ + bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, + 0); + /* We store the new separator in a buffer we control for simplicity. */ uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bcopy(leaf->btl_elems + (keep_count * size), buf, size); + zfs_btree_poison_node(tree, &leaf->btl_hdr); - /* - * The three cases to consider are that value should be in the new - * first node, the new second node, or that it should separate the two - * nodes. - */ if (idx < keep_count) { - /* Copy the back part to the new leaf. 
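The capacity and split-count arithmetic above can be restated on its own. A sketch with hypothetical sizes (the real BTREE_LEAF_SIZE and header size come from btree.h/btree.c); during bulk load only about a quarter of the elements move, leaving the old leaf roughly 3/4 full, which is the steady-state fill the tree converges to anyway:

#include <stddef.h>
#include <stdint.h>

#define	SKETCH_LEAF_SIZE	4096	/* hypothetical node size */
#define	SKETCH_HDR_SIZE		24	/* hypothetical header size */

/* Equivalent of P2ALIGN(..., 2): round the element count down to even. */
static uint64_t
leaf_capacity(size_t elem_size)
{
	return (((SKETCH_LEAF_SIZE - SKETCH_HDR_SIZE) / elem_size) & ~1ULL);
}

/* Split counts: one element of the full node becomes the new separator. */
static void
split_counts(uint64_t capacity, int bulk_load, uint64_t *movep,
    uint64_t *keepp)
{
	uint64_t move = capacity / (bulk_load ? 4 : 2) - 1;

	if (move < 2)
		move = 2;
	*movep = move;
	*keepp = capacity - move - 1;
}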
*/ - bt_transfer_leaf(tree, leaf, keep_count, move_count, new_leaf, - 0); - - /* Store the new separator in a buffer. */ - bcopy(leaf->btl_elems + (keep_count - 1) * size, buf, - size); - - /* - * Shift the remaining elements in the front part to handle - * the new value. - */ - bt_shift_leaf_right(tree, leaf, idx, keep_count - 1 - idx); - bcopy(value, leaf->btl_elems + idx * size, size); + /* Insert into the existing leaf. */ + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); } else if (idx > keep_count) { - /* Store the new separator in a buffer. */ - start = leaf->btl_elems + (keep_count) * size; - bcopy(start, buf, size); - - /* Move the back part before idx to the new leaf. */ - bt_transfer_leaf(tree, leaf, keep_count + 1, - idx - keep_count - 1, new_leaf, 0); - - /* Add the new value to the new leaf. */ - uint8_t *out = new_leaf->btl_elems + (idx - keep_count - 1) * - size; - bcopy(value, out, size); - - /* Move the rest of the back part to the new leaf. */ - bt_transfer_leaf(tree, leaf, idx, capacity - idx, new_leaf, - idx - keep_count); + /* Insert into the new leaf. */ + zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count - + 1, value); } else { - /* The new value is the new separator. */ + /* + * Shift the elements in the new leaf to make room for the + * separator, and use the new value as the new separator. + */ + bt_shift_leaf_right(tree, new_leaf, 0, move_count); + bcopy(buf, new_leaf->btl_elems, size); bcopy(value, buf, size); - - /* Copy the back part to the new leaf. */ - bt_transfer_leaf(tree, leaf, keep_count, move_count, new_leaf, - 0); + new_hdr->bth_count++; } - zfs_btree_poison_node(tree, &leaf->btl_hdr); - /* * Now that the node is split, we need to insert the new node into its * parent. This may cause further splitting, bur only of core nodes. @@ -1348,7 +1314,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_hdr_t *rm_hdr) { size_t size = tree->bt_elem_size; - uint64_t min_count = BTREE_CORE_ELEMS / 2; + uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; zfs_btree_hdr_t *hdr = &node->btc_hdr; /* * If the node is the root node and rm_hdr is one of two children, @@ -1371,25 +1337,25 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, break; } ASSERT3U(idx, <=, hdr->bth_count); - hdr->bth_count--; /* * If the node is the root or it has more than the minimum number of * children, just remove the child and separator, and return. */ if (hdr->bth_parent == NULL || - hdr->bth_count >= min_count) { + hdr->bth_count > min_count) { /* * Shift the element and children to the right of rm_hdr to * the left by one spot. */ - bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, BSS_PARALLELOGRAM); + hdr->bth_count--; zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); return; } - ASSERT3U(hdr->bth_count, ==, min_count - 1); + ASSERT3U(hdr->bth_count, ==, min_count); /* * Now we try to take a node from a neighbor. We check left, then @@ -1440,7 +1406,6 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, (l_hdr->bth_count - 1) * size; bcopy(take_elem, separator, size); l_hdr->bth_count--; - hdr->bth_count++; zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); return; } @@ -1456,7 +1421,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * Shift elements in node left by one spot to overwrite rm_hdr * and the separator before it. 
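The "take from a neighbor" rebalance rotates one element through the parent separator. A sketch of the element rotation only (in core nodes the corresponding child pointer moves the same way; names are illustrative):

#include <stdint.h>
#include <string.h>

/*
 * Sketch: the parent separator drops into the underfull node as its new
 * first element, and the left neighbor's last element replaces the
 * separator in the parent.
 */
static void
borrow_from_left(uint8_t *node, uint64_t *node_cnt, uint8_t *left,
    uint64_t *left_cnt, uint8_t *separator, size_t size)
{
	/* Make room at the front of the underfull node. */
	memmove(node + size, node, *node_cnt * size);
	/* The parent separator becomes our new first element. */
	memcpy(node, separator, size);
	/* The left neighbor's last element becomes the new separator. */
	memcpy(separator, left + (*left_cnt - 1) * size, size);
	(*left_cnt)--;
	(*node_cnt)++;
}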
*/ - bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, BSS_PARALLELOGRAM); /* @@ -1464,22 +1429,22 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * element spot in node. */ uint8_t *separator = parent->btc_elems + parent_idx * size; - bcopy(separator, node->btc_elems + hdr->bth_count * size, size); + bcopy(separator, node->btc_elems + (hdr->bth_count - 1) * size, + size); /* * Move the first child of neighbor to the last child spot in * node. */ zfs_btree_hdr_t **take_child = neighbor->btc_children; - bcopy(take_child, node->btc_children + (hdr->bth_count + 1), + bcopy(take_child, node->btc_children + hdr->bth_count, sizeof (*take_child)); - node->btc_children[hdr->bth_count + 1]->bth_parent = node; + node->btc_children[hdr->bth_count]->bth_parent = node; /* Move the first element of neighbor to the separator spot. */ uint8_t *take_elem = neighbor->btc_elems; bcopy(take_elem, separator, size); r_hdr->bth_count--; - hdr->bth_count++; /* * Shift the elements and children of neighbor to cover the @@ -1496,96 +1461,71 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * need to merge with one of them. We prefer the left one, * arabitrarily. Move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right - * merging node's elements (skipping or overwriting idx, which we're - * deleting). Once that's done, go into the parent and delete the - * right merging node and the separator. This may cause further + * merging node's elements. Once that's done, we go back and delete + * the element we're removing. Finally, go into the parent and delete + * the right merging node and the separator. This may cause further * merging. */ - zfs_btree_hdr_t *new_rm_hdr; - + zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; + uint64_t new_idx = idx; if (l_hdr != NULL) { - ASSERT(l_hdr->bth_core); - zfs_btree_core_t *left = (zfs_btree_core_t *)l_hdr; - - if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < hdr->bth_count + 1; i++) { - zfs_btree_verify_poison_at(tree, l_hdr, - l_hdr->bth_count + i); - } - } - /* Move the separator into the left node. */ - uint8_t *e_out = left->btc_elems + l_hdr->bth_count * size; - uint8_t *separator = parent->btc_elems + (parent_idx - 1) * - size; - bcopy(separator, e_out, size); - - /* Move all our elements and children into the left node. */ - bt_transfer_core(tree, node, 0, idx - 1, left, - l_hdr->bth_count + 1, BSS_TRAPEZOID); - bt_transfer_core(tree, node, idx, hdr->bth_count - idx + 1, - left, l_hdr->bth_count + idx, BSS_PARALLELOGRAM); - - /* Reparent all our children to point to the left node. 
*/ - zfs_btree_hdr_t **new_start = left->btc_children + - l_hdr->bth_count + 1; - for (int i = 0; i < hdr->bth_count + 1; i++) - new_start[i]->bth_parent = left; - - /* Update bookkeeping */ - l_hdr->bth_count += hdr->bth_count + 1; - for (int i = 0; i <= l_hdr->bth_count; i++) { - ASSERT3P(left->btc_children[i]->bth_parent, ==, left); - ASSERT3P(left->btc_children[i], !=, rm_hdr); - } - ASSERT3U(l_hdr->bth_count, ==, BTREE_CORE_ELEMS); + keep_hdr = l_hdr; new_rm_hdr = hdr; + new_idx += keep_hdr->bth_count + 1; } else { ASSERT3P(r_hdr, !=, NULL); - ASSERT(r_hdr->bth_core); - zfs_btree_core_t *right = (zfs_btree_core_t *)r_hdr; + keep_hdr = hdr; + new_rm_hdr = r_hdr; + parent_idx++; + } - if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < r_hdr->bth_count; i++) { - zfs_btree_verify_poison_at(tree, hdr, - hdr->bth_count + i + 1); - } + ASSERT(keep_hdr->bth_core); + ASSERT(new_rm_hdr->bth_core); + + zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; + zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, keep_hdr, + keep_hdr->bth_count + i); } - /* - * Overwrite rm_hdr and its separator by moving node's - * elements and children forward. - */ - bt_shift_core_left(tree, node, idx, hdr->bth_count - idx + 1, - BSS_PARALLELOGRAM); + } - /* - * Move the separator to the first open spot in node's - * elements. - */ - uint8_t *e_out = node->btc_elems + (hdr->bth_count - idx) * - size; - uint8_t *separator = parent->btc_elems + parent_idx * size; - bcopy(separator, e_out, size); + /* Move the separator into the left node. */ + uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bcopy(separator, e_out, size); + keep_hdr->bth_count++; - /* Move the right node's elements and children to node. */ - bt_transfer_core(tree, right, 0, r_hdr->bth_count, node, - hdr->bth_count - idx + 1, BSS_TRAPEZOID); + /* Move all our elements and children into the left node. */ + bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, + keep_hdr->bth_count, BSS_TRAPEZOID); - /* Reparent the right node's children to point to node. */ - for (int i = hdr->bth_count - idx; i < r_hdr->bth_count + 1; - i++) { - node->btc_children[i]->bth_parent = node; - } + /* Reparent all our children to point to the left node. */ + zfs_btree_hdr_t **new_start = keep->btc_children + + keep_hdr->bth_count; + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + new_start[i]->bth_parent = keep; - /* Update bookkeeping. */ - hdr->bth_count += r_hdr->bth_count + 1; - for (int i = 0; i <= hdr->bth_count; i++) { - ASSERT3P(node->btc_children[i]->bth_parent, ==, node); - ASSERT3P(node->btc_children[i], !=, rm_hdr); - } + /* Update bookkeeping */ + keep_hdr->bth_count += new_rm_hdr->bth_count; + ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); - ASSERT3U(hdr->bth_count, ==, BTREE_CORE_ELEMS); - new_rm_hdr = r_hdr; + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. 
+ */ + ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); + bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, + BSS_PARALLELOGRAM); + keep_hdr->bth_count--; + for (int i = 0; i <= keep_hdr->bth_count; i++) { + ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); + ASSERT3P(keep->btc_children[i], !=, rm_hdr); } + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); new_rm_hdr->bth_count = 0; zfs_btree_node_destroy(tree, new_rm_hdr); @@ -1649,15 +1589,15 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) ASSERT(!hdr->bth_core); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; ASSERT3U(hdr->bth_count, >, 0); - hdr->bth_count--; - uint64_t min_count = capacity / 2; + uint64_t min_count = (capacity / 2) - 1; /* * If we're over the minimum size or this is the root, just overwrite * the value and return. */ - if (hdr->bth_count >= min_count || hdr->bth_parent == NULL) { + if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { + hdr->bth_count--; bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); @@ -1672,7 +1612,7 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) zfs_btree_verify(tree); return; } - ASSERT3U(hdr->bth_count, ==, min_count - 1); + ASSERT3U(hdr->bth_count, ==, min_count); /* * Now we try to take a node from a sibling. We check left, then @@ -1713,7 +1653,6 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) /* Update the bookkeeping. */ l_hdr->bth_count--; - hdr->bth_count++; zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); zfs_btree_verify(tree); @@ -1732,19 +1671,20 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) * by one spot to make room for the stolen element and * overwrite the element being removed. */ - bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); + bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx - + 1); uint8_t *separator = parent->btc_elems + parent_idx * size; uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ - bcopy(separator, leaf->btl_elems + hdr->bth_count * size, size); + bcopy(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, + size); /* Move our neighbor's first element to the separator. */ bcopy(take_elem, separator, size); /* Update the bookkeeping. */ r_hdr->bth_count--; - hdr->bth_count++; /* * Move our neighbors elements forwards to overwrite the @@ -1761,73 +1701,62 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) * need to merge with one of them. We prefer the left one, * arabitrarily. Move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right - * merging node's elements (skipping or overwriting idx, which we're - * deleting). Once that's done, go into the parent and delete the - * right merging node and the separator. This may cause further + * merging node's elements. Once that's done, we go back and delete + * the element we're removing. Finally, go into the parent and delete + * the right merging node and the separator. This may cause further * merging. 
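The merge path can be summarized on plain arrays: the parent separator moves down into the surviving node, the emptied node's elements are appended, and only then is the element being removed shifted out. A sketch with illustrative names (child pointers and the parent-side cleanup are omitted):

#include <stdint.h>
#include <string.h>

static void
merge_then_delete(uint8_t *keep, uint64_t *keep_cnt, const uint8_t *rm,
    uint64_t rm_cnt, const uint8_t *separator, size_t size, uint64_t del_idx)
{
	/* Pull the parent separator down as the next element. */
	memcpy(keep + *keep_cnt * size, separator, size);
	(*keep_cnt)++;

	/* Append everything from the node being emptied. */
	memcpy(keep + *keep_cnt * size, rm, rm_cnt * size);
	*keep_cnt += rm_cnt;

	/* Finally shift out the element we are deleting. */
	uint8_t *at = keep + del_idx * size;
	memmove(at, at + size, (*keep_cnt - del_idx - 1) * size);
	(*keep_cnt)--;
}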
*/ - zfs_btree_hdr_t *rm_hdr; - + zfs_btree_hdr_t *rm_hdr, *keep_hdr; + uint64_t new_idx = idx; if (l_hdr != NULL) { - ASSERT(!l_hdr->bth_core); - zfs_btree_leaf_t *left = (zfs_btree_leaf_t *)l_hdr; - - if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < hdr->bth_count + 1; i++) { - zfs_btree_verify_poison_at(tree, l_hdr, - l_hdr->bth_count + i); - } - } - /* - * Move the separator into the first open spot in the left - * neighbor. - */ - uint8_t *out = left->btl_elems + l_hdr->bth_count * size; - uint8_t *separator = parent->btc_elems + (parent_idx - 1) * - size; - bcopy(separator, out, size); - - /* Move our elements to the left neighbor. */ - bt_transfer_leaf(tree, leaf, 0, idx, left, l_hdr->bth_count + - 1); - bt_transfer_leaf(tree, leaf, idx + 1, hdr->bth_count - idx, - left, l_hdr->bth_count + 1 + idx); - - /* Update the bookkeeping. */ - l_hdr->bth_count += hdr->bth_count + 1; - ASSERT3U(l_hdr->bth_count, ==, min_count * 2); + keep_hdr = l_hdr; rm_hdr = hdr; + new_idx += keep_hdr->bth_count + 1; // 449 } else { ASSERT3P(r_hdr, !=, NULL); - ASSERT(!r_hdr->bth_core); - if (zfs_btree_verify_intensity >= 5) { - for (int i = 0; i < r_hdr->bth_count; i++) { - zfs_btree_verify_poison_at(tree, hdr, - hdr->bth_count + i + 1); - } - } - zfs_btree_leaf_t *right = (zfs_btree_leaf_t *)r_hdr; + keep_hdr = hdr; + rm_hdr = r_hdr; + parent_idx++; + } - /* - * Move our elements left to overwrite the element being - * removed. - */ - bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx); + ASSERT(!keep_hdr->bth_core); + ASSERT(!rm_hdr->bth_core); + ASSERT3U(keep_hdr->bth_count, ==, min_count); + ASSERT3U(rm_hdr->bth_count, ==, min_count); - /* Move the separator to node's first open spot. */ - uint8_t *out = leaf->btl_elems + hdr->bth_count * size; - uint8_t *separator = parent->btc_elems + parent_idx * size; - bcopy(separator, out, size); - - /* Move the right neighbor's elements to node. */ - bt_transfer_leaf(tree, right, 0, r_hdr->bth_count, leaf, - hdr->bth_count + 1); + zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr; + zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; - /* Update the bookkeeping. */ - hdr->bth_count += r_hdr->bth_count + 1; - ASSERT3U(hdr->bth_count, ==, min_count * 2); - rm_hdr = r_hdr; + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, keep_hdr, + keep_hdr->bth_count + i); + } } + /* + * Move the separator into the first open spot in the left + * neighbor. + */ + uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bcopy(separator, out, size); + keep_hdr->bth_count++; + + /* Move our elements to the left neighbor. */ + bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, + keep_hdr->bth_count); + + /* Update the bookkeeping. */ + keep_hdr->bth_count += rm_hdr->bth_count; + ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1); + + /* Remove the value from the node */ + keep_hdr->bth_count--; + bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count - + new_idx); + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + rm_hdr->bth_count = 0; zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. 
*/ @@ -1964,7 +1893,7 @@ zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); - VERIFY3U(hdr->bth_count, >=, capacity / 2); + VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); } return (hdr->bth_count); @@ -1973,7 +1902,7 @@ zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = hdr->bth_count; if (tree->bt_root != hdr && tree->bt_bulk == NULL) - VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2); + VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); for (int i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_counts_helper(tree, node->btc_children[i]); @@ -2077,7 +2006,7 @@ zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) if (tree->bt_compar(node->btc_elems + i * size, left_child_last) != 1) { panic("btree: compar returned %d (expected 1) at " - "%p %d: compar(%p, %p)", tree->bt_compar( + "%px %d: compar(%px, %px)", tree->bt_compar( node->btc_elems + i * size, left_child_last), (void *)node, i, (void *)(node->btc_elems + i * size), (void *)left_child_last); @@ -2097,7 +2026,7 @@ zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) if (tree->bt_compar(node->btc_elems + i * size, right_child_first) != -1) { panic("btree: compar returned %d (expected -1) at " - "%p %d: compar(%p, %p)", tree->bt_compar( + "%px %d: compar(%px, %px)", tree->bt_compar( node->btc_elems + i * size, right_child_first), (void *)node, i, (void *)(node->btc_elems + i * size), (void *)right_child_first); From b2f3759454a46d95d202906ed8decb3ac115ddd1 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 23 Sep 2019 12:50:26 -0700 Subject: [PATCH 16/18] Fix dsl_scan, bulk_finish issues Signed-off-by: Paul Dagnelie --- module/zfs/btree.c | 16 +++++++++------- module/zfs/dsl_scan.c | 11 +++++++---- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index cb4d669ac70d..f1ba25ffbd5a 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -918,7 +918,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); ASSERT3U(parent_idx, >, 0); zfs_btree_core_t *l_neighbor = - (zfs_btree_core_t *)parent->btc_children[ parent_idx - 1]; + (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; uint64_t move_count = (capacity / 2) - hdr->bth_count; ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, capacity / 2); @@ -926,7 +926,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) if (zfs_btree_verify_intensity >= 5) { for (int i = 0; i < move_count; i++) { zfs_btree_verify_poison_at(tree, hdr, - leaf->btl_hdr.bth_count + i); + hdr->bth_count + i); } } /* First, shift things in the right node back. */ @@ -1503,11 +1503,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, keep_hdr->bth_count, BSS_TRAPEZOID); - /* Reparent all our children to point to the left node. 
*/ - zfs_btree_hdr_t **new_start = keep->btc_children + - keep_hdr->bth_count; - for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) - new_start[i]->bth_parent = keep; + uint64_t old_count = keep_hdr->bth_count; /* Update bookkeeping */ keep_hdr->bth_count += new_rm_hdr->bth_count; @@ -1521,6 +1517,12 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, BSS_PARALLELOGRAM); keep_hdr->bth_count--; + + /* Reparent all our children to point to the left node. */ + zfs_btree_hdr_t **new_start = keep->btc_children + + old_count - 1; + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + new_start[i]->bth_parent = keep; for (int i = 0; i <= keep_hdr->bth_count; i++) { ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); ASSERT3P(keep->btc_children[i], !=, rm_hdr); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 75e19b88e509..3eacb42f15ab 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2927,8 +2927,10 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) } else if (zfs_scan_issue_strategy == 2) { range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, NULL); - range_seg_t *addr_rs = range_tree_find(rt, - rs_get_start(rt, size_rs), rs_get_end(rt, size_rs)); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, + size); ASSERT3P(addr_rs, !=, NULL); return (addr_rs); } @@ -2948,8 +2950,9 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) } else if (scn->scn_clearing) { range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, NULL); - range_seg_t *addr_rs = range_tree_find(rt, - rs_get_start(rt, size_rs), rs_get_end(rt, size_rs)); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, size); ASSERT3P(addr_rs, !=, NULL); return (addr_rs); } else { From ade29366cbef5f2afdd75cf00ee1ea2456c34b9e Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 30 Sep 2019 10:35:20 -0700 Subject: [PATCH 17/18] Reduce verify intensity on debug builds Signed-off-by: Paul Dagnelie --- module/zfs/btree.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index f1ba25ffbd5a..8f3c3f3e7268 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -35,7 +35,7 @@ kmem_cache_t *zfs_btree_leaf_cache; * Intensity 3: Verify that the total number of elements in the tree matches the * sum of the number of elements in each node. Also verifies that each node's * count obeys the invariants (less than or equal to maximum value, greater than - * or equal to half the maximum). + * or equal to half the maximum minus one). * Intensity 4: Verify that each element compares less than the element * immediately after it and greater than the one immediately before it using the * comparator function. For core nodes, also checks that each element is greater @@ -53,11 +53,7 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). 
*/ -#ifdef ZFS_DEBUG -int zfs_btree_verify_intensity = 5; -#else int zfs_btree_verify_intensity = 0; -#endif #ifdef _ILP32 #define BTREE_POISON 0xabadb10c From 3ea9b78dec4df2f5033e9220785ebafe493ba81f Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 8 Oct 2019 16:15:25 -0700 Subject: [PATCH 18/18] Use memmove wrapper instead of bcopy for platform compatibillity Signed-off-by: Paul Dagnelie --- module/zfs/btree.c | 82 ++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 8f3c3f3e7268..8f514caabd9d 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -55,6 +55,16 @@ kmem_cache_t *zfs_btree_leaf_cache; */ int zfs_btree_verify_intensity = 0; +/* + * A convenience function to silence warnings from memmove's return value and + * change argument order to src, dest. + */ +void +bmov(const void *src, void *dest, size_t size) +{ + (void) memmove(dest, src, size); +} + #ifdef _ILP32 #define BTREE_POISON 0xabadb10c #else @@ -367,14 +377,14 @@ bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx, int sign = (dir == BSD_LEFT ? -1 : +1); uint8_t *e_out = e_start + sign * off * size; uint64_t e_count = count; - bcopy(e_start, e_out, e_count * size); + bmov(e_start, e_out, e_count * size); zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); - bcopy(c_start, c_out, c_count * sizeof (*c_start)); + bmov(c_start, c_out, c_count * sizeof (*c_start)); } /* @@ -416,7 +426,7 @@ bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx, uint8_t *start = node->btl_elems + idx * size; int sign = (dir == BSD_LEFT ? -1 : +1); uint8_t *out = start + sign * off * size; - bcopy(start, out, count * size); + bmov(start, out, count * size); } static inline void @@ -446,11 +456,11 @@ bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx, ASSERT(source->btc_hdr.bth_core); ASSERT(dest->btc_hdr.bth_core); - bcopy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, + bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size, count * size); uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); - bcopy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), + bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 
0 : 1), c_count * sizeof (*source->btc_children)); } @@ -463,7 +473,7 @@ bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, ASSERT(!source->btl_hdr.bth_core); ASSERT(!dest->btl_hdr.bth_core); - bcopy(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, count * size); } @@ -511,7 +521,7 @@ zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, /* Insert new values */ parent->btc_children[offset + 1] = new_node; - bcopy(buf, parent->btc_elems + offset * size, size); + bmov(buf, parent->btc_elems + offset * size, size); par_hdr->bth_count++; } @@ -546,7 +556,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, old_node->bth_parent = new_node->bth_parent = new_root; new_root->btc_children[0] = old_node; new_root->btc_children[1] = new_node; - bcopy(buf, new_root->btc_elems, size); + bmov(buf, new_root->btc_elems, size); tree->bt_height++; tree->bt_root = new_root_hdr; @@ -614,7 +624,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* Store the new separator in a buffer. */ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); - bcopy(parent->btc_elems + keep_count * size, tmp_buf, + bmov(parent->btc_elems + keep_count * size, tmp_buf, size); zfs_btree_poison_node(tree, par_hdr); @@ -626,7 +636,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bcopy(tmp_buf, buf, size); + bmov(tmp_buf, buf, size); } else if (offset > keep_count) { /* Insert the new node into the right half */ new_node->bth_parent = new_parent; @@ -636,7 +646,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, /* * Move the new separator to the existing buffer. */ - bcopy(tmp_buf, buf, size); + bmov(tmp_buf, buf, size); } else { /* * Move the new separator into the right half, and replace it @@ -646,7 +656,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, bt_shift_core_right(tree, new_parent, 0, move_count, BSS_TRAPEZOID); new_parent->btc_children[0] = new_node; - bcopy(tmp_buf, new_parent->btc_elems, size); + bmov(tmp_buf, new_parent->btc_elems, size); new_par_hdr->bth_count++; } kmem_free(tmp_buf, size); @@ -685,7 +695,7 @@ zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, } bt_shift_leaf_right(tree, leaf, idx, count); - bcopy(value, start, size); + bmov(value, start, size); hdr->bth_count++; } @@ -745,7 +755,7 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, /* We store the new separator in a buffer we control for simplicity. */ uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bcopy(leaf->btl_elems + (keep_count * size), buf, size); + bmov(leaf->btl_elems + (keep_count * size), buf, size); zfs_btree_poison_node(tree, &leaf->btl_hdr); if (idx < keep_count) { @@ -761,8 +771,8 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, * separator, and use the new value as the new separator. */ bt_shift_leaf_right(tree, new_leaf, 0, move_count); - bcopy(buf, new_leaf->btl_elems, size); - bcopy(value, buf, size); + bmov(buf, new_leaf->btl_elems, size); + bmov(value, buf, size); new_hdr->bth_count++; } @@ -864,7 +874,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) /* Next, move the separator from the common ancestor to leaf. 
*/ uint8_t *separator = common->btc_elems + (common_idx * size); uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); - bcopy(separator, out, size); + bmov(separator, out, size); move_count--; /* @@ -878,7 +888,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * Finally, move the new last element in the left neighbor to * the separator. */ - bcopy(l_neighbor->btl_elems + (l_hdr->bth_count - + bmov(l_neighbor->btl_elems + (l_hdr->bth_count - move_count - 1) * size, separator, size); /* Adjust the node's counts, and we're done. */ @@ -933,7 +943,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * size); uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); - bcopy(separator, e_out, size); + bmov(separator, e_out, size); /* * Now, move elements and children from the left node to the @@ -949,7 +959,7 @@ zfs_btree_bulk_finish(zfs_btree_t *tree) * separator's position. */ move_idx--; - bcopy(l_neighbor->btc_elems + move_idx * size, separator, size); + bmov(l_neighbor->btc_elems + move_idx * size, separator, size); l_neighbor->btc_hdr.bth_count -= move_count + 1; hdr->bth_count += move_count + 1; @@ -1036,8 +1046,8 @@ zfs_btree_insert(zfs_btree_t *tree, const void *value, zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; size_t size = tree->bt_elem_size; uint8_t *buf = kmem_alloc(size, KM_SLEEP); - bcopy(node->btc_elems + off * size, buf, size); - bcopy(value, node->btc_elems + off * size, size); + bmov(node->btc_elems + off * size, buf, size); + bmov(value, node->btc_elems + off * size, size); /* * Find the first slot in the subtree to the right, insert @@ -1389,18 +1399,18 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bcopy(separator, node->btc_elems, size); + bmov(separator, node->btc_elems, size); /* Move the last child of neighbor to our first child slot. */ zfs_btree_hdr_t **take_child = neighbor->btc_children + l_hdr->bth_count; - bcopy(take_child, node->btc_children, sizeof (*take_child)); + bmov(take_child, node->btc_children, sizeof (*take_child)); node->btc_children[0]->bth_parent = node; /* Move the last element of neighbor to the separator spot. */ uint8_t *take_elem = neighbor->btc_elems + (l_hdr->bth_count - 1) * size; - bcopy(take_elem, separator, size); + bmov(take_elem, separator, size); l_hdr->bth_count--; zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); return; @@ -1425,7 +1435,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * element spot in node. */ uint8_t *separator = parent->btc_elems + parent_idx * size; - bcopy(separator, node->btc_elems + (hdr->bth_count - 1) * size, + bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, size); /* @@ -1433,13 +1443,13 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, * node. */ zfs_btree_hdr_t **take_child = neighbor->btc_children; - bcopy(take_child, node->btc_children + hdr->bth_count, + bmov(take_child, node->btc_children + hdr->bth_count, sizeof (*take_child)); node->btc_children[hdr->bth_count]->bth_parent = node; /* Move the first element of neighbor to the separator spot. 
*/ uint8_t *take_elem = neighbor->btc_elems; - bcopy(take_elem, separator, size); + bmov(take_elem, separator, size); r_hdr->bth_count--; /* @@ -1492,7 +1502,7 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bcopy(separator, e_out, size); + bmov(separator, e_out, size); keep_hdr->bth_count++; /* Move all our elements and children into the left node. */ @@ -1550,7 +1560,7 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) */ uint8_t *value = zfs_btree_get(tree, where); uint8_t *tmp = kmem_alloc(size, KM_SLEEP); - bcopy(value, tmp, size); + bmov(value, tmp, size); zfs_btree_bulk_finish(tree); VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); kmem_free(tmp, size); @@ -1572,7 +1582,7 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) where); ASSERT3P(new_value, !=, NULL); - bcopy(new_value, node->btc_elems + idx * size, size); + bmov(new_value, node->btc_elems + idx * size, size); hdr = where->bti_node; idx = where->bti_offset; @@ -1644,10 +1654,10 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems + (l_hdr->bth_count - 1) * size; /* Move the separator to our first spot. */ - bcopy(separator, leaf->btl_elems, size); + bmov(separator, leaf->btl_elems, size); /* Move our neighbor's last element to the separator. */ - bcopy(take_elem, separator, size); + bmov(take_elem, separator, size); /* Update the bookkeeping. */ l_hdr->bth_count--; @@ -1675,11 +1685,11 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) uint8_t *separator = parent->btc_elems + parent_idx * size; uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems; /* Move the separator between us to our last spot. */ - bcopy(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, + bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size, size); /* Move our neighbor's first element to the separator. */ - bcopy(take_elem, separator, size); + bmov(take_elem, separator, size); /* Update the bookkeeping. */ r_hdr->bth_count--; @@ -1738,7 +1748,7 @@ zfs_btree_remove_from(zfs_btree_t *tree, zfs_btree_index_t *where) uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; - bcopy(separator, out, size); + bmov(separator, out, size); keep_hdr->bth_count++; /* Move our elements to the left neighbor. */
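The bmov() wrapper introduced by this patch keeps bcopy's (src, dest) argument order, so the conversion is mechanical, and building it on memmove keeps the many overlapping in-node shifts well defined on platforms without bcopy. A small usage sketch (shift_right_one and its arguments are illustrative):

#include <stdint.h>
#include <string.h>

/* The wrapper from the patch: bcopy argument order, memmove semantics. */
static inline void
bmov(const void *src, void *dest, size_t size)
{
	(void) memmove(dest, src, size);
}

/* e.g. shifting a packed element array right by one slot, in place. */
static void
shift_right_one(uint8_t *elems, size_t elem_size, uint64_t idx, uint64_t count)
{
	uint8_t *start = elems + idx * elem_size;

	bmov(start, start + elem_size, count * elem_size);
}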