From 54eec0fa5996ede551b5333033db2408f3d0d06e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 8 Jan 2025 10:43:01 +1100 Subject: [PATCH 01/44] ZTS: remove empty zpool_add--allow-ashift-mismatch test Added in b1e46f869, but empty, so no point keeping it around. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16931 --- tests/runfiles/common.run | 3 +-- tests/zfs-tests/tests/Makefile.am | 1 - .../cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh | 0 3 files changed, 1 insertion(+), 3 deletions(-) delete mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 1d6f6d85200f..688ee1645656 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -387,8 +387,7 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', - 'zpool_add--allow-ashift-mismatch'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bde33843098f..add549cb6d72 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -998,7 +998,6 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ - functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh deleted file mode 100755 index e69de29bb2d1..000000000000 From 675b49d2a1105b22e48b8143563c93c6755326d6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 11 Jan 2025 04:26:42 -0500 Subject: [PATCH 02/44] FreeBSD: Use ashift in vdev_check_boot_reserve() We should not hardcode 512-byte read size when checking for loader in the boot area before RAIDZ expansion. Disk might be unable to handle that I/O as is, and the code zio_vdev_io_start() handling the padding asserts doing it only for top-level vdev. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16942 --- module/os/freebsd/zfs/vdev_label_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c index 79732d9173e8..f1843807fd5d 100644 --- a/module/os/freebsd/zfs/vdev_label_os.c +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -96,7 +96,7 @@ vdev_check_boot_reserve(spa_t *spa, vdev_t *childvd) { ASSERT(childvd->vdev_ops->vdev_op_leaf); - size_t size = SPA_MINBLOCKSIZE; + size_t size = 1ULL << childvd->vdev_top->vdev_ashift; abd_t *abd = abd_alloc_linear(size, B_FALSE); zio_t *pio = zio_root(spa, NULL, NULL, 0); From fabdd502f4f04e27d057aedc7fb7697e7bd95b74 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 14 Jan 2025 00:33:31 +1100 Subject: [PATCH 03/44] zinject: count matches and injections for each handler When building tests with zinject, it can be quite difficult to work out if you're producing the right kind of IO to match the rules you've set up. So, here we extend injection records to count the number of times a handler matched the operation, and how often an error was actually injected (ie after frequency and other exclusions are applied). Then, display those counts in the `zinject` output. Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #16938 --- cmd/zinject/zinject.c | 66 ++++---- include/sys/zfs_ioctl.h | 4 +- module/zfs/zio_inject.c | 60 ++++++-- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zinject/zinject_counts.ksh | 142 ++++++++++++++++++ 6 files changed, 236 insertions(+), 39 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zinject/zinject_counts.ksh diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index ed60cce3dd16..6c856763c958 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2023-2024, Klara Inc. + * Copyright (c) 2023-2025, Klara, Inc. 
*/ /* @@ -404,27 +404,30 @@ print_data_handler(int id, const char *pool, zinject_record_t *record, if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " - "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", - "LVL", "DVAs", "RANGE"); + "%-15s %-6s %-15s\n", "ID", "POOL", "OBJSET", "OBJECT", + "TYPE", "LVL", "DVAs", "RANGE", "MATCH", "INJECT"); (void) printf("--- --------------- ------ " - "------ -------- --- ---- ---------------\n"); + "------ -------- --- ---- --------------- " + "------ ------\n"); } *count += 1; - (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", - id, pool, (u_longlong_t)record->zi_objset, - (u_longlong_t)record->zi_object, type_to_name(record->zi_type), - record->zi_level, record->zi_dvas); - - - if (record->zi_start == 0 && - record->zi_end == -1ULL) - (void) printf("all\n"); + char rangebuf[32]; + if (record->zi_start == 0 && record->zi_end == -1ULL) + snprintf(rangebuf, sizeof (rangebuf), "all"); else - (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, + snprintf(rangebuf, sizeof (rangebuf), "[%llu, %llu]", + (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); + + (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x %-15s " + "%6lu %6lu\n", id, pool, (u_longlong_t)record->zi_objset, + (u_longlong_t)record->zi_object, type_to_name(record->zi_type), + record->zi_level, record->zi_dvas, rangebuf, + record->zi_match_count, record->zi_inject_count); + return (0); } @@ -445,11 +448,14 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, return (0); if (*count == 0) { - (void) printf("%3s %-15s %-16s %-5s %-10s %-9s\n", - "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ"); + (void) printf("%3s %-15s %-16s %-5s %-10s %-9s " + "%-6s %-6s\n", + "ID", "POOL", "GUID", "TYPE", "ERROR", "FREQ", + "MATCH", "INJECT"); (void) printf( "--- --------------- ---------------- " - "----- ---------- ---------\n"); + "----- ---------- --------- " + "------ ------\n"); } *count += 1; @@ -457,9 +463,10 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, double freq = record->zi_freq == 0 ? 100.0f : (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f; - (void) printf("%3d %-15s %llx %-5s %-10s %8.4f%%\n", id, pool, - (u_longlong_t)record->zi_guid, iotypestr[record->zi_iotype], - err_to_str(record->zi_error), freq); + (void) printf("%3d %-15s %llx %-5s %-10s %8.4f%% " + "%6lu %6lu\n", id, pool, (u_longlong_t)record->zi_guid, + iotypestr[record->zi_iotype], err_to_str(record->zi_error), + freq, record->zi_match_count, record->zi_inject_count); return (0); } @@ -477,18 +484,25 @@ print_delay_handler(int id, const char *pool, zinject_record_t *record, return (0); if (*count == 0) { - (void) printf("%3s %-15s %-15s %-15s %s\n", - "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); - (void) printf("--- --------------- --------------- " - "--------------- ----------------\n"); + (void) printf("%3s %-15s %-16s %-10s %-5s %-9s " + "%-6s %-6s\n", + "ID", "POOL", "GUID", "DELAY (ms)", "LANES", "FREQ", + "MATCH", "INJECT"); + (void) printf("--- --------------- ---------------- " + "---------- ----- --------- " + "------ ------\n"); } *count += 1; - (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, + double freq = record->zi_freq == 0 ? 
100.0f : + (((double)record->zi_freq) / ZI_PERCENTAGE_MAX) * 100.0f; + + (void) printf("%3d %-15s %llx %10llu %5llu %8.4f%% " + "%6lu %6lu\n", id, pool, (u_longlong_t)record->zi_guid, (u_longlong_t)NSEC2MSEC(record->zi_timer), (u_longlong_t)record->zi_nlanes, - (u_longlong_t)record->zi_guid); + freq, record->zi_match_count, record->zi_inject_count); return (0); } diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index aa20e52a7634..e61d7644764e 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -23,7 +23,7 @@ * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024-2025, Klara, Inc. */ #ifndef _SYS_ZFS_IOCTL_H @@ -421,6 +421,8 @@ typedef struct zinject_record { uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_dvas; + uint64_t zi_match_count; /* count of times matched */ + uint64_t zi_inject_count; /* count of times injected */ } zinject_record_t; #define ZINJECT_NULL 0x1 diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 012a0e3c6c17..f972522b6454 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2024, Klara Inc. + * Copyright (c) 2024-2025, Klara, Inc. */ /* @@ -129,6 +129,9 @@ static boolean_t zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { + boolean_t matched = B_FALSE; + boolean_t injected = B_FALSE; + /* * Check for a match against the MOS, which is based on type */ @@ -137,9 +140,8 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) - return (freq_triggered(record->zi_freq)); - else - return (B_FALSE); + matched = B_TRUE; + goto done; } /* @@ -153,10 +155,20 @@ zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, (record->zi_dvas == 0 || (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) && error == record->zi_error) { - return (freq_triggered(record->zi_freq)); + matched = B_TRUE; + goto done; } - return (B_FALSE); +done: + if (matched) { + record->zi_match_count++; + injected = freq_triggered(record->zi_freq); + } + + if (injected) + record->zi_inject_count++; + + return (injected); } /* @@ -177,8 +189,11 @@ zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type) continue; if (handler->zi_record.zi_type == type && - strcmp(tag, handler->zi_record.zi_func) == 0) + strcmp(tag, handler->zi_record.zi_func) == 0) { + handler->zi_record.zi_match_count++; + handler->zi_record.zi_inject_count++; panic("Panic requested in function %s\n", tag); + } } rw_exit(&inject_lock); @@ -336,6 +351,8 @@ zio_handle_label_injection(zio_t *zio, int error) if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && (offset >= start && offset <= end)) { + handler->zi_record.zi_match_count++; + handler->zi_record.zi_inject_count++; ret = error; break; } @@ -400,12 +417,16 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) if (handler->zi_record.zi_error == err1 || handler->zi_record.zi_error == err2) { + handler->zi_record.zi_match_count++; + /* * limit error injection if requested */ if (!freq_triggered(handler->zi_record.zi_freq)) continue; + 
handler->zi_record.zi_inject_count++; + /* * For a failed open, pretend like the device * has gone away. @@ -441,6 +462,8 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) break; } if (handler->zi_record.zi_error == ENXIO) { + handler->zi_record.zi_match_count++; + handler->zi_record.zi_inject_count++; ret = SET_ERROR(EIO); break; } @@ -483,6 +506,8 @@ zio_handle_ignored_writes(zio_t *zio) handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; + handler->zi_record.zi_match_count++; + /* * Positive duration implies # of seconds, negative * a number of txgs @@ -495,8 +520,10 @@ zio_handle_ignored_writes(zio_t *zio) } /* Have a "problem" writing 60% of the time */ - if (random_in_range(100) < 60) + if (random_in_range(100) < 60) { + handler->zi_record.zi_inject_count++; zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } break; } @@ -520,6 +547,9 @@ spa_handle_ignored_writes(spa_t *spa) handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; + handler->zi_record.zi_match_count++; + handler->zi_record.zi_inject_count++; + if (handler->zi_record.zi_duration > 0) { VERIFY(handler->zi_record.zi_timer == 0 || ddi_time_after64( @@ -601,9 +631,6 @@ zio_handle_io_delay(zio_t *zio) if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; - if (!freq_triggered(handler->zi_record.zi_freq)) - continue; - if (vd->vdev_guid != handler->zi_record.zi_guid) continue; @@ -628,6 +655,12 @@ zio_handle_io_delay(zio_t *zio) ASSERT3U(handler->zi_record.zi_nlanes, >, handler->zi_next_lane); + handler->zi_record.zi_match_count++; + + /* Limit the use of this handler if requested */ + if (!freq_triggered(handler->zi_record.zi_freq)) + continue; + /* * We want to issue this IO to the lane that will become * idle the soonest, so we compare the soonest this @@ -699,6 +732,9 @@ zio_handle_io_delay(zio_t *zio) */ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % min_handler->zi_record.zi_nlanes; + + min_handler->zi_record.zi_inject_count++; + } mutex_exit(&inject_delay_mtx); @@ -721,9 +757,11 @@ zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) handler = list_next(&inject_handlers, handler)) { ASSERT3P(handler->zi_spa_name, !=, NULL); if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { + handler->zi_record.zi_match_count++; uint64_t pause = SEC2NSEC(handler->zi_record.zi_duration); if (pause > elapsed) { + handler->zi_record.zi_inject_count++; delay = pause - elapsed; } id = handler->zi_id; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 688ee1645656..c3e681727cb3 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -159,7 +159,7 @@ tests = ['json_sanity'] tags = ['functional', 'cli_root', 'json'] [tests/functional/cli_root/zinject] -tests = ['zinject_args'] +tests = ['zinject_args', 'zinject_counts'] pre = post = tags = ['functional', 'cli_root', 'zinject'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index add549cb6d72..520a2396d9a5 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -615,6 +615,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/json/setup.ksh \ functional/cli_root/json/json_sanity.ksh \ functional/cli_root/zinject/zinject_args.ksh \ + functional/cli_root/zinject/zinject_counts.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_counts.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_counts.ksh new file mode 100755 index 000000000000..19b223aba46c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_counts.ksh @@ -0,0 +1,142 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# +# This test sets various injections, does some IO to trigger them. and then +# checks the "match" and "inject" counters on the injection records to ensure +# that they're being counted properly. +# +# Note that this is a test of the counters, not injection generally. We're +# usually only looking for the counters moving at all, not caring too much +# about their actual values. + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_assert "Check zinject counts are displayed and advanced as expected." + +DISK1=${DISKS%% *} + +function cleanup +{ + zinject -c all + default_cleanup_noexit +} + +log_onexit cleanup + +default_mirror_setup_noexit $DISKS + +# Call zinject, get the match and inject counts, and make sure they look +# plausible for the requested frequency. +function check_count_freq +{ + typeset -i freq=$1 + + # assuming a single rule, with the match and inject counts in the + # last two columns + typeset rule=$(zinject | grep -m 1 -oE '^ *[0-9].*[0-9]$') + + log_note "check_count_freq: using rule: $rule" + + typeset -a record=($(echo $rule | grep -oE ' [0-9]+ +[0-9]+$')) + typeset -i match=${record[0]} + typeset -i inject=${record[1]} + + log_note "check_count_freq: freq=$freq match=$match inject=$inject" + + # equality check, for 100% frequency, or if we've never matched the rule + if [[ $match -eq 0 || $freq -eq 100 ]] ; then + return [[ $match -eq 0 $inject ]] + fi + + # Compute the expected injection count, and compare. Because we're + # not testing the fine details here, it's considered good-enough for + # the injection account to be within +/- 10% of the expected count. 
+ typeset -i expect=$(($match * $freq / 100)) + typeset -i diff=$((($expect - $inject) / 10)) + return [[ $diff -ge -1 && $diff -le 1 ]] +} + +# Test device IO injections by injecting write errors, doing some writes, +# and making sure the count moved +function test_device_injection +{ + for freq in 100 50 ; do + log_must zinject -d $DISK1 -e io -T write -f $freq $TESTPOOL + + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=1 + log_must zpool sync + + log_must check_count_freq $freq + + log_must zinject -c all + done +} + +# Test object injections by writing a file, injecting checksum errors and +# trying to read it back +function test_object_injection +{ + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=1 + zpool sync + + for freq in 100 50 ; do + log_must zinject -t data -e checksum -f $freq /$TESTPOOL/file + + cat /tank/file > /dev/null || true + + log_must check_count_freq $freq + + log_must zinject -c all + done +} + +# Test delay injections, by injecting delays and writing +function test_delay_injection +{ + for freq in 100 50 ; do + log_must zinject -d $DISK1 -D 50:1 -f $freq $TESTPOOL + + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=1 + zpool sync + + log_must check_count_freq $freq + + log_must zinject -c all + done +} + +# Disable cache, to ensure reads induce IO +log_must zfs set primarycache=none $TESTPOOL + +# Test 'em all. +log_must test_device_injection +log_must test_object_injection +log_must test_delay_injection + +log_pass "zinject counts are displayed and advanced as expected." From 8eba6a5ba189f9c412f40acb9389d2419eb1fc72 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 14 Jan 2025 11:51:37 +1100 Subject: [PATCH 04/44] Makefile.in: pass ARCH for modules_install as well To do a cross-build using only kbuild rather than a full source tree, ARCH= needs to be passed for the kbuild Makefile to find the archspecific Makefile. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16944 --- module/Makefile.in | 1 + 1 file changed, 1 insertion(+) diff --git a/module/Makefile.in b/module/Makefile.in index 529ab81dcec5..f76e94afa410 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -90,6 +90,7 @@ modules_install-Linux: modules_uninstall-Linux-legacy $(MAKE) -C @LINUX_OBJ@ M="$$PWD" modules_install \ INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) \ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ + $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging if [ -n "$(DESTDIR)" ]; then \ From 404254bacb1859cd5975f5d4f8d38492b8dc048e Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 17 Jan 2025 02:09:45 +0000 Subject: [PATCH 05/44] style: remove unnecessary spaces in sa.h Removed three unnecessary spaces in the definition of the sa_attr_reg_t structure to improve code style consistency and adhere to OpenZFS coding standards. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Peng Liu Closes #16955 --- include/sys/sa.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/sys/sa.h b/include/sys/sa.h index c551acecab30..2e1d50c7330f 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -49,10 +49,10 @@ typedef uint16_t sa_attr_type_t; * Attribute to register support for. 
*/ typedef struct sa_attr_reg { - const char *sa_name; /* attribute name */ - uint16_t sa_length; + const char *sa_name; /* attribute name */ + uint16_t sa_length; sa_bswap_type_t sa_byteswap; /* bswap function enum */ - sa_attr_type_t sa_attr; /* filled in during registration */ + sa_attr_type_t sa_attr; /* filled in during registration */ } sa_attr_reg_t; @@ -77,7 +77,7 @@ typedef struct sa_bulk_attr { uint16_t sa_length; sa_attr_type_t sa_attr; /* the following are private to the sa framework */ - void *sa_addr; + void *sa_addr; uint16_t sa_buftype; uint16_t sa_size; } sa_bulk_attr_t; From c36faf668b87b6906bd4c64ed110c1f7d0cdccaf Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 17 Jan 2025 08:04:36 -0800 Subject: [PATCH 06/44] Update RELEASES.md LTS release to 2.2 2.3.0 is out now, so make 2.2.x the LTS release. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Tony Hutter Closes #16945 Closes #16948 --- RELEASES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASES.md b/RELEASES.md index 55bfdb80ef6e..e673a6b0e977 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -28,7 +28,7 @@ Two release branches are maintained for OpenZFS, they are: Minor changes to support these distribution kernels will be applied as needed. New kernel versions released after the OpenZFS LTS release are not supported. LTS releases will receive patches for at least 2 years. - The current LTS release is OpenZFS 2.1. + The current LTS release is OpenZFS 2.2. * OpenZFS current - Tracks the newest MAJOR.MINOR release. This branch includes support for the latest OpenZFS features and recently releases From 083d322fa08a4ecc67a32323327913ba2a763d86 Mon Sep 17 00:00:00 2001 From: Alexander Ziaee Date: Mon, 20 Jan 2025 14:37:52 -0500 Subject: [PATCH 07/44] zfs-destroy.8: Fix minor formatting typo The warning at the end of the second example in the description section was actually inside the options table. Move the El macro to match what is done in the first section for improved readability. Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Alexander Ziaee Closes #16962 --- man/man8/zfs-destroy.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/man8/zfs-destroy.8 b/man/man8/zfs-destroy.8 index fe4f19d18e9f..247c561322bf 100644 --- a/man/man8/zfs-destroy.8 +++ b/man/man8/zfs-destroy.8 @@ -157,6 +157,7 @@ Destroy all snapshots with this name in descendent file systems. .It Fl v Print verbose information about the deleted data. +.El .Pp Extreme care should be taken when applying either the .Fl r @@ -164,7 +165,6 @@ or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. -.El .It Xo .Nm zfs .Cm destroy From e6c98d11ecea6e6cfc3a7455f510305e18667de7 Mon Sep 17 00:00:00 2001 From: Tim Smith Date: Tue, 21 Jan 2025 07:30:17 -0800 Subject: [PATCH 08/44] Fix several typos in the man pages Reviewed-by: George Amanakis Reviewed-by: Alexander Motin Signed-off-by: Tim Smith Closes #16965 --- man/man4/zfs.4 | 8 ++++---- man/man7/vdevprops.7 | 2 +- man/man7/zpool-features.7 | 4 ++-- man/man8/zfs.8 | 2 +- man/man8/zpool-initialize.8 | 2 +- man/man8/zpool-status.8 | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 7078a5ba8373..dd0b3d848fe9 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -428,7 +428,7 @@ but this may negatively impact pool space efficiency. . 
.It Sy zfs_vdev_direct_write_verify Ns = Ns Sy Linux 1 | FreeBSD 0 Pq uint If non-zero, then a Direct I/O write's checksum will be verified every -time the write is issued and before it is commited to the block pointer. +time the write is issued and before it is committed to the block pointer. In the event the checksum is not valid then the I/O operation will return EIO. This module parameter can be used to detect if the contents of the users buffer have changed in the process of doing a Direct I/O @@ -438,7 +438,7 @@ writes. Each verify error causes a .Sy dio_verify_wr zevent. -Direct Write I/O checkum verify errors can be seen with +Direct Write I/O checksum verify errors can be seen with .Nm zpool Cm status Fl d . The default value for this is 1 on Linux, but is 0 for .Fx @@ -1612,7 +1612,7 @@ _ . .It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint Enables btree verification. -The following settings are culminative: +The following settings are cumulative: .TS box; lbz r l l . @@ -2525,7 +2525,7 @@ generate a system-dependent value close to 6 threads per taskq. Set value only applies to pools imported/created after that. . .It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint -Determines the minumum number of threads per write issue taskq. +Determines the minimum number of threads per write issue taskq. Higher values improve CPU utilization on high throughput, while lower reduce taskq locks contention on high IOPS. Set value only applies to pools imported/created after that. diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 34d4026b1009..e9fe8898492a 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -147,7 +147,7 @@ A text comment up to 8192 characters long .It Sy bootsize The amount of space to reserve for the EFI system partition .It Sy failfast -If this device should propage BIO errors back to ZFS, used to disable +If this device should propagate BIO errors back to ZFS, used to disable failfast. .It Sy path The path to the device for this vdev diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 7b392a896150..2f87236c762d 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -245,11 +245,11 @@ zpool_checkpoint .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2-2.06 # Features which are supported by GRUB2 versions prior to v2.12. # -# GRUB is not able to detect ZFS pool if snaphsot of top level boot pool +# GRUB is not able to detect ZFS pool if snapshot of top level boot pool # is created. This issue is observed with GRUB versions before v2.12 if # extensible_dataset feature is enabled on ZFS boot pool. # -# This file lists all read-only comaptible features except +# This file lists all read-only compatible features except # extensible_dataset and any other feature that depends on it. # allocation_classes diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 2ee15ab21806..ca4c884f8ac0 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -759,7 +759,7 @@ This option is provided for backwards compatibility with older ZFS versions. .It Sy ZFS_SET_PIPE_MAX Tells .Nm zfs -to set the maximum pipe size for sends/recieves. +to set the maximum pipe size for sends/receives. Disabled by default on Linux due to an unfixed deadlock in Linux's pipe size handling code. . diff --git a/man/man8/zpool-initialize.8 b/man/man8/zpool-initialize.8 index a9c8fd35aec9..e0bdcb6f6515 100644 --- a/man/man8/zpool-initialize.8 +++ b/man/man8/zpool-initialize.8 @@ -66,7 +66,7 @@ devices if none are specified. 
If the devices are being actively initialized the command will fail. After being cleared .Nm zpool Cm initialize -with no flags can be used to re-initialize all unallocoated regions on +with no flags can be used to re-initialize all unallocated regions on the relevant target devices. .It Fl w , -wait Wait until the devices have finished initializing before returning. diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index b9b54185d050..5c62f764fffd 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -83,7 +83,7 @@ Specify to set pool GUID as key for pool objects instead of pool names. .It Fl d Display the number of Direct I/O read/write checksum verify errors that have -occured on a top-level VDEV. +occurred on a top-level VDEV. See .Sx zfs_vdev_direct_write_verify in From 198621f910c440a002084f9b7e7ad83f0d39e831 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 16 Jan 2025 11:19:09 +1100 Subject: [PATCH 09/44] ZTS: reimplement kstat helper function The old kstat helper function was barely used, I suspect in part because it was very limited in the kinds of kstats it could gather. This adds new functions to replace it, for each kind of thing that can have stats: global, pool and dataset. There's options in there to get a single stat value, or all values within a group. Most importantly, the interface is the same for both platforms. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- tests/zfs-tests/Makefile.am | 1 + tests/zfs-tests/include/kstat.shlib | 516 ++++++++++++++++++++++++++ tests/zfs-tests/include/libtest.shlib | 22 +- 3 files changed, 521 insertions(+), 18 deletions(-) create mode 100644 tests/zfs-tests/include/kstat.shlib diff --git a/tests/zfs-tests/Makefile.am b/tests/zfs-tests/Makefile.am index 40a361d582a2..8a4b13d0acbb 100644 --- a/tests/zfs-tests/Makefile.am +++ b/tests/zfs-tests/Makefile.am @@ -42,6 +42,7 @@ scripts_zfs_tests_includedir = $(datadir)/$(PACKAGE)/zfs-tests/include dist_scripts_zfs_tests_include_DATA = \ %D%/include/blkdev.shlib \ %D%/include/commands.cfg \ + %D%/include/kstat.shlib \ %D%/include/libtest.shlib \ %D%/include/math.shlib \ %D%/include/properties.shlib \ diff --git a/tests/zfs-tests/include/kstat.shlib b/tests/zfs-tests/include/kstat.shlib new file mode 100644 index 000000000000..c7615760592f --- /dev/null +++ b/tests/zfs-tests/include/kstat.shlib @@ -0,0 +1,516 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +# +# This file provides the following helpers to read kstats from tests. 
+# +# kstat [-g] +# kstat_pool [-g] +# kstat_dataset [-N] +# +# `kstat` and `kstat_pool` return the value of of the given , either +# a global or pool-specific state. +# +# $ kstat dbgmsg +# timestamp message +# 1736848201 spa_history.c:304:spa_history_log_sync(): txg 14734896 ... +# 1736848201 spa_history.c:330:spa_history_log_sync(): ioctl ... +# ... +# +# $ kstat_pool garden state +# ONLINE +# +# To get a single stat within a group or collection, separate the name with +# '.' characters. +# +# $ kstat dbufstats.cache_target_bytes +# 3215780693 +# +# $ kstat_pool crayon iostats.arc_read_bytes +# 253671670784 +# +# -g is "group" mode. If the kstat is a group or collection, all stats in that +# group are returned, one stat per line, key and value separated by a space. +# +# $ kstat -g dbufstats +# cache_count 1792 +# cache_size_bytes 87720376 +# cache_size_bytes_max 305187768 +# cache_target_bytes 97668555 +# ... +# +# $ kstat_pool -g crayon iostats +# trim_extents_written 0 +# trim_bytes_written 0 +# trim_extents_skipped 0 +# trim_bytes_skipped 0 +# ... +# +# `kstat_dataset` accesses the per-dataset group kstat. The dataset can be +# specified by name: +# +# $ kstat_dataset crayon/home/robn nunlinks +# 2628514 +# +# or, with the -N switch, as /: +# +# $ kstat_dataset -N crayon/7 writes +# 125135 +# + +#################### +# Public interface + +# +# kstat [-g] +# +function kstat +{ + typeset -i want_group=0 + + OPTIND=1 + while getopts "g" opt ; do + case $opt in + 'g') want_group=1 ;; + *) log_fail "kstat: invalid option '$opt'" ;; + esac + done + shift $(expr $OPTIND - 1) + + typeset stat=$1 + + $_kstat_os 'global' '' "$stat" $want_group +} + +# +# kstat_pool [-g] +# +function kstat_pool +{ + typeset -i want_group=0 + + OPTIND=1 + while getopts "g" opt ; do + case $opt in + 'g') want_group=1 ;; + *) log_fail "kstat_pool: invalid option '$opt'" ;; + esac + done + shift $(expr $OPTIND - 1) + + typeset pool=$1 + typeset stat=$2 + + $_kstat_os 'pool' "$pool" "$stat" $want_group +} + +# +# kstat_dataset [-N] +# +function kstat_dataset +{ + typeset -i opt_objsetid=0 + + OPTIND=1 + while getopts "N" opt ; do + case $opt in + 'N') opt_objsetid=1 ;; + *) log_fail "kstat_dataset: invalid option '$opt'" ;; + esac + done + shift $(expr $OPTIND - 1) + + typeset dsarg=$1 + typeset stat=$2 + + if [[ $opt_objsetid == 0 ]] ; then + typeset pool="${dsarg%%/*}" # clear first / -> end + typeset objsetid=$($_resolve_dsname_os "$pool" "$dsarg") + if [[ -z "$objsetid" ]] ; then + log_fail "kstat_dataset: dataset not found: $dsarg" + fi + dsarg="$pool/$objsetid" + fi + + $_kstat_os 'dataset' "$dsarg" "$stat" 0 +} + +#################### +# Platform-specific interface + +# +# Implementation notes +# +# There's not a lot of uniformity between platforms, so I've written to a rough +# imagined model that seems to fit the majority of OpenZFS kstats. +# +# The main platform entry points look like this: +# +# _kstat_freebsd +# _kstat_linux +# +# - scope: one of 'global', 'pool', 'dataset'. The "kind" of object the kstat +# is attached to. +# - object: name of the scoped object +# global: empty string +# pool: pool name +# dataset: / pair +# - stat: kstat name to get +# - want_group: 0 to get the single value for the kstat, 1 to treat the kstat +# as a group and get all the stat names+values under it. group +# kstats cannot have values, and stat kstats cannot have +# children (by definition) +# +# Stat values can have multiple lines, so be prepared for those. 
+# +# These functions either succeed and produce the requested output, or call +# log_fail. They should never output empty, or 0, or anything else. +# +# Output: +# +# - want_group=0: the single stat value, followed by newline +# - want_group=1: One stat per line, +# + +# +# To support kstat_dataset(), platforms also need to provide a dataset +# name->object id resolver function. +# +# _resolve_dsname_freebsd +# _resolve_dsname_linux +# +# - pool: pool name. always the first part of the dataset name +# - dsname: dataset name, in the standard // format. +# +# Output is . objsetID is a decimal integer, > 0 +# + +#################### +# FreeBSD + +# +# All kstats are accessed through sysctl. We model "groups" as interior nodes +# in the stat tree, which are normally opaque. Because sysctl has no filtering +# options, and requesting any node produces all nodes below it, we have to +# always get the name and value, and then consider the output to understand +# if we got a group or a single stat, and post-process accordingly. +# +# Scopes are mostly mapped directly to known locations in the tree, but there +# are a handful of stats that are out of position, so we need to adjust. +# + +# +# _kstat_freebsd +# +function _kstat_freebsd +{ + typeset scope=$1 + typeset obj=$2 + typeset stat=$3 + typeset -i want_group=$4 + + typeset oid="" + case "$scope" in + global) + oid="kstat.zfs.misc.$stat" + ;; + pool) + # For reasons unknown, the "multihost", "txgs" and "reads" + # pool-specific kstats are directly under kstat.zfs., + # rather than kstat.zfs..misc like the other pool kstats. + # Adjust for that here. + case "$stat" in + multihost|txgs|reads) + oid="kstat.zfs.$obj.$stat" + ;; + *) + oid="kstat.zfs.$obj.misc.$stat" + ;; + esac + ;; + dataset) + typeset pool="" + typeset -i objsetid=0 + _split_pool_objsetid $obj pool objsetid + oid=$(printf 'kstat.zfs.%s.dataset.objset-0x%x.%s' \ + $pool $objsetid $stat) + ;; + esac + + # Calling sysctl on a "group" node will return everything under that + # node, so we have to inspect the first line to make sure we are + # getting back what we expect. For a single value, the key will have + # the name we requested, while for a group, the key will not have the + # name (group nodes are "opaque", not returned by sysctl by default. + + if [[ $want_group == 0 ]] ; then + sysctl -e "$oid" | awk -v oid="$oid" -v oidre="^$oid=" ' + NR == 1 && $0 !~ oidre { exit 1 } + NR == 1 { print substr($0, length(oid)+2) ; next } + { print } + ' + else + sysctl -e "$oid" | awk -v oid="$oid" -v oidre="^$oid=" ' + NR == 1 && $0 ~ oidre { exit 2 } + { + sub("^" oid "\.", "") + sub("=", " ") + print + } + ' + fi + + typeset -i err=$? + case $err in + 0) return ;; + 1) log_fail "kstat: can't get value for group kstat: $oid" ;; + 2) log_fail "kstat: not a group kstat: $oid" ;; + esac + + log_fail "kstat: unknown error: $oid" +} + +# +# _resolve_dsname_freebsd +# +function _resolve_dsname_freebsd +{ + # we're searching for: + # + # kstat.zfs.shed.dataset.objset-0x8087.dataset_name: shed/poudriere + # + # We split on '.', then get the hex objsetid from field 5. + # + # We convert hex to decimal in the shell because there isn't a _simple_ + # portable way to do it in awk and this code is already too intense to + # do it a complicated way. + typeset pool=$1 + typeset dsname=$2 + sysctl -e kstat.zfs.$pool | \ + awk -F '.' 
-v dsnamere="=$dsname$" ' + /\.objset-0x[0-9a-f]+\.dataset_name=/ && $6 ~ dsnamere { + print substr($5, 8) + exit + } + ' | xargs printf %d +} + +#################### +# Linux + +# +# kstats all live under /proc/spl/kstat/zfs. They have a flat structure: global +# at top-level, pool in a directory, and dataset in a objset- file inside the +# pool dir. +# +# Groups are challenge. A single stat can be the entire text of a file, or +# a single line that must be extracted from a "group" file. The only way to +# recognise a group from the outside is to look for its header. This naturally +# breaks if a raw file had a matching header, or if a group file chooses to +# hid its header. Fortunately OpenZFS does none of these things at the moment. +# + +# +# _kstat_linux +# +function _kstat_linux +{ + typeset scope=$1 + typeset obj=$2 + typeset stat=$3 + typeset -i want_group=$4 + + typeset singlestat="" + + if [[ $scope == 'dataset' ]] ; then + typeset pool="" + typeset -i objsetid=0 + _split_pool_objsetid $obj pool objsetid + stat=$(printf 'objset-0x%x.%s' $objsetid $stat) + obj=$pool + scope='pool' + fi + + typeset path="" + if [[ $scope == 'global' ]] ; then + path="/proc/spl/kstat/zfs/$stat" + else + path="/proc/spl/kstat/zfs/$obj/$stat" + fi + + if [[ ! -e "$path" && $want_group -eq 0 ]] ; then + # This single stat doesn't have its own file, but the wanted + # stat could be in a group kstat file, which we now need to + # find. To do this, we split a single stat name into two parts: + # the file that would contain the stat, and the key within that + # file to match on. This works by converting all bar the last + # '.' separator to '/', then splitting on the remaining '.' + # separator. If there are no '.' separators, the second arg + # returned will be empty. + # + # foo -> (foo) + # foo.bar -> (foo, bar) + # foo.bar.baz -> (foo/bar, baz) + # foo.bar.baz.quux -> (foo/bar/baz, quux) + # + # This is how we will target single stats within a larger NAMED + # kstat file, eg dbufstats.cache_target_bytes. + typeset -a split=($(echo "$stat" | \ + sed -E 's/^(.+)\.([^\.]+)$/\1 \2/ ; s/\./\//g')) + typeset statfile=${split[0]} + singlestat=${split[1]:-""} + + if [[ $scope == 'global' ]] ; then + path="/proc/spl/kstat/zfs/$statfile" + else + path="/proc/spl/kstat/zfs/$obj/$statfile" + fi + fi + if [[ ! -r "$path" ]] ; then + log_fail "kstat: can't read $path" + fi + + if [[ $want_group == 1 ]] ; then + # "group" (NAMED) kstats on Linux start: + # + # $ cat /proc/spl/kstat/zfs/crayon/iostats + # 70 1 0x01 26 7072 8577844978 661416318663496 + # name type data + # trim_extents_written 4 0 + # trim_bytes_written 4 0 + # + # The second value on the first row is the ks_type. Group + # mode only works for type 1, KSTAT_TYPE_NAMED. So we check + # for that, and eject if it's the wrong type. Otherwise, we + # skip the header row and process the values. + awk ' + NR == 1 && ! /^[0-9]+ 1 / { exit 2 } + NR < 3 { next } + { print $1 " " $NF } + ' "$path" + elif [[ -n $singlestat ]] ; then + # single stat. must be a single line within a group stat, so + # we look for the header again as above. + awk -v singlestat="$singlestat" \ + -v singlestatre="^$singlestat " ' + NR == 1 && /^[0-9]+ [^1] / { exit 2 } + NR < 3 { next } + $0 ~ singlestatre { print $NF ; exit 0 } + ENDFILE { exit 3 } + ' "$path" + else + # raw stat. dump contents, exclude group stats + awk ' + NR == 1 && /^[0-9]+ 1 / { exit 1 } + { print } + ' "$path" + fi + + typeset -i err=$? 
+ case $err in + 0) return ;; + 1) log_fail "kstat: can't get value for group kstat: $path" ;; + 2) log_fail "kstat: not a group kstat: $path" ;; + 3) log_fail "kstat: stat not found in group: $path $singlestat" ;; + esac + + log_fail "kstat: unknown error: $path" +} + +# +# _resolve_dsname_linux +# +function _resolve_dsname_linux +{ + # We look inside all: + # + # /proc/spl/kstat/zfs/crayon/objset-0x113 + # + # and check the dataset_name field inside. If we get a match, we split + # the filename on /, then extract the hex objsetid. + # + # We convert hex to decimal in the shell because there isn't a _simple_ + # portable way to do it in awk and this code is already too intense to + # do it a complicated way. + typeset pool=$1 + typeset dsname=$2 + awk -v dsname="$dsname" ' + $1 == "dataset_name" && $3 == dsname { + split(FILENAME, a, "/") + print substr(a[7], 8) + exit + } + ' /proc/spl/kstat/zfs/$pool/objset-0x* | xargs printf %d +} + +#################### + +# +# _split_pool_objsetid <*pool> <*objsetid> +# +# Splits pool/objsetId string in and fills and . +# +function _split_pool_objsetid +{ + typeset obj=$1 + typeset -n pool=$2 + typeset -n objsetid=$3 + + pool="${obj%%/*}" # clear first / -> end + typeset osidarg="${obj#*/}" # clear start -> first / + + # ensure objsetid arg does not contain a /. we're about to convert it, + # but ksh will treat it as an expression, and a / will give a + # divide-by-zero + if [[ "${osidarg%%/*}" != "$osidarg" ]] ; then + log_fail "kstat: invalid objsetid: $osidarg" + fi + + typeset -i id=$osidarg + if [[ $id -le 0 ]] ; then + log_fail "kstat: invalid objsetid: $osidarg" + fi + objsetid=$id +} + +#################### + +# +# Per-platform function selection. +# +# To avoid needing platform check throughout, we store the names of the +# platform functions and call through them. +# +if is_freebsd ; then + _kstat_os='_kstat_freebsd' + _resolve_dsname_os='_resolve_dsname_freebsd' +elif is_linux ; then + _kstat_os='_kstat_linux' + _resolve_dsname_os='_resolve_dsname_linux' +else + _kstat_os='_kstat_unknown_platform_implement_me' + _resolve_dsname_os='_resolve_dsname_unknown_platform_implement_me' +fi + diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 9cf919c3dd0f..5ba94bc6f5e4 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -28,6 +28,7 @@ # Copyright (c) 2017, Datto Inc. All rights reserved. # Copyright (c) 2017, Open-E Inc. All rights reserved. # Copyright (c) 2021, The FreeBSD Foundation. +# Copyright (c) 2025, Klara, Inc. # Use is subject to license terms. # @@ -37,6 +38,7 @@ . ${STF_SUITE}/include/math.shlib . ${STF_SUITE}/include/blkdev.shlib + # On AlmaLinux 9 we will see $PWD = '.' instead of the full path. This causes # some tests to fail. Fix it up here. if [ "$PWD" = "." ] ; then @@ -3662,24 +3664,6 @@ function ls_xattr # path esac } -function kstat # stat flags? -{ - typeset stat=$1 - typeset flags=${2-"-n"} - - case "$UNAME" in - FreeBSD) - sysctl $flags kstat.zfs.misc.$stat - ;; - Linux) - cat "/proc/spl/kstat/zfs/$stat" 2>/dev/null - ;; - *) - false - ;; - esac -} - function get_arcstat # stat { typeset stat=$1 @@ -3916,3 +3900,5 @@ function pop_coredump_pattern ;; esac } + +. 
${STF_SUITE}/include/kstat.shlib From 6edbbe0646ed02b0c3b0165fa9f355b42beaaf29 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 15 Jan 2025 14:40:31 +1100 Subject: [PATCH 10/44] ZTS: update existing kstat users to new helper Removes other custom helpers and direct accesses to /proc. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- .../functional/arc/dbufstats_001_pos.ksh | 13 +--- .../functional/cli_root/zdb/zdb_objset_id.ksh | 17 ++--- .../tests/functional/deadman/deadman_sync.ksh | 6 +- .../tests/functional/direct/dio.kshlib | 73 ++++--------------- .../functional/direct/dio_read_verify.ksh | 8 +- .../direct/dio_unaligned_filesize.ksh | 8 +- .../direct/dio_write_stable_pages.ksh | 4 +- .../functional/direct/dio_write_verify.ksh | 12 +-- .../functional/fadvise/fadvise_sequential.ksh | 8 +- .../fault/suspend_on_probe_errors.ksh | 6 +- .../fault/suspend_resume_single.ksh | 6 +- .../l2arc/persist_l2arc_003_neg.ksh | 3 +- .../zfs-tests/tests/functional/mmp/mmp.kshlib | 8 +- .../functional/mmp/mmp_write_distribution.ksh | 3 +- .../functional/mmp/mmp_write_slow_disk.ksh | 5 +- .../mount/umount_unlinked_drain.ksh | 16 ++-- 16 files changed, 67 insertions(+), 129 deletions(-) diff --git a/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh b/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh index e51cf179d8ef..552a27e98102 100755 --- a/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh @@ -29,8 +29,8 @@ # # DESCRIPTION: -# Ensure stats presented in /proc/spl/kstat/zfs/dbufstats are correct -# based on /proc/spl/kstat/zfs/dbufs. +# Ensure stats presented in the dbufstats kstat are correct based on the +# dbufs kstat. # # STRATEGY: # 1. Generate a file with random data in it @@ -55,12 +55,7 @@ function testdbufstat # stat_name dbufstat_filter [[ -n "$2" ]] && filter="-F $2" - if is_linux; then - read -r _ _ from_dbufstat _ < <(grep -w "$name" "$DBUFSTATS_FILE") - else - from_dbufstat=$(awk "/dbufstats\.$name:/ { print \$2 }" \ - "$DBUFSTATS_FILE") - fi + from_dbufstat=$(grep "^$name " "$DBUFSTATS_FILE" | cut -f2 -d' ') from_dbufs=$(dbufstat -bxn -i "$DBUFS_FILE" "$filter" | wc -l) within_tolerance $from_dbufstat $from_dbufs 15 \ @@ -77,7 +72,7 @@ log_must file_write -o create -f "$TESTDIR/file" -b 1048576 -c 20 -d R sync_all_pools log_must eval "kstat dbufs > $DBUFS_FILE" -log_must eval "kstat dbufstats '' > $DBUFSTATS_FILE" +log_must eval "kstat -g dbufstats > $DBUFSTATS_FILE" for level in {0..11}; do testdbufstat "cache_level_$level" "dbc=1,level=$level" diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh index fdda9ba22638..9d147f382042 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh @@ -31,10 +31,9 @@ # 7. Run zdb -dddddd pool/objsetID objectID (hex) # 8. Confirm names # 9. Repeat with zdb -NNNNNN pool/objsetID objectID -# 10. Obtain objsetID from /proc/spl/kstat/zfs/testpool/obset-0x -# (linux only) +# 10. Obtain dataset name from testpool.objset-0x.dataset_name kstat # 11. Run zdb -dddddd pool/objsetID (hex) -# 12. Match name from zdb against proc entry +# 12. Match name from zdb against kstat # 13. Create dataset with hex numeric name # 14. Create dataset with decimal numeric name # 15. 
zdb -d for numeric datasets succeeds @@ -68,7 +67,7 @@ log_note "file $init_data has object number $obj" sync_pool $TESTPOOL IFS=", " read -r _ _ _ _ objset_id _ < <(zdb -d $TESTPOOL/$TESTFS) -objset_hex=$(printf "0x%X" $objset_id) +objset_hex=$(printf "0x%x" $objset_id) log_note "objset $TESTPOOL/$TESTFS has objset ID $objset_id ($objset_hex)" for id in "$objset_id" "$objset_hex" @@ -89,13 +88,9 @@ do log_fail "zdb -NNNNNN $TESTPOOL/$id $obj failed (file1 not in zdb output)" done -if is_linux; then - output=$(ls -1 /proc/spl/kstat/zfs/$TESTPOOL | grep objset- | tail -1) - objset_hex=${output#*-} - name_from_proc=$(grep dataset_name /proc/spl/kstat/zfs/$TESTPOOL/$output | cut -d' ' -f3) - log_note "checking zdb output for $name_from_proc" - log_must eval "zdb -dddddd $TESTPOOL/$objset_hex | grep -q \"$name_from_proc\"" -fi +name_from_proc=$(kstat_dataset -N $TESTPOOL/$objset_id dataset_name) +log_note "checking zdb output for $name_from_proc" +log_must eval "zdb -dddddd $TESTPOOL/$objset_hex | grep -q \"$name_from_proc\"" log_must zfs create $hex_ds log_must zfs create $num_ds diff --git a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh index f1561b7282e5..5c165523fefd 100755 --- a/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh +++ b/tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh @@ -73,11 +73,7 @@ log_must zinject -c all sync_all_pools # Log txg sync times for reference and the zpool event summary. -if is_freebsd; then - log_must sysctl -n kstat.zfs.$TESTPOOL.txgs -else - log_must cat /proc/spl/kstat/zfs/$TESTPOOL/txgs -fi +log_must kstat_pool $TESTPOOL txgs log_must zpool events # Verify at least 3 deadman events were logged. The first after 5 seconds, diff --git a/tests/zfs-tests/tests/functional/direct/dio.kshlib b/tests/zfs-tests/tests/functional/direct/dio.kshlib index 5b3f893e1ce1..49c43a0aaca3 100644 --- a/tests/zfs-tests/tests/functional/direct/dio.kshlib +++ b/tests/zfs-tests/tests/functional/direct/dio.kshlib @@ -140,29 +140,6 @@ function check_dio_chksum_verify_failures # pool vdev_type op expect_errors } -# -# Get the value of a counter from -# Linux: /proc/spl/kstat/zfs/$pool/iostats file. -# FreeBSD: kstat.zfs.$pool.msic.iostats.$stat -# -function get_iostats_stat # pool stat -{ - typeset pool=$1 - typeset stat=$2 - - if is_linux; then - iostats_file=/proc/spl/kstat/zfs/$pool/iostats - val=$(grep -m1 "$stat" $iostats_file | awk '{ print $3 }') - else - val=$(sysctl -n kstat.zfs.$pool.misc.iostats.$stat) - fi - if [[ -z "$val" ]]; then - log_fail "Unable to read $stat counter" - fi - - echo "$val" -} - # # Evict any buffered blocks by overwritting them using an O_DIRECT request. 
# @@ -190,17 +167,13 @@ function verify_dio_write_count #pool bs size mnpnt log_note "Checking for $dio_wr_expected Direct I/O writes" - prev_dio_wr=$(get_iostats_stat $pool direct_write_count) + prev_dio_wr=$(kstat_pool $pool iostats.direct_write_count) dio_and_verify write $size $bs $mntpnt "sync" - curr_dio_wr=$(get_iostats_stat $pool direct_write_count) + curr_dio_wr=$(kstat_pool $pool iostats.direct_write_count) dio_wr_actual=$((curr_dio_wr - prev_dio_wr)) if [[ $dio_wr_actual -lt $dio_wr_expected ]]; then - if is_linux; then - cat /proc/spl/kstat/zfs/$pool/iostats - else - sysctl kstat.zfs.$pool.misc.iostats - fi + kstat_pool -g $pool iostats log_fail "Direct writes $dio_wr_actual of $dio_wr_expected" fi } @@ -223,33 +196,25 @@ function check_write # pool file bs count seek flags buf_wr dio_wr log_note "Checking $count * $bs write(s) at offset $seek, $flags" - prev_buf_wr=$(get_iostats_stat $pool arc_write_count) - prev_dio_wr=$(get_iostats_stat $pool direct_write_count) + prev_buf_wr=$(kstat_pool $pool iostats.arc_write_count) + prev_dio_wr=$(kstat_pool $pool iostats.direct_write_count) log_must stride_dd -i /dev/urandom -o $file -b $bs -c $count \ -k $seek $flags - curr_buf_wr=$(get_iostats_stat $pool arc_write_count) + curr_buf_wr=$(kstat_pool $pool iostats.arc_write_count) buf_wr_actual=$((curr_buf_wr - prev_buf_wr)) - curr_dio_wr=$(get_iostats_stat $pool direct_write_count) + curr_dio_wr=$(kstat_pool $pool iostats.direct_write_count) dio_wr_actual=$((curr_dio_wr - prev_dio_wr)) if [[ $buf_wr_actual -lt $buf_wr_expect ]]; then - if is_linux; then - cat /proc/spl/kstat/zfs/$pool/iostats - else - sysctl kstat.zfs.$pool.misc.iostats - fi + kstat_pool -g $pool iostats log_fail "Buffered writes $buf_wr_actual of $buf_wr_expect" fi if [[ $dio_wr_actual -lt $dio_wr_expect ]]; then - if is_linux; then - cat /proc/spl/kstat/zfs/$pool/iostats - else - sysctl kstat.zfs.$pool.misc.iostats - fi + kstat_pool -g $pool iostats log_fail "Direct writes $dio_wr_actual of $dio_wr_expect" fi } @@ -272,33 +237,25 @@ function check_read # pool file bs count skip flags buf_rd dio_rd log_note "Checking $count * $bs read(s) at offset $skip, $flags" - prev_buf_rd=$(get_iostats_stat $pool arc_read_count) - prev_dio_rd=$(get_iostats_stat $pool direct_read_count) + prev_buf_rd=$(kstat_pool $pool iostats.arc_read_count) + prev_dio_rd=$(kstat_pool $pool iostats.direct_read_count) log_must stride_dd -i $file -o /dev/null -b $bs -c $count \ -p $skip $flags - curr_buf_rd=$(get_iostats_stat $pool arc_read_count) + curr_buf_rd=$(kstat_pool $pool iostats.arc_read_count) buf_rd_actual=$((curr_buf_rd - prev_buf_rd)) - curr_dio_rd=$(get_iostats_stat $pool direct_read_count) + curr_dio_rd=$(kstat_pool $pool iostats.direct_read_count) dio_rd_actual=$((curr_dio_rd - prev_dio_rd)) if [[ $buf_rd_actual -lt $buf_rd_expect ]]; then - if is_linux; then - cat /proc/spl/kstat/zfs/$pool/iostats - else - sysctl kstat.zfs.$pool.misc.iostats - fi + kstat_pool -g $pool iostats log_fail "Buffered reads $buf_rd_actual of $buf_rd_expect" fi if [[ $dio_rd_actual -lt $dio_rd_expect ]]; then - if is_linux; then - cat /proc/spl/kstat/zfs/$pool/iostats - else - sysctl kstat.zfs.$pool.misc.iostats - fi + kstat_pool -g $pool iostats log_fail "Direct reads $dio_rd_actual of $dio_rd_expect" fi } diff --git a/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh index 456d429b1d99..67e0b4a7c700 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh +++ 
b/tests/zfs-tests/tests/functional/direct/dio_read_verify.ksh @@ -72,8 +72,8 @@ for type in "" "mirror" "raidz" "draid"; do $TESTPOOL1/$TESTFS1" mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) - prev_dio_rd=$(get_iostats_stat $TESTPOOL1 direct_read_count) - prev_arc_rd=$(get_iostats_stat $TESTPOOL1 arc_read_count) + prev_dio_rd=$(kstat_pool $TESTPOOL1 iostats.direct_read_count) + prev_arc_rd=$(kstat_pool $TESTPOOL1 iostats.arc_read_count) # Create the file before trying to manipulate the contents log_must stride_dd -o "$mntpnt/direct-write.iso" -i /dev/urandom \ @@ -83,8 +83,8 @@ for type in "" "mirror" "raidz" "draid"; do -n $NUMBLOCKS -b $BS -r # Getting new Direct I/O and ARC Write counts. - curr_dio_rd=$(get_iostats_stat $TESTPOOL1 direct_read_count) - curr_arc_rd=$(get_iostats_stat $TESTPOOL1 arc_read_count) + curr_dio_rd=$(kstat_pool $TESTPOOL1 iostats.direct_read_count) + curr_arc_rd=$(kstat_pool $TESTPOOL1 iostats.arc_read_count) total_dio_rd=$((curr_dio_rd - prev_dio_rd)) total_arc_rd=$((curr_arc_rd - prev_arc_rd)) diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh index 8bb363f1a983..6e2982ad7d46 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh @@ -73,11 +73,11 @@ log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL # Reading the file back using Direct I/O -prev_dio_read=$(get_iostats_stat $TESTPOOL direct_read_count) -prev_arc_read=$(get_iostats_stat $TESTPOOL arc_read_count) +prev_dio_read=$(kstat_pool $TESTPOOL iostats.direct_read_count) +prev_arc_read=$(kstat_pool $TESTPOOL iostats.arc_read_count) log_must stride_dd -i $filename -o /dev/null -b $bs -e -d -curr_dio_read=$(get_iostats_stat $TESTPOOL direct_read_count) -curr_arc_read=$(get_iostats_stat $TESTPOOL arc_read_count) +curr_dio_read=$(kstat_pool $TESTPOOL iostats.direct_read_count) +curr_arc_read=$(kstat_pool $TESTPOOL iostats.arc_read_count) total_dio_read=$((curr_dio_read - prev_dio_read)) total_arc_read=$((curr_arc_read - prev_arc_read)) diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh index ccdabc678a68..3d7f7089d7c8 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh @@ -72,7 +72,7 @@ do log_note "Verifying stable pages for Direct I/O writes \ iteration $i of $ITERATIONS" - prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) # Manipulate the user's buffer while running O_DIRECT write # workload with the buffer. 
@@ -83,7 +83,7 @@ do log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \ -b $BS -c $NUMBLOCKS - curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) log_note "Making sure we have Direct I/O writes logged" diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh index 4eb9efe95ef1..1c1565cbbefb 100755 --- a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh +++ b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh @@ -90,10 +90,10 @@ log_must set_tunable32 VDEV_DIRECT_WR_VERIFY 0 # failures log_note "Verifying no panics for Direct I/O writes with compression" log_must zfs set compression=on $TESTPOOL/$TESTFS -prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) +prev_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" -n $NUMBLOCKS \ -b $BS -w -curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) +curr_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) log_note "Making sure we have Direct I/O writes logged" @@ -115,7 +115,7 @@ for i in $(seq 1 $ITERATIONS); do log_note "Verifying Direct I/O write checksums iteration \ $i of $ITERATIONS with zfs_vdev_direct_write_verify=0" - prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \ -n $NUMBLOCKS -b $BS -w @@ -126,7 +126,7 @@ for i in $(seq 1 $ITERATIONS); do -c $num_blocks # Getting new Direct I/O and ARC write counts. - curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) # Verifying there are checksum errors @@ -165,7 +165,7 @@ for i in $(seq 1 $ITERATIONS); do log_note "Verifying every Direct I/O write checksums iteration $i of \ $ITERATIONS with zfs_vdev_direct_write_verify=1" - prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) log_must manipulate_user_buffer -f "$mntpnt/direct-write.iso" \ -n $NUMBLOCKS -b $BS -e -w @@ -176,7 +176,7 @@ for i in $(seq 1 $ITERATIONS); do -c $num_blocks # Getting new Direct I/O write counts. 
- curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_dio_wr=$(kstat_pool $TESTPOOL iostats.direct_write_count) total_dio_wr=$((curr_dio_wr - prev_dio_wr)) log_note "Making sure there are no checksum errors with the ZPool" diff --git a/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh index 7b7d1d379ac6..daeb93273a54 100755 --- a/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh +++ b/tests/zfs-tests/tests/functional/fadvise/fadvise_sequential.ksh @@ -54,10 +54,6 @@ function cleanup [[ -e $TESTDIR ]] && log_must rm -Rf $TESTDIR/* } -getstat() { - awk -v c="$1" '$1 == c {print $3; exit}' /proc/spl/kstat/zfs/arcstats -} - log_assert "Ensure fadvise prefetch data" log_onexit cleanup @@ -67,12 +63,12 @@ log_must zfs set primarycache=metadata $TESTPOOL log_must file_write -o create -f $FILE -b $BLKSZ -c 1000 sync_pool $TESTPOOL -data_size1=$(getstat data_size) +data_size1=$(kstat arcstats.data_size) log_must file_fadvise -f $FILE -a 2 sleep 10 -data_size2=$(getstat data_size) +data_size2=$(kstat arcstats.data_size) log_note "original data_size is $data_size1, final data_size is $data_size2" log_must [ $data_size1 -le $data_size2 ] diff --git a/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh index d9261bb5d274..3f6edad6da9b 100755 --- a/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh +++ b/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh @@ -119,14 +119,14 @@ log_must dd if=/dev/urandom of=$MNTPOINT/writes bs=1M count=1 # Wait until sync starts, and the pool suspends log_note "waiting for pool to suspend" typeset -i tries=30 -until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do if ((tries-- == 0)); then zpool status -s log_fail "UNEXPECTED -- pool did not suspend" fi sleep 1 done -log_note $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) +log_note $(kstat_pool $TESTPOOL state) # Put the missing disks back into service log_must eval "echo running > /sys/block/$sd/device/state" @@ -137,7 +137,7 @@ log_must zpool clear $TESTPOOL # Wait until the pool resumes log_note "waiting for pool to resume" tries=30 -until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) != "SUSPENDED" ]] ; do +until [[ $(kstat_pool $TESTPOOL state) != "SUSPENDED" ]] ; do if ((tries-- == 0)); then log_fail "pool did not resume" fi diff --git a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh index b67059158a57..0dc5584e4fd5 100755 --- a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh +++ b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh @@ -26,8 +26,6 @@ . 
$STF_SUITE/include/libtest.shlib -set -x - DATAFILE="$TMPDIR/datafile" function cleanup @@ -62,7 +60,7 @@ log_must cp $DATAFILE /$TESTPOOL/file # wait until sync starts, and the pool suspends log_note "waiting for pool to suspend" typeset -i tries=10 -until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do if ((tries-- == 0)); then log_fail "pool didn't suspend" fi @@ -82,7 +80,7 @@ log_note "giving pool time to settle and complete txg" sleep 7 # if the pool suspended, then everything is bad -if [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; then +if [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; then log_fail "pool suspended" fi diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh index f8dc2b108f0d..14063658e3c5 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh @@ -32,8 +32,7 @@ # 4. Export pool. # 5. Import pool. # 6. Check in zpool iostat if the cache device has space allocated. -# 7. Read the file written in (3) and check if l2_hits in -# /proc/spl/kstat/zfs/arcstats increased. +# 7. Read the file written in (3) and check if arcstats.l2_hits increased. # verify_runnable "global" diff --git a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib index 5071830c489a..01e4f2b735fa 100644 --- a/tests/zfs-tests/tests/functional/mmp/mmp.kshlib +++ b/tests/zfs-tests/tests/functional/mmp/mmp.kshlib @@ -199,20 +199,20 @@ function count_skipped_mmp_writes # pool duration { typeset pool=$1 typeset -i duration=$2 - typeset hist_path="/proc/spl/kstat/zfs/$pool/multihost" sleep $duration - awk 'BEGIN {count=0}; $NF == "-" {count++}; END {print count};' "$hist_path" + kstat_pool $pool multihost | \ + awk 'BEGIN {count=0}; $NF == "-" {count++}; END {print count};' } function count_mmp_writes # pool duration { typeset pool=$1 typeset -i duration=$2 - typeset hist_path="/proc/spl/kstat/zfs/$pool/multihost" sleep $duration - awk 'BEGIN {count=0}; $NF != "-" {count++}; END {print count};' "$hist_path" + kstat_pool $pool multihost | \ + awk 'BEGIN {count=0}; $NF != "-" {count++}; END {print count};' } function summarize_uberblock_mmp # device diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh index 1ac254aa1dab..6f34974770d1 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh @@ -47,7 +47,6 @@ log_assert "mmp writes are evenly distributed across leaf vdevs" log_onexit cleanup MMP_HISTORY_TMP=$MMP_DIR/history -MMP_HISTORY=/proc/spl/kstat/zfs/$MMP_POOL/multihost # Step 1 log_must mkdir -p $MMP_DIR @@ -69,7 +68,7 @@ typeset -i min_writes=999 typeset -i max_writes=0 typeset -i write_count # copy to get as close to a consistent view as possible -cp $MMP_HISTORY $MMP_HISTORY_TMP +kstat_pool $MMP_POOL multihost > $MMP_HISTORY_TMP for x in {0..7}; do write_count=$(grep -c file.${x} $MMP_HISTORY_TMP) if [ $write_count -lt $min_writes ]; then diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh index 8b118684aa7f..e45aedd450d2 100755 --- a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh +++ 
b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh @@ -58,7 +58,7 @@ function cleanup log_assert "A long VDEV probe doesn't cause a MMP check suspend" log_onexit cleanup -MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost +MMP_HISTORY_TMP=$MMP_DIR/history # Create a multiple drive pool log_must zpool events -c @@ -83,8 +83,9 @@ sleep 10 sync_pool $MMP_POOL # Confirm mmp writes to the non-slow disks have taken place +kstat_pool $MMP_POOL multihost > $MMP_HISTORY_TMP for x in {0,1,2,4}; do - write_count=$(grep -c file.${x} $MMP_HISTORY_URL) + write_count=$(grep -c file.${x} $MMP_HISTORY_TMP) [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" done diff --git a/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh b/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh index 40045a7a96b5..9e93c1784dbf 100755 --- a/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh +++ b/tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh @@ -42,13 +42,15 @@ function cleanup function unlinked_size_is { + typeset -i expect=$1 + typeset dataset=$2 + MAX_ITERS=5 # iteration to do before we consider reported number stable iters=0 last_usize=0 while [[ $iters -le $MAX_ITERS ]]; do - kstat_file=$(grep -nrwl /proc/spl/kstat/zfs/$2/objset-0x* -e $3) - nunlinks=$(awk '/nunlinks/ {print $3}' $kstat_file) - nunlinked=$(awk '/nunlinked/ {print $3}' $kstat_file) + nunlinks=$(kstat_dataset $dataset nunlinks) + nunlinked=$(kstat_dataset $dataset nunlinked) usize=$(($nunlinks - $nunlinked)) if [[ $iters == $MAX_ITERS && $usize == $1 ]]; then return 0 @@ -89,20 +91,20 @@ for fs in 1 2 3; do fi log_must set_tunable32 UNLINK_SUSPEND_PROGRESS 1 - log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs + log_must unlinked_size_is 0 $TESTPOOL/$TESTFS.$fs # build up unlinked set for fn in $(seq 1 100); do log_must eval "rm $TESTDIR.$fs/file-$fn &" done - log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs + log_must unlinked_size_is 100 $TESTPOOL/$TESTFS.$fs # test that we can mount fs without emptying the unlinked list log_must zfs umount $TESTPOOL/$TESTFS.$fs log_must unmounted $TESTDIR.$fs log_must zfs mount $TESTPOOL/$TESTFS.$fs log_must mounted $TESTDIR.$fs - log_must unlinked_size_is 100 $TESTPOOL $TESTPOOL/$TESTFS.$fs + log_must unlinked_size_is 100 $TESTPOOL/$TESTFS.$fs # confirm we can drain and add to unlinked set at the same time log_must set_tunable32 UNLINK_SUSPEND_PROGRESS 0 @@ -111,7 +113,7 @@ for fs in 1 2 3; do for fn in $(seq 101 175); do log_must eval "rm $TESTDIR.$fs/file-$fn &" done - log_must unlinked_size_is 0 $TESTPOOL $TESTPOOL/$TESTFS.$fs + log_must unlinked_size_is 0 $TESTPOOL/$TESTFS.$fs done done From cf55fdea24a77b0a4a2ad5d2570ca25190f0ca03 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 15 Jan 2025 15:11:33 +1100 Subject: [PATCH 11/44] ZTS: remove get_arcstat It's now a simple wrapper, so let's just call kstat directly. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
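For illustration, the conversion is mechanical; a minimal before/after sketch of one call site (l2_size is just one of the stats touched by this change):

    # before: wrapper that dispatched on $UNAME
    typeset l2_size1=$(get_arcstat l2_size)
    # after: query the kstat helper directly
    typeset l2_size1=$(kstat arcstats.l2_size)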
Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- tests/zfs-tests/include/libtest.shlib | 21 ++----------------- .../tests/functional/cache/cache_012_pos.ksh | 4 ++-- .../compression/l2arc_compressed_arc.ksh | 4 ++-- .../l2arc_compressed_arc_disabled.ksh | 4 ++-- .../compression/l2arc_encrypted.ksh | 4 ++-- .../l2arc_encrypted_no_compressed_arc.ksh | 4 ++-- .../functional/l2arc/l2arc_arcstats_pos.ksh | 20 +++++++++--------- .../functional/l2arc/l2arc_l2miss_pos.ksh | 6 +++--- .../functional/l2arc/l2arc_mfuonly_pos.ksh | 4 ++-- .../l2arc/persist_l2arc_001_pos.ksh | 2 +- .../l2arc/persist_l2arc_002_pos.ksh | 2 +- .../l2arc/persist_l2arc_003_neg.ksh | 4 ++-- .../l2arc/persist_l2arc_004_pos.ksh | 2 +- .../l2arc/persist_l2arc_005_pos.ksh | 2 +- .../tests/functional/trim/trim_l2arc.ksh | 4 ++-- 15 files changed, 35 insertions(+), 52 deletions(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 5ba94bc6f5e4..0b6c675cdd2c 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3664,23 +3664,6 @@ function ls_xattr # path esac } -function get_arcstat # stat -{ - typeset stat=$1 - - case "$UNAME" in - FreeBSD) - kstat arcstats.$stat - ;; - Linux) - kstat arcstats | awk "/$stat/"' { print $3 }' - ;; - *) - false - ;; - esac -} - function punch_hole # offset length file { typeset offset=$1 @@ -3732,9 +3715,9 @@ function arcstat_quiescence # stat echo fi while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do - typeset stat1=$(get_arcstat $stat) + typeset stat1=$(kstat arcstats.$stat) sleep 0.5 - typeset stat2=$(get_arcstat $stat) + typeset stat2=$(kstat arcstats.$stat) do_once=false done diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh index 20498440bea7..b8deafc5b30c 100755 --- a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -96,9 +96,9 @@ export RUNTIME=1 typeset do_once=true while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do - typeset l2_size1=$(get_arcstat l2_size) + typeset l2_size1=$(kstat arcstats.l2_size) log_must fio $FIO_SCRIPTS/random_reads.fio - typeset l2_size2=$(get_arcstat l2_size) + typeset l2_size2=$(kstat arcstats.l2_size) do_once=false done diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh index 1d3cbfc79ee6..1eded81101c1 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh @@ -83,12 +83,12 @@ log_must truncate -s ${cache_sz}M $VDEV_CACHE log_must zpool create -O compression=lz4 -f $TESTPOOL-l2arc $VDEV cache $VDEV_CACHE -l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_start=$(kstat arcstats.l2_cksum_bad) log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_end=$(kstat arcstats.l2_cksum_bad) log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ "$l2_cksum_bad_end" diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh index c8f4111744eb..b08f8dccc845 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh +++ 
b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh @@ -83,12 +83,12 @@ log_must truncate -s ${cache_sz}M $VDEV_CACHE log_must zpool create -O compression=lz4 -f $TESTPOOL-l2arc $VDEV cache $VDEV_CACHE -l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_start=$(kstat arcstats.l2_cksum_bad) log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_end=$(kstat arcstats.l2_cksum_bad) log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ "$l2_cksum_bad_end" diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh index 460c95bb6051..8da3441330a6 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh @@ -88,12 +88,12 @@ log_must eval "echo $PASSPHRASE | zfs create -o compression=zstd " \ "-o encryption=on -o keyformat=passphrase -o keylocation=prompt " \ "$TESTPOOL-l2arc/encrypted" -l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_start=$(kstat arcstats.l2_cksum_bad) log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_end=$(kstat arcstats.l2_cksum_bad) log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ "$l2_cksum_bad_end" diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh index 2f352e2af5d4..e571016f6e2a 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh @@ -88,12 +88,12 @@ log_must eval "echo $PASSPHRASE | zfs create -o compression=zstd " \ "-o encryption=on -o keyformat=passphrase -o keylocation=prompt " \ "$TESTPOOL-l2arc/encrypted" -l2_cksum_bad_start=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_start=$(kstat arcstats.l2_cksum_bad) log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -l2_cksum_bad_end=$(get_arcstat l2_cksum_bad) +l2_cksum_bad_end=$(kstat arcstats.l2_cksum_bad) log_note "L2ARC Failed Checksums before: $l2_cksum_bad_start After:"\ "$l2_cksum_bad_end" diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh index 69d60ab8bb90..dc6bb9f9a163 100755 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh @@ -73,18 +73,18 @@ arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE arcstat_quiescence_noecho l2_size -typeset l2_mfu_init=$(get_arcstat l2_mfu_asize) -typeset l2_mru_init=$(get_arcstat l2_mru_asize) -typeset l2_prefetch_init=$(get_arcstat l2_prefetch_asize) -typeset l2_asize_init=$(get_arcstat l2_asize) +typeset l2_mfu_init=$(kstat arcstats.l2_mfu_asize) +typeset l2_mru_init=$(kstat arcstats.l2_mru_asize) +typeset l2_prefetch_init=$(kstat arcstats.l2_prefetch_asize) +typeset l2_asize_init=$(kstat arcstats.l2_asize) log_must zpool online $TESTPOOL $VDEV_CACHE arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL arcstat_quiescence_noecho l2_feeds -log_must test $(get_arcstat l2_mfu_asize) -eq 0 -log_must test $(get_arcstat l2_mru_asize) -eq 0 +log_must test 
$(kstat arcstats.l2_mfu_asize) -eq 0 +log_must test $(kstat arcstats.l2_mru_asize) -eq 0 log_must zpool import -d $VDIR $TESTPOOL arcstat_quiescence_noecho l2_size @@ -93,10 +93,10 @@ arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE arcstat_quiescence_noecho l2_size -typeset l2_mfu_end=$(get_arcstat l2_mfu_asize) -typeset l2_mru_end=$(get_arcstat l2_mru_asize) -typeset l2_prefetch_end=$(get_arcstat l2_prefetch_asize) -typeset l2_asize_end=$(get_arcstat l2_asize) +typeset l2_mfu_end=$(kstat arcstats.l2_mfu_asize) +typeset l2_mru_end=$(kstat arcstats.l2_mru_asize) +typeset l2_prefetch_end=$(kstat arcstats.l2_prefetch_asize) +typeset l2_asize_end=$(kstat arcstats.l2_asize) log_must test $(( $l2_mru_end + $l2_mfu_end + $l2_prefetch_end - \ $l2_asize_end )) -eq 0 diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh index c9d5d7ffe1f1..8a9e4fa41b7c 100755 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh @@ -71,10 +71,10 @@ log_must fio $FIO_SCRIPTS/random_reads.fio log_must zpool export $TESTPOOL1 log_must zpool import $TESTPOOL1 -d $VDEV1 -typeset starting_miss_count=$(get_arcstat l2_misses) +typeset starting_miss_count=$(kstat arcstats.l2_misses) log_must fio $FIO_SCRIPTS/random_reads.fio -log_must test $(get_arcstat l2_misses) -eq $starting_miss_count +log_must test $(kstat arcstats.l2_misses) -eq $starting_miss_count # I/O to pool with l2arc - expect that l2_misses rises export DIRECTORY=/$TESTPOOL @@ -88,7 +88,7 @@ log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL -d $VDEV log_must fio $FIO_SCRIPTS/random_reads.fio -log_must test $(get_arcstat l2_misses) -gt $starting_miss_count +log_must test $(kstat arcstats.l2_misses) -gt $starting_miss_count log_must zpool destroy -f $TESTPOOL log_must zpool destroy -f $TESTPOOL1 diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh index 89ab940334ee..2c5fc6753152 100755 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh @@ -72,7 +72,7 @@ export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M log_must truncate -s ${cache_sz}M $VDEV_CACHE -typeset log_blk_start=$(get_arcstat l2_log_blk_writes) +typeset log_blk_start=$(kstat arcstats.l2_log_blk_writes) log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE @@ -89,7 +89,7 @@ log_must zpool import -N -d $VDIR $TESTPOOL # will not be 0 (mentioned also in zfs.4) # For the purposes of this test we mitigate this by disabling (predictive) # ZFS prefetches with zfs_prefetch_disable=1. 
-log_must test $(get_arcstat l2_mru_asize) -eq 0 +log_must test $(kstat arcstats.l2_mru_asize) -eq 0 log_must zpool destroy -f $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh index a9968723c3ca..a999f96971fd 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh @@ -88,7 +88,7 @@ arcstat_quiescence_noecho l2_feeds typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | awk '/log_blk_count/ {print $2}') -typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) +typeset l2_rebuild_log_blk_start=$(kstat arcstats.l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL arcstat_quiescence_noecho l2_size diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh index 3b893d28da6a..4c6bc2e2e720 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_002_pos.ksh @@ -94,7 +94,7 @@ arcstat_quiescence_noecho l2_feeds typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | awk '/log_blk_count/ {print $2}') -typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) +typeset l2_rebuild_log_blk_start=$(kstat arcstats.l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh index 14063658e3c5..104d1d484ff2 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_003_neg.ksh @@ -73,12 +73,12 @@ log_must fio $FIO_SCRIPTS/random_reads.fio log_must zpool export $TESTPOOL -typeset l2_success_start=$(get_arcstat l2_rebuild_success) +typeset l2_success_start=$(kstat arcstats.l2_rebuild_success) log_must zpool import -d $VDIR $TESTPOOL log_mustnot test "$(zpool iostat -Hpv $TESTPOOL $VDEV_CACHE | awk '{print $2}')" -gt 80000000 -typeset l2_success_end=$(get_arcstat l2_rebuild_success) +typeset l2_success_end=$(kstat arcstats.l2_rebuild_success) log_mustnot test $l2_success_end -gt $l2_success_start diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh index 8a572c26469c..6460b9a0e7a1 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh @@ -79,7 +79,7 @@ arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL arcstat_quiescence_noecho l2_feeds -typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) +typeset l2_rebuild_log_blk_start=$(kstat arcstats.l2_rebuild_log_blks) typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | awk '/log_blk_count/ {print $2}') log_must zpool import -d $VDIR $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh index 9663437c6597..ce379a566f18 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_005_pos.ksh @@ -76,7 +76,7 @@ arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE arcstat_quiescence_noecho l2_size -typeset 
l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) +typeset l2_rebuild_log_blk_start=$(kstat arcstats.l2_rebuild_log_blks) typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | awk '/log_blk_count/ {print $2}') log_must zpool online $TESTPOOL $VDEV_CACHE diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh index 62563e0dd4cb..fc7824ec6ce5 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -89,9 +89,9 @@ log_must fio $FIO_SCRIPTS/random_reads.fio export RUNTIME=1 typeset do_once=true while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do - typeset l2_size1=$(get_arcstat l2_size) + typeset l2_size1=$(kstat arcstats.l2_size) log_must fio $FIO_SCRIPTS/random_reads.fio - typeset l2_size2=$(get_arcstat l2_size) + typeset l2_size2=$(kstat arcstats.l2_size) do_once=false done From 0dfcfe023e06f60e71b71fb37ed620b7caaa5e98 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 8 Jan 2025 17:14:33 +1100 Subject: [PATCH 12/44] zinject: make iotype extendable I'm about to add a new "type", and I need somewhere to put it! Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16947 --- cmd/zinject/zinject.c | 60 +++++++++++++++++++++++++---------------- include/sys/zfs_ioctl.h | 18 +++++++++++++ module/zfs/zio_inject.c | 30 ++++++++++++++++----- 3 files changed, 79 insertions(+), 29 deletions(-) diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 6c856763c958..d66b0d986f2d 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -242,6 +242,35 @@ err_to_str(int err) return ("[unknown]"); } +static const char *const iotypestrtable[ZINJECT_IOTYPES] = { + [ZINJECT_IOTYPE_NULL] = "null", + [ZINJECT_IOTYPE_READ] = "read", + [ZINJECT_IOTYPE_WRITE] = "write", + [ZINJECT_IOTYPE_FREE] = "free", + [ZINJECT_IOTYPE_CLAIM] = "claim", + [ZINJECT_IOTYPE_FLUSH] = "flush", + [ZINJECT_IOTYPE_TRIM] = "trim", + [ZINJECT_IOTYPE_ALL] = "all", +}; + +static zinject_iotype_t +str_to_iotype(const char *arg) +{ + for (uint_t iotype = 0; iotype < ZINJECT_IOTYPES; iotype++) + if (iotypestrtable[iotype] != NULL && + strcasecmp(iotypestrtable[iotype], arg) == 0) + return (iotype); + return (ZINJECT_IOTYPES); +} + +static const char * +iotype_to_str(zinject_iotype_t iotype) +{ + if (iotype >= ZINJECT_IOTYPES || iotypestrtable[iotype] == NULL) + return ("[unknown]"); + return (iotypestrtable[iotype]); +} + /* * Print usage message. 
*/ @@ -435,10 +464,6 @@ static int print_device_handler(int id, const char *pool, zinject_record_t *record, void *data) { - static const char *iotypestr[] = { - "null", "read", "write", "free", "claim", "flush", "trim", "all", - }; - int *count = data; if (record->zi_guid == 0 || record->zi_func[0] != '\0') @@ -465,7 +490,7 @@ print_device_handler(int id, const char *pool, zinject_record_t *record, (void) printf("%3d %-15s %llx %-5s %-10s %8.4f%% " "%6lu %6lu\n", id, pool, (u_longlong_t)record->zi_guid, - iotypestr[record->zi_iotype], err_to_str(record->zi_error), + iotype_to_str(record->zi_iotype), err_to_str(record->zi_error), freq, record->zi_match_count, record->zi_inject_count); return (0); @@ -866,7 +891,7 @@ main(int argc, char **argv) int quiet = 0; int error = 0; int domount = 0; - int io_type = ZIO_TYPES; + int io_type = ZINJECT_IOTYPE_ALL; int action = VDEV_STATE_UNKNOWN; err_type_t type = TYPE_INVAL; err_type_t label = TYPE_INVAL; @@ -1060,19 +1085,8 @@ main(int argc, char **argv) } break; case 'T': - if (strcasecmp(optarg, "read") == 0) { - io_type = ZIO_TYPE_READ; - } else if (strcasecmp(optarg, "write") == 0) { - io_type = ZIO_TYPE_WRITE; - } else if (strcasecmp(optarg, "free") == 0) { - io_type = ZIO_TYPE_FREE; - } else if (strcasecmp(optarg, "claim") == 0) { - io_type = ZIO_TYPE_CLAIM; - } else if (strcasecmp(optarg, "flush") == 0) { - io_type = ZIO_TYPE_FLUSH; - } else if (strcasecmp(optarg, "all") == 0) { - io_type = ZIO_TYPES; - } else { + io_type = str_to_iotype(optarg); + if (io_type == ZINJECT_IOTYPES) { (void) fprintf(stderr, "invalid I/O type " "'%s': must be 'read', 'write', 'free', " "'claim', 'flush' or 'all'\n", optarg); @@ -1194,7 +1208,7 @@ main(int argc, char **argv) } if (error == EILSEQ && - (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) { + (record.zi_freq == 0 || io_type != ZINJECT_IOTYPE_READ)) { (void) fprintf(stderr, "device corrupt errors require " "io type read and a frequency value\n"); libzfs_fini(g_zfs); @@ -1209,9 +1223,9 @@ main(int argc, char **argv) if (record.zi_nlanes) { switch (io_type) { - case ZIO_TYPE_READ: - case ZIO_TYPE_WRITE: - case ZIO_TYPES: + case ZINJECT_IOTYPE_READ: + case ZINJECT_IOTYPE_WRITE: + case ZINJECT_IOTYPE_ALL: break; default: (void) fprintf(stderr, "I/O type for a delay " diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index e61d7644764e..9afe984e1749 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -456,6 +456,24 @@ typedef enum zinject_type { ZINJECT_DELAY_EXPORT, } zinject_type_t; +typedef enum zinject_iotype { + /* + * Compatibility: zi_iotype used to be set to ZIO_TYPE_, so make sure + * the corresponding ZINJECT_IOTYPE_ matches. Note that existing here + * does not mean that injections are possible for all these types. 
+ */ + ZINJECT_IOTYPE_NULL = ZIO_TYPE_NULL, + ZINJECT_IOTYPE_READ = ZIO_TYPE_READ, + ZINJECT_IOTYPE_WRITE = ZIO_TYPE_WRITE, + ZINJECT_IOTYPE_FREE = ZIO_TYPE_FREE, + ZINJECT_IOTYPE_CLAIM = ZIO_TYPE_CLAIM, + ZINJECT_IOTYPE_FLUSH = ZIO_TYPE_FLUSH, + ZINJECT_IOTYPE_TRIM = ZIO_TYPE_TRIM, + ZINJECT_IOTYPE_ALL = ZIO_TYPES, + /* Room for future expansion for ZIO_TYPE_* */ + ZINJECT_IOTYPES = 16, +} zinject_iotype_t; + typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index f972522b6454..6848d46f73e7 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -376,6 +376,27 @@ zio_inject_bitflip_cb(void *data, size_t len, void *private) return (1); /* stop after first flip */ } +/* Test if this zio matches the iotype from the injection record. */ +static boolean_t +zio_match_iotype(zio_t *zio, uint32_t iotype) +{ + ASSERT3P(zio, !=, NULL); + + /* Unknown iotype, maybe from a newer version of zinject. Reject it. */ + if (iotype >= ZINJECT_IOTYPES) + return (B_FALSE); + + /* Standard IO types, match against ZIO type. */ + if (iotype < ZINJECT_IOTYPE_ALL) + return (iotype == zio->io_type); + + /* Match any standard IO type. */ + if (iotype == ZINJECT_IOTYPE_ALL) + return (B_TRUE); + + return (B_FALSE); +} + static int zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) { @@ -410,9 +431,8 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) } /* Handle type specific I/O failures */ - if (zio != NULL && - handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) + if (zio != NULL && !zio_match_iotype(zio, + handler->zi_record.zi_iotype)) continue; if (handler->zi_record.zi_error == err1 || @@ -635,10 +655,8 @@ zio_handle_io_delay(zio_t *zio) continue; /* also match on I/O type (e.g., -T read) */ - if (handler->zi_record.zi_iotype != ZIO_TYPES && - handler->zi_record.zi_iotype != zio->io_type) { + if (!zio_match_iotype(zio, handler->zi_record.zi_iotype)) continue; - } /* * Defensive; should never happen as the array allocation From a28f5a94f42c7c3961aa1b1d10d1ab316aa98796 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 8 Jan 2025 17:14:33 +1100 Subject: [PATCH 13/44] zinject: add "probe" device injection type Injecting a device probe failure is not possible by matching IO types, because probe IO goes to the label regions, which are explicitly excluded from injection. Even if it were possible, it would be awkward to do, because a probe is a sequence of reads and writes. This commit adds a new IO "type" to match for injection, which looks for the ZIO_FLAG_PROBE flag instead. Any probe IO will match the injection record and receive the wanted error. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
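For illustration, a minimal usage sketch based on the test case added below ($DISK1 and $TESTPOOL stand in for a real leaf vdev and pool):

    # fail any probe I/O issued to this vdev with an I/O error
    zinject -d $DISK1 -e io -T probe $TESTPOOL
    # ... provoke a probe, e.g. by also injecting write errors and forcing a sync ...
    # then remove all handlers again
    zinject -c all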
Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16947 --- cmd/zinject/zinject.c | 1 + include/sys/zfs_ioctl.h | 3 +- man/man8/zinject.8 | 23 +++--- module/zfs/zio_inject.c | 10 ++- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zinject/zinject_probe.ksh | 75 +++++++++++++++++++ 7 files changed, 100 insertions(+), 15 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zinject/zinject_probe.ksh diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index d66b0d986f2d..4374e69a7f94 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -251,6 +251,7 @@ static const char *const iotypestrtable[ZINJECT_IOTYPES] = { [ZINJECT_IOTYPE_FLUSH] = "flush", [ZINJECT_IOTYPE_TRIM] = "trim", [ZINJECT_IOTYPE_ALL] = "all", + [ZINJECT_IOTYPE_PROBE] = "probe", }; static zinject_iotype_t diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 9afe984e1749..a8c3ffc76455 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -471,7 +471,8 @@ typedef enum zinject_iotype { ZINJECT_IOTYPE_TRIM = ZIO_TYPE_TRIM, ZINJECT_IOTYPE_ALL = ZIO_TYPES, /* Room for future expansion for ZIO_TYPE_* */ - ZINJECT_IOTYPES = 16, + ZINJECT_IOTYPE_PROBE = 16, + ZINJECT_IOTYPES, } zinject_iotype_t; typedef struct zfs_share { diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index abccc4d086e0..53461681bb1d 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -19,11 +19,11 @@ .\" CDDL HEADER END .\" .\" Copyright 2013 Darik Horn . All rights reserved. -.\" Copyright (c) 2024, Klara Inc. +.\" Copyright (c) 2024, 2025, Klara, Inc. .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" -.Dd December 2, 2024 +.Dd January 14, 2025 .Dt ZINJECT 8 .Os . @@ -265,15 +265,16 @@ will be translated to the appropriate blkid range according to the object's properties. .It Fl s Ar seconds Run for this many seconds before reporting failure. -.It Fl T Ar failure -Set the failure type to one of -.Sy all , -.Sy flush , -.Sy claim , -.Sy free , -.Sy read , -or -.Sy write . +.It Fl T Ar type +Inject the error into I/O of this type. +.Bl -tag -compact -width "read, write, flush, claim, free" +.It Sy read , Sy write , Sy flush , Sy claim , Sy free +Fundamental I/O types +.It Sy all +All fundamental I/O types +.It Sy probe +Device probe I/O +.El .It Fl t Ar mos_type Set this to .Bl -tag -compact -width "spacemap" diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 6848d46f73e7..f90044299cef 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -386,6 +386,10 @@ zio_match_iotype(zio_t *zio, uint32_t iotype) if (iotype >= ZINJECT_IOTYPES) return (B_FALSE); + /* Probe IOs only match IOTYPE_PROBE, regardless of their type. */ + if (zio->io_flags & ZIO_FLAG_PROBE) + return (iotype == ZINJECT_IOTYPE_PROBE); + /* Standard IO types, match against ZIO type. */ if (iotype < ZINJECT_IOTYPE_ALL) return (iotype == zio->io_type); @@ -405,9 +409,11 @@ zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) /* * We skip over faults in the labels unless it's during device open - * (i.e. zio == NULL) or a device flush (offset is meaningless) + * (i.e. zio == NULL) or a device flush (offset is meaningless). We let + * probe IOs through so we can match them to probe inject records. 
*/ - if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) { + if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH && + !(zio->io_flags & ZIO_FLAG_PROBE)) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index c3e681727cb3..2ba8a1ca4ca5 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -159,7 +159,7 @@ tests = ['json_sanity'] tags = ['functional', 'cli_root', 'json'] [tests/functional/cli_root/zinject] -tests = ['zinject_args', 'zinject_counts'] +tests = ['zinject_args', 'zinject_counts', 'zinject_probe'] pre = post = tags = ['functional', 'cli_root', 'zinject'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 520a2396d9a5..4afbb00957a7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -616,6 +616,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/json/json_sanity.ksh \ functional/cli_root/zinject/zinject_args.ksh \ functional/cli_root/zinject/zinject_counts.ksh \ + functional/cli_root/zinject/zinject_probe.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_probe.ksh b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_probe.ksh new file mode 100755 index 000000000000..22537a54db73 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zinject/zinject_probe.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_assert "Check zinject can correctly inject a probe failure." + +DISK1=${DISKS%% *} + +function cleanup +{ + log_pos zinject -c all + log_pos zpool clear $TESTPOOL + log_pos zpool destroy -f $TESTPOOL + log_pos restore_tunable TXG_TIMEOUT +} + +log_onexit cleanup + +log_must zpool create $TESTPOOL $DISK1 + +# set the txg timeout a long way out, to try and avoid the pool syncing +# between error injection and writing +save_tunable TXG_TIMEOUT +log_must set_tunable32 TXG_TIMEOUT 600 + +# force a sync now +log_must zpool sync -f + +# write stuff. this should go into memory, not written yet +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=1 + +# inject faults +log_must zinject -d $DISK1 -e io -T probe $TESTPOOL +log_must zinject -d $DISK1 -e io -T write $TESTPOOL + +# force the sync now. backgrounded, because the pool will suspend and we don't +# want to block. 
+log_pos zpool sync & + +log_note "waiting for pool to suspend" +typeset -i tries=30 +until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 +done + +log_pass "zinject can correctly inject a probe failure." From 42bad934144ce321cc7eb18d594c44ae2cbc1662 Mon Sep 17 00:00:00 2001 From: rmacklem <64620010+rmacklem@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:33:43 -0800 Subject: [PATCH 14/44] FreeBSD: Add setting of the VFCF_FILEREV flag The flag VFCF_FILEREV was recently defined in FreeBSD so that a file system could indicate that it increments va_filerev by one for each change. Since ZFS does do this, set the flag if defined for the kernel being built. This allows the NFSv4.2 server to reply with the correct change_attr_type attribute value. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rick Macklem Closed #16976 --- module/os/freebsd/zfs/zfs_vfsops.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index a3fac1636981..a367ea8e508a 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -144,12 +144,14 @@ struct vfsops zfs_vfsops = { .vfs_quotactl = zfs_quotactl, }; +VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL #ifdef VFCF_CROSS_COPY_FILE_RANGE -VFS_SET(zfs_vfsops, zfs, - VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE); -#else -VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL); + | VFCF_CROSS_COPY_FILE_RANGE +#endif +#ifdef VFCF_FILEREVINC + | VFCF_FILEREVINC #endif +); /* * We need to keep a count of active fs's. From 6e9911212e12642ef092054f757517b1ca9e0d37 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Wed, 29 Jan 2025 07:18:09 -0700 Subject: [PATCH 15/44] Make the vfs.zfs.vdev.raidz_impl sysctl cross-platform Reviewed-by: Allan Jude Reviewed-by: Alexander Motin Signed-off-by: Alan Somers Sponsored by: ConnectWise Closes #16980 --- include/os/freebsd/spl/sys/mod_os.h | 3 +++ include/sys/vdev_impl.h | 4 +++ include/sys/vdev_raidz.h | 3 +++ module/Kbuild.in | 1 + module/os/freebsd/zfs/sysctl_os.c | 21 +++++++++++++++ module/os/linux/zfs/vdev_raidz.c | 42 +++++++++++++++++++++++++++++ module/zfs/vdev.c | 4 +++ module/zfs/vdev_raidz_math.c | 21 +++++---------- 8 files changed, 84 insertions(+), 15 deletions(-) create mode 100644 module/os/linux/zfs/vdev_raidz.c diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index df7be6fc13f6..1479242de53b 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -94,6 +94,9 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU" +#define param_set_raidz_impl_args(var) \ + CTLTYPE_STRING, NULL, 0, param_set_raidz_impl, "A" + #define spa_taskq_read_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A" diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index abd66b8abc96..d45a5913dc0f 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -645,6 +645,10 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); +#if defined(__linux__) +int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp); +#endif +int 
param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS); /* * Vdev ashift optimization tunables diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 64f484e9aa13..ed042aedbdbc 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -66,6 +66,8 @@ extern const zio_vsd_ops_t vdev_raidz_vsd_ops; /* * vdev_raidz_math interface */ +/* Required, but not used, by ZFS_MODULE_PARAM_CALL */ +extern uint32_t zfs_vdev_raidz_impl; void vdev_raidz_math_init(void); void vdev_raidz_math_fini(void); const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); @@ -73,6 +75,7 @@ int vdev_raidz_math_generate(struct raidz_map *, struct raidz_row *); int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +int vdev_raidz_impl_get(char *buffer, size_t size); typedef struct vdev_raidz_expand { uint64_t vre_vdev_id; diff --git a/module/Kbuild.in b/module/Kbuild.in index dcbdbc912f6d..6fbc441fcaa9 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -448,6 +448,7 @@ ZFS_OBJS_OS := \ trace.o \ vdev_disk.o \ vdev_file.o \ + vdev_raidz.o \ vdev_label_os.o \ zfs_acl.o \ zfs_ctldir.o \ diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 7350b8a6d49f..bddb25a07204 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -679,6 +679,27 @@ param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) return (-param_set_deadman_failmode_common(buf)); } +int +param_set_raidz_impl(SYSCTL_HANDLER_ARGS) +{ + const size_t bufsize = 128; + char *buf; + int rc; + + buf = malloc(bufsize, M_SOLARIS, M_WAITOK | M_ZERO); + if (req->newptr == NULL) + vdev_raidz_impl_get(buf, bufsize); + + rc = sysctl_handle_string(oidp, buf, bufsize, req); + if (rc || req->newptr == NULL) { + free(buf, M_SOLARIS); + return (rc); + } + rc = vdev_raidz_impl_set(buf); + free(buf, M_SOLARIS); + return (rc); +} + int param_set_slop_shift(SYSCTL_HANDLER_ARGS) { diff --git a/module/os/linux/zfs/vdev_raidz.c b/module/os/linux/zfs/vdev_raidz.c new file mode 100644 index 000000000000..0b34ca52fb90 --- /dev/null +++ b/module/os/linux/zfs/vdev_raidz.c @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (C) 2025 ConnectWise */ + +#include +#include +#include +#include +#include + +int +param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp) +{ + return (vdev_raidz_impl_get(buf, PAGE_SIZE)); +} + +int +param_set_raidz_impl(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = vdev_raidz_impl_set(val); + return (error); +} diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d9c5871820ca..310319fdb052 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -6580,3 +6580,7 @@ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl, + param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW, + "RAIDZ implementation"); diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index e12b96170f55..340d32b61bf8 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -81,7 +81,7 @@ static boolean_t raidz_math_initialized = B_FALSE; #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) -static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; +uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; static uint32_t user_sel_impl = IMPL_FASTEST; /* Hold all supported implementations */ @@ -633,16 +633,10 @@ vdev_raidz_impl_set(const char *val) return (err); } -#if defined(_KERNEL) && defined(__linux__) - -static int -zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) -{ - return (vdev_raidz_impl_set(val)); -} +#if defined(_KERNEL) -static int -zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) +int +vdev_raidz_impl_get(char *buffer, size_t size) { int i, cnt = 0; char *fmt; @@ -653,21 +647,18 @@ zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; - cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + cnt += kmem_scnprintf(buffer + cnt, size - cnt, fmt, math_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < raidz_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; - cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + cnt += kmem_scnprintf(buffer + cnt, size - cnt, fmt, raidz_supp_impl[i]->name); } return (cnt); } -module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, - zfs_vdev_raidz_impl_get, NULL, 0644); -MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); #endif From 0e21e473a7702a1760cd62d6a50d93748132d283 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 30 Jan 2025 18:53:59 -0500 Subject: [PATCH 16/44] Update pin_user_pages() calls for Direct I/O Originally #16856 updated Linux Direct I/O requests to use the new pin_user_pages API. However, it was an oversight that this PR only handled iov_iter's of type ITER_IOVEC and ITER_UBUF. Other iov_iter types may try and use the pin_user_pages API if it is available. This can lead to panics as the iov_iter is not being iterated over correctly in zfs_uio_pin_user_pages(). Unfortunately, generic iov_iter API's that call pin_user_page_fast() are protected as GPL only. 
Rather than update zfs_uio_pin_user_pages() to account for all iov_iter types, we can simply just call zfs_uio_get_dio_page_iov_iter() if the iov_iter type is not ITER_IOVEC or ITER_UBUF. zfs_uio_get_dio_page_iov_iter() calls the iov_iter_get_pages() calls that can handle any iov_iter type. In the future it might be worth using the exposed iov_iter iterator functions that are included in the header iov_iter.h since v6.7. These functions allow for any iov_iter type to be iterated over and advanced while applying a step function during iteration. This could possibly be leveraged in zfs_uio_pin_user_pages(). A new ZFS test case was added to test that a ITER_BVEC is handled correctly using this new code path. This test case was provided though issue #16956. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Brian Atkinson Closes #16956 Closes #17006 --- config/kernel-vfs-iov_iter.m4 | 27 +++++++ include/os/linux/spl/sys/uio.h | 10 +++ module/os/linux/zfs/zfs_uio.c | 64 +++++++++------ tests/runfiles/linux.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/direct/dio_loopback_dev.ksh | 78 +++++++++++++++++++ 6 files changed, 157 insertions(+), 25 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/direct/dio_loopback_dev.ksh diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index a223343030db..dc4e11cef2e9 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -21,6 +21,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) + ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ + #include + ],[ + struct iov_iter iter = { 0 }; + struct page **pages = NULL; + size_t maxsize = 4096; + unsigned maxpages = 1; + size_t start; + size_t ret __attribute__ ((unused)); + + ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages, + &start); + ]) + ZFS_LINUX_TEST_SRC([iter_is_ubuf], [ #include ],[ @@ -64,6 +78,19 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_MSG_RESULT(no) ]) + + dnl # + dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2(). + dnl # + AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, + [iov_iter_get_pages2() is available]) + ],[ + AC_MSG_RESULT(no) + ]) + dnl # dnl # Kernel 6.0 introduced the ITER_UBUF iov_iter type. iter_is_ubuf() dnl # was also added to determine if the iov_iter is an ITER_UBUF. 
diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 9e7afea2ab34..fcb4a464c9e4 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -63,6 +63,7 @@ typedef enum zfs_uio_seg { typedef struct { struct page **pages; /* Mapped pages */ long npages; /* Number of mapped pages */ + boolean_t pinned; /* Whether FOLL_PIN was used */ } zfs_uio_dio_t; typedef struct zfs_uio { @@ -199,4 +200,13 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, #define zfs_uio_iov_iter_type(iter) (iter)->type #endif +#if defined(HAVE_ITER_IS_UBUF) +#define zfs_user_backed_iov_iter(iter) \ + (iter_is_ubuf((iter)) || \ + (zfs_uio_iov_iter_type((iter)) == ITER_IOVEC)) +#else +#define zfs_user_backed_iov_iter(iter) \ + (zfs_uio_iov_iter_type((iter)) == ITER_IOVEC) +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index db85b626f12a..1a815c62b19a 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -404,7 +404,6 @@ zfs_uio_page_aligned(zfs_uio_t *uio) return (aligned); } - #if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64) #define ZFS_MARKEED_PAGE 0x0 #define IS_ZFS_MARKED_PAGE(_p) 0 @@ -441,7 +440,6 @@ zfs_unmark_page(struct page *page) } #endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */ -#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED) static void zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) { @@ -473,7 +471,6 @@ zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) } } } -#endif void zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) @@ -482,21 +479,24 @@ zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT(uio->uio_extflg & UIO_DIRECT); ASSERT3P(uio->uio_dio.pages, !=, NULL); + if (uio->uio_dio.pinned) { #if defined(HAVE_PIN_USER_PAGES_UNLOCKED) - unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); -#else - for (long i = 0; i < uio->uio_dio.npages; i++) { - struct page *p = uio->uio_dio.pages[i]; + unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); +#endif + } else { + for (long i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; - if (IS_ZFS_MARKED_PAGE(p)) { - zfs_unmark_page(p); - __free_page(p); - continue; - } + if (IS_ZFS_MARKED_PAGE(p)) { + zfs_unmark_page(p); + __free_page(p); + continue; + } - put_page(p); + put_page(p); + } } -#endif + vmem_free(uio->uio_dio.pages, uio->uio_dio.npages * sizeof (struct page *)); } @@ -523,6 +523,7 @@ zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) if (len == 0) return (0); + uio->uio_dio.pinned = B_TRUE; #if defined(HAVE_ITER_IS_UBUF) if (iter_is_ubuf(uio->uio_iter)) { nr_pages = DIV_ROUND_UP(len, PAGE_SIZE); @@ -569,8 +570,8 @@ zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) return (0); } +#endif -#else static int zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) { @@ -581,9 +582,15 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); while (wanted) { +#if defined(HAVE_IOV_ITER_GET_PAGES2) + cnt = iov_iter_get_pages2(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &start); +#else cnt = iov_iter_get_pages(uio->uio_iter, &uio->uio_dio.pages[uio->uio_dio.npages], wanted, maxpages, &start); +#endif if (cnt < 0) { iov_iter_revert(uio->uio_iter, rollback); return (SET_ERROR(-cnt)); @@ -595,7 +602,12 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE); rollback += 
cnt; wanted -= cnt; +#if !defined(HAVE_IOV_ITER_GET_PAGES2) + /* + * iov_iter_get_pages2() advances the iov_iter on success. + */ iov_iter_advance(uio->uio_iter, cnt); +#endif } ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); @@ -603,7 +615,6 @@ zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) return (0); } -#endif /* HAVE_PIN_USER_PAGES_UNLOCKED */ /* * This function pins user pages. In the event that the user pages were not @@ -621,7 +632,10 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) if (uio->uio_segflg == UIO_ITER) { uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); #if defined(HAVE_PIN_USER_PAGES_UNLOCKED) - error = zfs_uio_pin_user_pages(uio, rw); + if (zfs_user_backed_iov_iter(uio->uio_iter)) + error = zfs_uio_pin_user_pages(uio, rw); + else + error = zfs_uio_get_dio_pages_iov_iter(uio, rw); #else error = zfs_uio_get_dio_pages_iov_iter(uio, rw); #endif @@ -632,22 +646,24 @@ zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) ASSERT3S(uio->uio_dio.npages, >=, 0); if (error) { + if (uio->uio_dio.pinned) { #if defined(HAVE_PIN_USER_PAGES_UNLOCKED) - unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages); -#else - for (long i = 0; i < uio->uio_dio.npages; i++) - put_page(uio->uio_dio.pages[i]); + unpin_user_pages(uio->uio_dio.pages, + uio->uio_dio.npages); #endif + } else { + for (long i = 0; i < uio->uio_dio.npages; i++) + put_page(uio->uio_dio.pages[i]); + } + vmem_free(uio->uio_dio.pages, size); return (error); } else { ASSERT3S(uio->uio_dio.npages, ==, npages); } -#if !defined(HAVE_PIN_USER_PAGES_UNLOCKED) - if (rw == UIO_WRITE) + if (rw == UIO_WRITE && !uio->uio_dio.pinned) zfs_uio_dio_check_for_zero_page(uio); -#endif uio->uio_extflg |= UIO_DIRECT; diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index e55ec583d2cc..2c5dcb3650fd 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -103,7 +103,7 @@ tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] [tests/functional/direct:Linux] -tests = ['dio_write_verify'] +tests = ['dio_loopback_dev', 'dio_write_verify'] tags = ['functional', 'direct'] [tests/functional/events:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4afbb00957a7..43519dc18b95 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1475,6 +1475,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/direct/dio_dedup.ksh \ functional/direct/dio_encryption.ksh \ functional/direct/dio_grow_block.ksh \ + functional/direct/dio_loopback_dev.ksh \ functional/direct/dio_max_recordsize.ksh \ functional/direct/dio_mixed.ksh \ functional/direct/dio_mmap.ksh \ diff --git a/tests/zfs-tests/tests/functional/direct/dio_loopback_dev.ksh b/tests/zfs-tests/tests/functional/direct/dio_loopback_dev.ksh new file mode 100755 index 000000000000..7186eba5aafc --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_loopback_dev.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify Direct I/O reads work with loopback devices using direct=always. +# +# STRATEGY: +# 1. Create raidz zpool. +# 2. Create dataset with the direct dataset property set to always. +# 3. Create an empty file in dataset and setup loop device on it. +# 4. Read from loopback device. +# + +verify_runnable "global" + +function cleanup +{ + if [[ -n $lofidev ]]; then + losetup -d $lofidev + fi + dio_cleanup +} + +log_assert "Verify loopback devices with Direct I/O." + +if ! is_linux; then + log_unsupported "This is just a check for Linux Direct I/O" +fi + +log_onexit cleanup + +# Create zpool +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS +log_must create_pool $TESTPOOL1 "raidz" $DIO_VDEVS + +# Creating dataset with direct=always +log_must eval "zfs create -o direct=always $TESTPOOL1/$TESTFS1" +mntpt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + +# Getting a loopback device +lofidev=$(losetup -f) + +# Create loopback device +log_must truncate -s 1M "$mntpt/temp_file" +log_must losetup $lofidev "$mntpt/temp_file" + +# Read from looback device to make sure Direct I/O works with loopback device +log_must dd if=$lofidev of=/dev/null count=1 bs=4k + +log_pass "Verified loopback devices for Direct I/O." From 1aa4351c1ff460e5d95b7174046cf62de53bdc93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jerzy=20Ko=C5=82osowski?= Date: Fri, 31 Jan 2025 19:00:59 +0000 Subject: [PATCH 17/44] Add recursive dataset mounting and unmounting support to pam_zfs_key (#16857) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduced functionality to recursively mount datasets with a new config option `mount_recursively`. Adjusted existing functions to handle the recursive behavior and added tests to validate the feature. This enhances support for managing hierarchical ZFS datasets within a PAM context. Signed-off-by: Jerzy Kołosowski Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter --- contrib/pam_zfs_key/pam_zfs_key.c | 304 +++++++++++++----- tests/runfiles/freebsd.run | 4 +- tests/runfiles/linux.run | 4 +- tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/pam/cleanup.ksh | 1 + .../functional/pam/pam_mount_recursively.ksh | 90 ++++++ 6 files changed, 326 insertions(+), 78 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/pam/pam_mount_recursively.ksh diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 08a8640669b3..c617a6e6b370 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -63,6 +63,7 @@ pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) 
#include #include #include +#include #include @@ -370,67 +371,6 @@ change_key(pam_handle_t *pamh, const char *ds_name, return (0); } -static int -decrypt_mount(pam_handle_t *pamh, const char *ds_name, - const char *passphrase, boolean_t noop) -{ - zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); - if (ds == NULL) { - pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); - return (-1); - } - pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, NULL); - if (key == NULL) { - zfs_close(ds); - return (-1); - } - int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, - WRAPPING_KEY_LEN); - pw_free(key); - if (ret && ret != EEXIST) { - pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); - zfs_close(ds); - return (-1); - } - if (noop) { - goto out; - } - ret = zfs_mount(ds, NULL, 0); - if (ret) { - pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); - zfs_close(ds); - return (-1); - } -out: - zfs_close(ds); - return (0); -} - -static int -unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force) -{ - zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); - if (ds == NULL) { - pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); - return (-1); - } - int ret = zfs_unmount(ds, NULL, force ? MS_FORCE : 0); - if (ret) { - pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); - zfs_close(ds); - return (-1); - } - - ret = lzc_unload_key(ds_name); - if (ret) { - pam_syslog(pamh, LOG_ERR, "unload_key failed with: %d", ret); - zfs_close(ds); - return (-1); - } - zfs_close(ds); - return (0); -} - typedef struct { char *homes_prefix; char *runstatedir; @@ -443,6 +383,7 @@ typedef struct { boolean_t unmount_and_unload; boolean_t force_unmount; boolean_t recursive_homes; + boolean_t mount_recursively; } zfs_key_config_t; static int @@ -481,6 +422,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->unmount_and_unload = B_TRUE; config->force_unmount = B_FALSE; config->recursive_homes = B_FALSE; + config->mount_recursively = B_FALSE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -500,6 +442,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->force_unmount = B_TRUE; } else if (strcmp(argv[c], "recursive_homes") == 0) { config->recursive_homes = B_TRUE; + } else if (strcmp(argv[c], "mount_recursively") == 0) { + config->mount_recursively = B_TRUE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); @@ -508,6 +452,217 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, return (PAM_SUCCESS); } +typedef struct { + pam_handle_t *pamh; + zfs_key_config_t *target; +} mount_umount_dataset_data_t; + +static int +mount_dataset(zfs_handle_t *zhp, void *data) +{ + mount_umount_dataset_data_t *mount_umount_dataset_data = data; + + zfs_key_config_t *target = mount_umount_dataset_data->target; + pam_handle_t *pamh = mount_umount_dataset_data->pamh; + + /* Refresh properties to get the latest key status */ + zfs_refresh_properties(zhp); + + int ret = 0; + + /* Check if dataset type is filesystem */ + if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) { + pam_syslog(pamh, LOG_DEBUG, + "dataset is not filesystem: %s, skipping.", + zfs_get_name(zhp)); + return (0); + } + + /* Check if encryption key is available */ + if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_UNAVAILABLE) { + pam_syslog(pamh, LOG_WARNING, + "key unavailable for: %s, skipping", + 
zfs_get_name(zhp)); + return (0); + } + + /* Check if prop canmount is on */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) != ZFS_CANMOUNT_ON) { + pam_syslog(pamh, LOG_INFO, + "canmount is not on for: %s, skipping", + zfs_get_name(zhp)); + return (0); + } + + /* Get mountpoint prop for check */ + char mountpoint[ZFS_MAXPROPLEN]; + if ((ret = zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, 1)) != 0) { + pam_syslog(pamh, LOG_ERR, + "failed to get mountpoint prop: %d", ret); + return (-1); + } + + /* Check if mountpoint isn't none or legacy */ + if (strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) == 0 || + strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) { + pam_syslog(pamh, LOG_INFO, + "mountpoint is none or legacy for: %s, skipping", + zfs_get_name(zhp)); + return (0); + } + + /* Don't mount the dataset if already mounted */ + if (zfs_is_mounted(zhp, NULL)) { + pam_syslog(pamh, LOG_INFO, "already mounted: %s", + zfs_get_name(zhp)); + return (0); + } + + /* Mount the dataset */ + ret = zfs_mount(zhp, NULL, 0); + if (ret) { + pam_syslog(pamh, LOG_ERR, + "zfs_mount failed for %s with: %d", zfs_get_name(zhp), + ret); + return (ret); + } + + /* Recursively mount children if the recursive flag is set */ + if (target->mount_recursively) { + ret = zfs_iter_filesystems_v2(zhp, 0, mount_dataset, data); + if (ret != 0) { + pam_syslog(pamh, LOG_ERR, + "child iteration failed: %d", ret); + return (-1); + } + } + + return (ret); +} + +static int +umount_dataset(zfs_handle_t *zhp, void *data) +{ + mount_umount_dataset_data_t *mount_umount_dataset_data = data; + + zfs_key_config_t *target = mount_umount_dataset_data->target; + pam_handle_t *pamh = mount_umount_dataset_data->pamh; + + int ret = 0; + /* Recursively umount children if the recursive flag is set */ + if (target->mount_recursively) { + ret = zfs_iter_filesystems_v2(zhp, 0, umount_dataset, data); + if (ret != 0) { + pam_syslog(pamh, LOG_ERR, + "child iteration failed: %d", ret); + return (-1); + } + } + + /* Check if dataset type is filesystem */ + if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) { + pam_syslog(pamh, LOG_DEBUG, + "dataset is not filesystem: %s, skipping", + zfs_get_name(zhp)); + return (0); + } + + /* Don't umount the dataset if already unmounted */ + if (zfs_is_mounted(zhp, NULL) == 0) { + pam_syslog(pamh, LOG_INFO, "already unmounted: %s", + zfs_get_name(zhp)); + return (0); + } + + /* Unmount the dataset */ + ret = zfs_unmount(zhp, NULL, target->force_unmount ? 
MS_FORCE : 0); + if (ret) { + pam_syslog(pamh, LOG_ERR, + "zfs_unmount failed for %s with: %d", zfs_get_name(zhp), + ret); + return (ret); + } + + return (ret); +} + +static int +decrypt_mount(pam_handle_t *pamh, zfs_key_config_t *config, const char *ds_name, + const char *passphrase, boolean_t noop) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, NULL); + if (key == NULL) { + zfs_close(ds); + return (-1); + } + int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, + WRAPPING_KEY_LEN); + pw_free(key); + if (ret && ret != EEXIST) { + pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); + zfs_close(ds); + return (-1); + } + + if (noop) { + zfs_close(ds); + return (0); + } + + mount_umount_dataset_data_t data; + data.pamh = pamh; + data.target = config; + + ret = mount_dataset(ds, &data); + if (ret != 0) { + pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); + zfs_close(ds); + return (-1); + } + + zfs_close(ds); + return (0); +} + +static int +unmount_unload(pam_handle_t *pamh, const char *ds_name, + zfs_key_config_t *target) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + + mount_umount_dataset_data_t data; + data.pamh = pamh; + data.target = target; + + int ret = umount_dataset(ds, &data); + if (ret) { + pam_syslog(pamh, LOG_ERR, + "unmount_dataset failed with: %d", ret); + zfs_close(ds); + return (-1); + } + + ret = lzc_unload_key(ds_name); + if (ret) { + pam_syslog(pamh, LOG_ERR, "unload_key failed with: %d", ret); + zfs_close(ds); + return (-1); + } + zfs_close(ds); + return (0); +} + static void zfs_key_config_free(zfs_key_config_t *config) { @@ -548,7 +703,7 @@ find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) } static char * -zfs_key_config_get_dataset(zfs_key_config_t *config) +zfs_key_config_get_dataset(pam_handle_t *pamh, zfs_key_config_t *config) { if (config->homedir != NULL && config->homes_prefix != NULL) { @@ -559,7 +714,7 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, ZFS_TYPE_FILESYSTEM); if (zhp == NULL) { - pam_syslog(NULL, LOG_ERR, + pam_syslog(pamh, LOG_ERR, "dataset %s not found", config->homes_prefix); return (NULL); @@ -697,13 +852,13 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - char *dataset = zfs_key_config_get_dataset(&config); + char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - if (decrypt_mount(pamh, dataset, token->value, B_TRUE) == -1) { + if (decrypt_mount(pamh, &config, dataset, token->value, B_TRUE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); @@ -749,7 +904,7 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - char *dataset = zfs_key_config_get_dataset(&config); + char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); @@ -763,7 +918,7 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - if (decrypt_mount(pamh, dataset, + if (decrypt_mount(pamh, &config, dataset, old_token->value, B_TRUE) == -1) { 
pam_syslog(pamh, LOG_ERR, "old token mismatch"); @@ -784,7 +939,7 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, pw_clear(pamh, OLD_PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } - char *dataset = zfs_key_config_get_dataset(&config); + char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); @@ -793,7 +948,7 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, return (PAM_SERVICE_ERR); } int was_loaded = is_key_loaded(pamh, dataset); - if (!was_loaded && decrypt_mount(pamh, dataset, + if (!was_loaded && decrypt_mount(pamh, &config, dataset, old_token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); @@ -804,7 +959,7 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, } int changed = change_key(pamh, dataset, token->value); if (!was_loaded) { - unmount_unload(pamh, dataset, config.force_unmount); + unmount_unload(pamh, dataset, &config); } free(dataset); pam_zfs_free(); @@ -856,13 +1011,14 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - char *dataset = zfs_key_config_get_dataset(&config); + char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - if (decrypt_mount(pamh, dataset, token->value, B_FALSE) == -1) { + if (decrypt_mount(pamh, &config, dataset, + token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); @@ -910,13 +1066,13 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - char *dataset = zfs_key_config_get_dataset(&config); + char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SESSION_ERR); } - if (unmount_unload(pamh, dataset, config.force_unmount) == -1) { + if (unmount_unload(pamh, dataset, &config) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run index e1ae0c6b7721..943c8eab2715 100644 --- a/tests/runfiles/freebsd.run +++ b/tests/runfiles/freebsd.run @@ -27,8 +27,8 @@ tests = ['zfs_jail_001_pos'] tags = ['functional', 'cli_root', 'zfs_jail'] [tests/functional/pam:FreeBSD] -tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', - 'pam_short_password'] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_mount_recursively', + 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/direct:FreeBSD] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 2c5dcb3650fd..275772f2820e 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -169,8 +169,8 @@ tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', - 'pam_short_password'] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_mount_recursively', + 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 43519dc18b95..fbb6621585c3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1695,6 +1695,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ 
functional/pam/pam_change_unmounted.ksh \ + functional/pam/pam_mount_recursively.ksh \ functional/pam/pam_nounmount.ksh \ functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh index bfb98cd30707..5bb6e518edb0 100755 --- a/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -26,5 +26,6 @@ rmconfig destroy_pool $TESTPOOL del_user ${username} del_user ${username}rec +del_user ${username}mrec del_group pamtestgroup log_must rm -rf "$runstatedir" diff --git a/tests/zfs-tests/tests/functional/pam/pam_mount_recursively.ksh b/tests/zfs-tests/tests/functional/pam/pam_mount_recursively.ksh new file mode 100755 index 000000000000..93683da7d7db --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_mount_recursively.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if [ -n "$ASAN_OPTIONS" ]; then + export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}') +fi + +username="${username}mrec" + +# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests, +# and a user which references that mountpoint +log_must zfs create "$TESTPOOL/mrec" +log_must zfs create -o mountpoint="$TESTDIR/mrec" "$TESTPOOL/mrec/pam" +echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \ + -o keylocation=prompt "$TESTPOOL/mrec/pam/${username}" +log_must zfs create "$TESTPOOL/mrec/pam/${username}/deep" +log_must zfs create "$TESTPOOL/mrec/pam/${username}/deep/deeper" +log_must zfs create -o mountpoint=none "$TESTPOOL/mrec/pam/${username}/deep/none" +log_must zfs create -o canmount=noauto "$TESTPOOL/mrec/pam/${username}/deep/noauto" +log_must zfs create -o canmount=off "$TESTPOOL/mrec/pam/${username}/deep/off" +log_must zfs unmount "$TESTPOOL/mrec/pam/${username}" +log_must zfs unload-key "$TESTPOOL/mrec/pam/${username}" +log_must add_user pamtestgroup ${username} "$TESTDIR/mrec" + +function keystatus { + log_must [ "$(get_prop keystatus "$TESTPOOL/mrec/pam/${username}")" = "$1" ] +} + +log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}" +keystatus unavailable + +function test_session { + echo "recurpass" | pamtester ${pamservice} ${username} open_session + references 1 + log_must ismounted "$TESTPOOL/mrec/pam/${username}" + log_must ismounted "$TESTPOOL/mrec/pam/${username}/deep" + log_must ismounted "$TESTPOOL/mrec/pam/${username}/deep/deeper" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep/none" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep/noauto" + log_mustnot ismounted 
"$TESTPOOL/mrec/pam/${username}/deep/off" + keystatus available + + log_must pamtester ${pamservice} ${username} close_session + references 0 + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep/deeper" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep/none" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep/noauto" + log_mustnot ismounted "$TESTPOOL/mrec/pam/${username}/deep/off" + keystatus unavailable +} + +genconfig "homes=$TESTPOOL/mrec/pam mount_recursively runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL/mrec/pam prop_mountpoint mount_recursively runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL/mrec recursive_homes prop_mountpoint mount_recursively runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint mount_recursively runstatedir=${runstatedir}" +test_session + +genconfig "homes=* recursive_homes prop_mountpoint mount_recursively runstatedir=${runstatedir}" +test_session + +log_pass "done." From 67f0469f7019ad9c08bbd62744a507bac18df37f Mon Sep 17 00:00:00 2001 From: Jaydeep Kshirsagar Date: Sat, 1 Feb 2025 08:15:24 -0800 Subject: [PATCH 18/44] Avoid ARC buffer transfrom operations in prefetch This change will prevent prefetch to perform unnecessary ARC buffer fill when reading from disk. Reviewed-by: Alexander Motin Signed-off-by: Jaydeep Kshirsagar Co-authored-by: Alexander Motin Closes #17013 --- module/zfs/arc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index fa7baac04b7b..c6383d03a4a4 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5916,6 +5916,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; + acb->acb_nobuf = no_buf; acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); From 3b5c3f52d2aa79075a96f18d63547cea2c79ac93 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 5 Feb 2025 00:47:50 +1100 Subject: [PATCH 19/44] zio: lock parent zios when updating wait counts on reexecute As zios are reexecuted after resume from suspension, their ready and wait states need to be propagated to wait counts on all their parents. It's possible for those parents to have active children passing through READY or DONE, which then end up in zio_notify_parent(), take their parent's lock, and decrement the wait count. Without also taking a lock here, it's possible for an increment race to occur, which leads to either there being no references left (tripping the assert in zio_notify_parent()), or a parent waiting forever for a nonexistent child to complete. To protect against this, we simply take the appropriate zio locks in zio_reexecute() before updating the wait counts. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Allan Jude Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17016 --- module/zfs/zio.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index bd6752f00ac5..ae5340da9f00 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -23,7 +23,7 @@ * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2019, 2023, 2024, Klara Inc. 
+ * Copyright (c) 2019, 2023, 2024, 2025, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. * Copyright (c) 2021, 2024 by George Melikov. All rights reserved. @@ -2537,13 +2537,29 @@ zio_reexecute(void *arg) pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) || (pio->io_pipeline & ZIO_STAGE_READY) == 0; pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE); + + /* + * It's possible for a failed ZIO to be a descendant of more than one + * ZIO tree. When reexecuting it, we have to be sure to add its wait + * states to all parent wait counts. + * + * Those parents, in turn, may have other children that are currently + * active, usually because they've already been reexecuted after + * resuming. Those children may be executing and may call + * zio_notify_parent() at the same time as we're updating our parent's + * counts. To avoid races while updating the counts, we take + * gio->io_lock before each update. + */ zio_link_t *zl = NULL; while ((gio = zio_walk_parents(pio, &zl)) != NULL) { + mutex_enter(&gio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) { gio->io_children[pio->io_child_type][w] += !pio->io_state[w]; } + mutex_exit(&gio->io_lock); } + for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; From 51bec1606094f317b26195e48ccb2bd79613273c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 5 Feb 2025 17:14:20 +1100 Subject: [PATCH 20/44] Linux 6.14: dops->d_revalidate now takes four args This is a convenience for filesystems that need the inode of their parent or their own name, as its often complicated to get that information. We don't need those things, so this is just detecting which prototype is expected and adjusting our callback to match. Sponsored-by: https://despairlabs.com/sponsor/ Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- config/kernel-automount.m4 | 41 ++++++++++++++++++++++++++++++-- module/os/linux/zfs/zpl_ctldir.c | 6 +++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/config/kernel-automount.m4 b/config/kernel-automount.m4 index 52f1931b748e..b5f1392d0fcd 100644 --- a/config/kernel-automount.m4 +++ b/config/kernel-automount.m4 @@ -5,7 +5,7 @@ dnl # solution to handling automounts. Prior to this cifs/nfs clients dnl # which required automount support would abuse the follow_link() dnl # operation on directories for this purpose. dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_AUTOMOUNT], [ +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_AUTOMOUNT], [ ZFS_LINUX_TEST_SRC([dentry_operations_d_automount], [ #include static struct vfsmount *d_automount(struct path *p) { return NULL; } @@ -15,7 +15,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_AUTOMOUNT], [ ]) ]) -AC_DEFUN([ZFS_AC_KERNEL_AUTOMOUNT], [ +AC_DEFUN([ZFS_AC_KERNEL_D_AUTOMOUNT], [ AC_MSG_CHECKING([whether dops->d_automount() exists]) ZFS_LINUX_TEST_RESULT([dentry_operations_d_automount], [ AC_MSG_RESULT(yes) @@ -23,3 +23,40 @@ AC_DEFUN([ZFS_AC_KERNEL_AUTOMOUNT], [ ZFS_LINUX_TEST_ERROR([dops->d_automount()]) ]) ]) + +dnl # +dnl # 6.14 API change +dnl # dops->d_revalidate now has four args. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_REVALIDATE_4ARGS], [ + ZFS_LINUX_TEST_SRC([dentry_operations_d_revalidate_4args], [ + #include + static int d_revalidate(struct inode *dir, + const struct qstr *name, struct dentry *dentry, + unsigned int fl) { return 0; } + struct dentry_operations dops __attribute__ ((unused)) = { + .d_revalidate = d_revalidate, + }; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_REVALIDATE_4ARGS], [ + AC_MSG_CHECKING([whether dops->d_revalidate() takes 4 args]) + ZFS_LINUX_TEST_RESULT([dentry_operations_d_revalidate_4args], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_REVALIDATE_4ARGS, 1, + [dops->d_revalidate() takes 4 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_AUTOMOUNT], [ + ZFS_AC_KERNEL_SRC_D_AUTOMOUNT + ZFS_AC_KERNEL_SRC_D_REVALIDATE_4ARGS +]) + +AC_DEFUN([ZFS_AC_KERNEL_AUTOMOUNT], [ + ZFS_AC_KERNEL_D_AUTOMOUNT + ZFS_AC_KERNEL_D_REVALIDATE_4ARGS +]) diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index fe64bc710387..11438ab61475 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -189,8 +189,14 @@ zpl_snapdir_automount(struct path *path) * as of the 3.18 kernel revaliding the mountpoint dentry will result in * the snapshot being immediately unmounted. */ +#ifdef HAVE_D_REVALIDATE_4ARGS +static int +zpl_snapdir_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) +#else static int zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) +#endif { return (!!dentry->d_inode); } From 3266d4d65572f2a603503886af8cca8ffbe67800 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 5 Feb 2025 17:52:45 +1100 Subject: [PATCH 21/44] Linux 6.14: BLK_MQ_F_SHOULD_MERGE was removed According to the upstream change, all callers set it, and all block devices either honoured it or ignored it, so removing it entirely allows a bunch of handling for the "unset" case to be removed, and it becomes effectively implied. We follow suit, and keep setting it for older kernels. Sponsored-by: https://despairlabs.com/sponsor/ Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- module/os/linux/zfs/zvol_os.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7c9aae6a66af..7c5d567c3239 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -202,7 +202,16 @@ static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) * We need BLK_MQ_F_BLOCKING here since we do blocking calls in * zvol_request_impl() */ - zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + zso->tag_set.flags = BLK_MQ_F_BLOCKING; + +#ifdef BLK_MQ_F_SHOULD_MERGE + /* + * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. + * For older kernels, we set it. + */ + zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; +#endif + zso->tag_set.driver_data = zv; return (blk_mq_alloc_tag_set(&zso->tag_set)); From d1b0c4ec5aadae919c84b979b784b03c7f58214d Mon Sep 17 00:00:00 2001 From: mnrx <83848843+mnrx@users.noreply.github.com> Date: Wed, 5 Feb 2025 22:47:03 +0000 Subject: [PATCH 22/44] Clarify documentation of `zfs destroy` on snapshots (#17021) The current documentation of `zfs destroy` in application to snapshots is particularly difficult to understand. The following changes are made: - Remove circular reference to `zfs destroy` in the documentation of that command. 
- Remove use of "for example", which implies there are more, undocumented reasons that ZFS may fail to destroy a snapshot immediately. - Mention properties `defer_destroy` and `userrefs`. - Add `zfsprops(8)` to "SEE ALSO" list. - Clarify meaning of `-d` option. Requires-builders: none Signed-off-by: mnrx <83848843+mnrx@users.noreply.github.com> Co-authored-by: Alexander Motin Reviewed-by: Alexander Motin Reviewed-by: George Amanakis Reviewed-by: Tony Hutter --- man/man8/zfs-destroy.8 | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/man/man8/zfs-destroy.8 b/man/man8/zfs-destroy.8 index 247c561322bf..97596b28444b 100644 --- a/man/man8/zfs-destroy.8 +++ b/man/man8/zfs-destroy.8 @@ -101,18 +101,25 @@ behavior for mounted file systems in use. .Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns .Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns … .Xc -The given snapshots are destroyed immediately if and only if the +Attempts to destroy the given snapshot(s). +This will fail if any clones of the snapshot exist or if the snapshot is held. +In this case, by default, .Nm zfs Cm destroy -command without the +will have no effect and exit in error. +If the .Fl d -option would have destroyed it. -Such immediate destruction would occur, for example, if the snapshot had no -clones and the user-initiated reference count were zero. +option is applied, the command will instead mark the given snapshot for +automatic destruction as soon as it becomes eligible. +While marked for destruction, a snapshot remains visible, and the user may +create new clones from it and place new holds on it. .Pp -If a snapshot does not qualify for immediate destruction, it is marked for -deferred deletion. -In this state, it exists as a usable, visible snapshot until both of the -preconditions listed above are met, at which point it is destroyed. +The read-only snapshot properties +.Sy defer_destroy +and +.Sy userrefs +are used by +.Nm zfs Cm destroy +to determine eligibility and marked status. .Pp An inclusive range of snapshots may be specified by separating the first and last snapshots with a percent sign. @@ -137,8 +144,9 @@ If this flag is specified, the .Fl d flag will have no effect. .It Fl d -Destroy immediately. -If a snapshot cannot be destroyed now, mark it for deferred destruction. +Rather than returning error if the given snapshot is ineligible for immediate +destruction, mark it for deferred, automatic destruction once it becomes +eligible. .It Fl n Do a dry-run .Pq Qq No-op @@ -223,4 +231,5 @@ renames the remaining snapshots, and then creates a new snapshot, as follows: . .Sh SEE ALSO .Xr zfs-create 8 , -.Xr zfs-hold 8 +.Xr zfs-hold 8 , +.Xr zfsprops 8 From a77b6398edc577cff8d6d225ed1227dee42439cd Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Thu, 6 Feb 2025 00:53:33 +0100 Subject: [PATCH 23/44] Fix typos in zpool_do_scrub() error messages (#17028) Sponsored-by: Wasabi Technology, Inc. Sponsored-by: Klara, Inc. 
Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Reviewed-by: Alexander Motin Reviewed-by: George Amanakis Reviewed-by: George Melikov Reviewed-by: Tony Hutter --- cmd/zpool/zpool_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 506427a10672..5fcf0991de66 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -8481,19 +8481,19 @@ zpool_do_scrub(int argc, char **argv) if (is_pause && is_stop) { (void) fprintf(stderr, gettext("invalid option " - "combination :-s and -p are mutually exclusive\n")); + "combination: -s and -p are mutually exclusive\n")); usage(B_FALSE); } else if (is_pause && is_txg_continue) { (void) fprintf(stderr, gettext("invalid option " - "combination :-p and -C are mutually exclusive\n")); + "combination: -p and -C are mutually exclusive\n")); usage(B_FALSE); } else if (is_stop && is_txg_continue) { (void) fprintf(stderr, gettext("invalid option " - "combination :-s and -C are mutually exclusive\n")); + "combination: -s and -C are mutually exclusive\n")); usage(B_FALSE); } else if (is_error_scrub && is_txg_continue) { (void) fprintf(stderr, gettext("invalid option " - "combination :-e and -C are mutually exclusive\n")); + "combination: -e and -C are mutually exclusive\n")); usage(B_FALSE); } else { if (is_error_scrub) From 2adca179b641c1b415e4f0b031d77ac4880ba3e0 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 6 Feb 2025 12:40:01 -0800 Subject: [PATCH 24/44] Expand fragmentation table to reflect larger possible allocation sizes When you are using large recordsizes in conjunction with raidz, with incompressible data, you can pretty reliably be making 21 MB allocations. Unfortunately, the fragmentation metric in ZFS considers any metaslabs with 16 MB free chunks completely unfragmented, so you can have a metaslab report 0% fragmented and be unable to satisfy an allocation. When using the segment-based metaslab weight, this is inconvenient; when using the space-based one, it can seriously degrade performance. We expand the fragmentation table to extend up to 512MB, and redefine the table size based on the actual table, rather than having a static define. We also tweak the one variable that depends on fragmentation directly. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Allan Jude Reviewed-by: Alexander Motin Signed-off-by: Paul Dagnelie Closes #16986 --- man/man4/zfs.4 | 2 +- module/zfs/metaslab.c | 57 +++++++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index dd0b3d848fe9..9d83357fcc6d 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1778,7 +1778,7 @@ Normally disabled because these datasets may be missing key data. .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . -.It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint +.It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 77 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. An active metaslab that exceeds this threshold
An active metaslab that exceeds this threshold diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 7affbfac9dc7..353a99605913 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -146,7 +146,7 @@ static uint_t zfs_mg_fragmentation_threshold = 95; * active metaslab that exceeds this threshold will no longer keep its active * status allowing better metaslabs to be selected. */ -static uint_t zfs_metaslab_fragmentation_threshold = 70; +static uint_t zfs_metaslab_fragmentation_threshold = 77; /* * When set will load all metaslabs when pool is first opened. @@ -2889,8 +2889,6 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } -#define FRAGMENTATION_TABLE_SIZE 17 - /* * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done @@ -2901,33 +2899,40 @@ metaslab_fini(metaslab_t *msp) * us the fragmentation metric. This means that a high fragmentation metric * equates to most of the free space being comprised of small segments. * Conversely, if the metric is low, then most of the free space is in - * large segments. A 10% change in fragmentation equates to approximately - * double the number of segments. + * large segments. * - * This table defines 0% fragmented space using 16MB segments. Testing has - * shown that segments that are greater than or equal to 16MB do not suffer - * from drastic performance problems. Using this value, we derive the rest - * of the table. Since the fragmentation value is never stored on disk, it - * is possible to change these calculations in the future. + * This table defines 0% fragmented space using 512M segments. Using this value, + * we derive the rest of the table. This table originally went up to 16MB, but + * with larger recordsizes, larger ashifts, and use of raidz3, it is possible + * to have significantly larger allocations than were previously possible. + * Since the fragmentation value is never stored on disk, it is possible to + * change these calculations in the future. */ -static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { +static const int zfs_frag_table[] = { 100, /* 512B */ - 100, /* 1K */ - 98, /* 2K */ - 95, /* 4K */ - 90, /* 8K */ - 80, /* 16K */ - 70, /* 32K */ - 60, /* 64K */ - 50, /* 128K */ - 40, /* 256K */ - 30, /* 512K */ - 20, /* 1M */ - 15, /* 2M */ - 10, /* 4M */ - 5, /* 8M */ - 0 /* 16M */ + 99, /* 1K */ + 97, /* 2K */ + 93, /* 4K */ + 88, /* 8K */ + 83, /* 16K */ + 77, /* 32K */ + 71, /* 64K */ + 64, /* 128K */ + 57, /* 256K */ + 50, /* 512K */ + 43, /* 1M */ + 36, /* 2M */ + 29, /* 4M */ + 23, /* 8M */ + 17, /* 16M */ + 12, /* 32M */ + 7, /* 64M */ + 3, /* 128M */ + 1, /* 256M */ + 0, /* 512M */ }; +#define FRAGMENTATION_TABLE_SIZE \ + (sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0]))) /* * Calculate the metaslab's fragmentation metric and set ms_fragmentation. From a0f8d3c5845006d1853979209a2ce516f5ab1c23 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 6 Feb 2025 12:42:50 -0800 Subject: [PATCH 25/44] Add kstats tracking gang allocations Gang blocks have a significant impact on the long and short term performance of a zpool, but there is not a lot of observability into whether they're being used. This change adds gang-specific kstats to ZFS, to better allow users to see whether ganging is happening. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Signed-off-by: Paul Dagnelie Closes #17003 --- module/zfs/zio.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index ae5340da9f00..10930e7f381e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -145,10 +145,53 @@ static const int zio_buf_debug_limit = 16384; static const int zio_buf_debug_limit = 0; #endif +typedef struct zio_stats { + kstat_named_t ziostat_total_allocations; + kstat_named_t ziostat_alloc_class_fallbacks; + kstat_named_t ziostat_gang_writes; + kstat_named_t ziostat_gang_multilevel; +} zio_stats_t; + +static zio_stats_t zio_stats = { + { "total_allocations", KSTAT_DATA_UINT64 }, + { "alloc_class_fallbacks", KSTAT_DATA_UINT64 }, + { "gang_writes", KSTAT_DATA_UINT64 }, + { "gang_multilevel", KSTAT_DATA_UINT64 }, +}; + +struct { + wmsum_t ziostat_total_allocations; + wmsum_t ziostat_alloc_class_fallbacks; + wmsum_t ziostat_gang_writes; + wmsum_t ziostat_gang_multilevel; +} ziostat_sums; + +#define ZIOSTAT_BUMP(stat) wmsum_add(&ziostat_sums.stat, 1); + +static kstat_t *zio_ksp; + static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); +static int +zio_kstats_update(kstat_t *ksp, int rw) +{ + zio_stats_t *zs = ksp->ks_data; + if (rw == KSTAT_WRITE) + return (EACCES); + + zs->ziostat_total_allocations.value.ui64 = + wmsum_value(&ziostat_sums.ziostat_total_allocations); + zs->ziostat_alloc_class_fallbacks.value.ui64 = + wmsum_value(&ziostat_sums.ziostat_alloc_class_fallbacks); + zs->ziostat_gang_writes.value.ui64 = + wmsum_value(&ziostat_sums.ziostat_gang_writes); + zs->ziostat_gang_multilevel.value.ui64 = + wmsum_value(&ziostat_sums.ziostat_gang_multilevel); + return (0); +} + void zio_init(void) { @@ -159,6 +202,19 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + wmsum_init(&ziostat_sums.ziostat_total_allocations, 0); + wmsum_init(&ziostat_sums.ziostat_alloc_class_fallbacks, 0); + wmsum_init(&ziostat_sums.ziostat_gang_writes, 0); + wmsum_init(&ziostat_sums.ziostat_gang_multilevel, 0); + zio_ksp = kstat_create("zfs", 0, "zio_stats", + "misc", KSTAT_TYPE_NAMED, sizeof (zio_stats) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (zio_ksp != NULL) { + zio_ksp->ks_data = &zio_stats; + zio_ksp->ks_update = zio_kstats_update; + kstat_install(zio_ksp); + } + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t align, cflags, data_cflags; @@ -286,6 +342,16 @@ zio_fini(void) VERIFY3P(zio_data_buf_cache[i], ==, NULL); } + if (zio_ksp != NULL) { + kstat_delete(zio_ksp); + zio_ksp = NULL; + } + + wmsum_fini(&ziostat_sums.ziostat_total_allocations); + wmsum_fini(&ziostat_sums.ziostat_alloc_class_fallbacks); + wmsum_fini(&ziostat_sums.ziostat_gang_writes); + wmsum_fini(&ziostat_sums.ziostat_gang_multilevel); + kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); @@ -4053,6 +4119,7 @@ zio_dva_allocate(zio_t *zio) mc = spa_preferred_class(spa, zio); zio->io_metaslab_class = mc; } + ZIOSTAT_BUMP(ziostat_total_allocations); /* * Try allocating the block in the usual metaslab class. 
@@ -4118,6 +4185,7 @@ zio_dva_allocate(zio_t *zio) error); } + ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -4130,6 +4198,9 @@ zio_dva_allocate(zio_t *zio) spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } + ZIOSTAT_BUMP(ziostat_gang_writes); + if (flags & METASLAB_GANG_CHILD) + ZIOSTAT_BUMP(ziostat_gang_multilevel); return (zio_write_gang_block(zio, mc)); } if (error != 0) { @@ -4221,6 +4292,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, int flags = METASLAB_ZIL; int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; + ZIOSTAT_BUMP(ziostat_total_allocations); error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); @@ -4230,6 +4302,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, &io_alloc_list, NULL, allocator); } if (error != 0) { + ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); From 7784947923e3e46ea1c1ba765db387ff649cf8a6 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 6 Feb 2025 21:43:47 +0100 Subject: [PATCH 26/44] optimize recv_fix_encryption_hierarchy() recv_fix_encryption_hierarchy() in its present state goes through all stream filesystems, and for each one traverses the snapshots in order to find one that exists locally. This happens by calling guid_to_name() for each snapshot, which iterates through all children of the filesystem. This results in CPU utilization of 100% for several minutes (for ~1000 filesystems on a Ryzen 4350G) for 1 thread at the end of a raw receive (-w, regardless whether encrypted or not, dryrun or not). Fix this by following a different logic: using the top_fs name, call gather_nvlist() to gather the nvlists for all local filesystems. For each one filesystem, go through the snapshots to find the corresponding stream's filesystem (since we know the snapshots guid and can search with it in stream_avl for the stream's fs). Then go on to fix the encryption roots and locations as in its present state. Avoiding guid_to_name() iteratively makes recv_fix_encryption_hierarchy() significantly faster (from several minutes to seconds for ~1000 filesystems on a Ryzen 4350G). Another problem is the following: in case we have promoted a clone of the filesystem outside the top filesystem specified in zfs send, zfs receive does not fail but returns an error: recv_incremental_replication() fails to find its origin and errors out with needagain=1. This results in recv_fix_hierarchy() not being called which may render some children of the top fs not mountable since their encryption root was not updated. To circumvent this make recv_incremental_replication() silently ignore this error. 
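For illustration only (the pool and dataset names below are made up, and this
example is not part of the change itself): the slow path was most visible at
the end of a raw replication receive of a hierarchy with many child
filesystems, e.g.

    zfs snapshot -r tank/home@migrate
    zfs send -Rw tank/home@migrate | zfs receive -d backup

With on the order of ~1000 filesystems, the final encryption-hierarchy fixup
previously kept one CPU busy for several minutes; with this change it
completes in seconds.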
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #16929 --- lib/libzfs/libzfs_sendrecv.c | 100 ++++++++++-------- .../rsend/send_encrypted_hierarchy.ksh | 31 ++++-- 2 files changed, 80 insertions(+), 51 deletions(-) diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index b9780720e5a3..97920ce6f21c 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -3376,66 +3376,78 @@ created_before(libzfs_handle_t *hdl, avl_tree_t *avl, */ static int recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, - nvlist_t *stream_nv) + nvlist_t *stream_nv, avl_tree_t *stream_avl) { int err; nvpair_t *fselem = NULL; - nvlist_t *stream_fss; + nvlist_t *local_nv; + avl_tree_t *local_avl; + boolean_t recursive; + + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); - stream_fss = fnvlist_lookup_nvlist(stream_nv, "fss"); + /* Using top_zfs, gather the nvlists for all local filesystems. */ + if ((err = gather_nvlist(hdl, top_zfs, NULL, NULL, + recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, B_FALSE, + B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) + return (err); - while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { + /* + * Go through the nvlists of the local filesystems and check for + * encryption roots. + */ + while ((fselem = nvlist_next_nvpair(local_nv, fselem)) != NULL) { zfs_handle_t *zhp = NULL; uint64_t crypt; - nvlist_t *snaps, *props, *stream_nvfs = NULL; - nvpair_t *snapel = NULL; + nvlist_t *stream_props, *snaps, *stream_nvfs = NULL, + *nvfs = NULL; boolean_t is_encroot, is_clone, stream_encroot; - char *cp; - const char *stream_keylocation = NULL; + const char *stream_keylocation = NULL, *fsname; char keylocation[MAXNAMELEN]; - char fsname[ZFS_MAX_DATASET_NAME_LEN]; - - keylocation[0] = '\0'; - stream_nvfs = fnvpair_value_nvlist(fselem); - snaps = fnvlist_lookup_nvlist(stream_nvfs, "snaps"); - props = fnvlist_lookup_nvlist(stream_nvfs, "props"); - stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); - - /* find a snapshot from the stream that exists locally */ - err = ENOENT; - while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { - uint64_t guid; - - guid = fnvpair_value_uint64(snapel); - err = guid_to_name(hdl, top_zfs, guid, B_FALSE, - fsname); - if (err == 0) - break; - } - - if (err != 0) - continue; - - cp = strchr(fsname, '@'); - if (cp != NULL) - *cp = '\0'; + nvpair_t *snapelem; + nvfs = fnvpair_value_nvlist(fselem); + snaps = fnvlist_lookup_nvlist(nvfs, "snaps"); + fsname = fnvlist_lookup_string(nvfs, "name"); zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); if (zhp == NULL) { err = ENOENT; goto error; } - crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); - is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; - (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); - /* we don't need to do anything for unencrypted datasets */ + crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); if (crypt == ZIO_CRYPT_OFF) { zfs_close(zhp); continue; } + is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; + (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); + keylocation[0] = '\0'; + + /* + * Go through the snapshots of the local filesystem and find + * the stream's filesystem. 
+ */ + for (snapelem = nvlist_next_nvpair(snaps, NULL); + snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { + uint64_t thisguid; + + thisguid = fnvpair_value_uint64(snapelem); + stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); + + if (stream_nvfs != NULL) + break; + } + + if (stream_nvfs == NULL) + continue; + + stream_props = fnvlist_lookup_nvlist(stream_nvfs, "props"); + stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); + /* * If the dataset is flagged as an encryption root, was not * received as a clone and is not currently an encryption root, @@ -3451,7 +3463,7 @@ recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, } } - stream_keylocation = fnvlist_lookup_string(props, + stream_keylocation = fnvlist_lookup_string(stream_props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); /* @@ -3518,14 +3530,14 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, boolean_t needagain, progress, recursive; const char *s1, *s2; + if (flags->dryrun) + return (0); + fromsnap = fnvlist_lookup_string(stream_nv, "fromsnap"); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); - if (flags->dryrun) - return (0); - again: needagain = progress = B_FALSE; @@ -3999,9 +4011,9 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, stream_nv, stream_avl, NULL); } - if (raw && softerr == 0 && *top_zfs != NULL) { + if (raw && *top_zfs != NULL && !flags->dryrun) { softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, - stream_nv); + stream_nv, stream_avl); } out: diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_hierarchy.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_hierarchy.ksh index 8417afc88d33..6dd4ae46f947 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_hierarchy.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_hierarchy.ksh @@ -61,16 +61,17 @@ log_must eval "zfs receive -d -F $POOL2 < $BACKDIR/fs-before-R" dstds=$(get_dst_ds $POOL/$FS $POOL2) log_must cmp_ds_subs $POOL/$FS $dstds -log_must verify_encryption_root $POOL/$FS $POOL/$FS -log_must verify_keylocation $POOL/$FS "prompt" -log_must verify_origin $POOL/$FS "-" +log_must verify_encryption_root $POOL2/$FS $POOL2/$FS +log_must verify_keylocation $POOL2/$FS "prompt" +log_must verify_origin $POOL2/$FS "-" -log_must verify_encryption_root $POOL/clone $POOL/$FS -log_must verify_keylocation $POOL/clone "none" -log_must verify_origin $POOL/clone "$POOL/$FS@snap" +log_must verify_encryption_root $POOL2/clone $POOL2/$FS +log_must verify_keylocation $POOL2/clone "none" +log_must verify_origin $POOL2/clone "$POOL2/$FS@snap" log_must verify_encryption_root $POOL/$FS/child $POOL/$FS -log_must verify_keylocation $POOL/$FS/child "none" +log_must verify_encryption_root $POOL2/$FS/child $POOL2/$FS +log_must verify_keylocation $POOL2/$FS/child "none" # Alter the hierarchy and re-send log_must eval "echo $PASSPHRASE1 | zfs change-key -o keyformat=passphrase" \ @@ -93,4 +94,20 @@ log_must verify_origin $POOL/clone "-" log_must verify_encryption_root $POOL/$FS/child $POOL/$FS/child log_must verify_keylocation $POOL/$FS/child "prompt" +log_must verify_encryption_root $POOL2 "-" +log_must verify_encryption_root $POOL2/clone $POOL2/clone +log_must verify_encryption_root $POOL2/$FS $POOL2/clone +log_must verify_encryption_root $POOL2/$FS/child $POOL2/$FS/child + +log_must verify_keylocation $POOL2 "none" +log_must verify_keylocation $POOL2/clone "prompt" +log_must verify_keylocation $POOL2/$FS "none" +log_must 
verify_keylocation $POOL2/$FS/child "prompt" + +log_must verify_origin $POOL2 "-" +log_must verify_origin $POOL2/clone "-" +log_must verify_origin $POOL2/$FS "$POOL2/clone@snap" +log_must verify_origin $POOL2/$FS/child "-" +log_must zfs list + log_pass "Raw recursive sends preserve filesystem structure." From 001ab5941d730f90957177b82dbd56230aa96e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dr=2E=20Christian=20Kohlsch=C3=BCtter?= Date: Sat, 8 Feb 2025 01:38:58 +0100 Subject: [PATCH 27/44] Fix "make install" with DESTDIR set (#16995) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "DESTDIR=/path/to/target/root/ make install" may fail when installing to a root that contains an existing lib/modules structure. When run as root we may even affect the wrong kernel (the build system's one, or, if running a different version, some other directory in /lib/modules, but not the desired one installed in DESTDIR). Add a missing reference to the INSTALL_MOD_PATH root when calling "depmod" during "make install" Also add a switch "DONT_DELETE_MODULES_FILES=1" that skips the removal of files named "modules.*" prior to running depmod. Signed-off-by: Christian Kohlschütter Closes #16994 Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter --- module/Makefile.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/Makefile.in b/module/Makefile.in index f76e94afa410..a65cbfce1a90 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -93,7 +93,7 @@ modules_install-Linux: modules_uninstall-Linux-legacy $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging - if [ -n "$(DESTDIR)" ]; then \ + if [ -n "$(DESTDIR)" ] && [ "$(DONT_DELETE_MODULES_FILES)" != "1" ]; then \ find $(KMODDIR) -name 'modules.*' -delete; \ fi @# Debian ships tiny fake System.map files that are @@ -106,7 +106,7 @@ modules_install-Linux: modules_uninstall-Linux-legacy { [ -f "$$sysmap" ] && [ $$(wc -l < "$$sysmap") -ge 100 ]; } || \ sysmap=$(INSTALL_MOD_PATH)/usr/lib/debug/boot/System.map-@LINUX_VERSION@; \ if [ -f $$sysmap ]; then \ - depmod -ae -F $$sysmap @LINUX_VERSION@; \ + depmod -ae -F $$sysmap @LINUX_VERSION@ -b $(INSTALL_MOD_PATH)/; \ fi modules_install-FreeBSD: From 0be3b266ed740d7e03317ec58e32c1d0108bf6cd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 8 Feb 2025 12:42:24 +1100 Subject: [PATCH 28/44] zio: do no-op injections just before handing off to vdevs The purpose of no-op is to simulate a failure between a device cache and its permanent store. We still want it to go through the queue and respond in the same way to everything else. So, inject "success" as the very last thing, and then move on to VDEV_IO_DONE to be dequeued and so any followup work can occur. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Signed-off-by: Rob Norris Closes #17029 --- module/zfs/zio.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 10930e7f381e..b071ac17ed1f 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4495,16 +4495,6 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { - if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { - /* - * "no-op" injections return success, but do no actual - * work. Just skip the remaining vdev stages. 
- */ - zio_vdev_io_bypass(zio); - zio_interrupt(zio); - return (NULL); - } - if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -4514,6 +4504,15 @@ zio_vdev_io_start(zio_t *zio) return (NULL); } zio->io_delay = gethrtime(); + + if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) { + /* + * "no-op" injections return success, but do no actual + * work. Just return it. + */ + zio_delay_interrupt(zio); + return (NULL); + } } vd->vdev_ops->vdev_op_io_start(zio); From 8487b6c9b4ace21bb7b238baef48420c3f28f16c Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Tue, 11 Feb 2025 22:07:29 +0500 Subject: [PATCH 29/44] Update the dataset name in handle after zfs_rename (#17040) For zfs_rename, after the dataset name is successfully updated, the dataset handle that was passed to zfs_rename, still contains the old name, due to which, the dataset handle becomes invalid. The following operations performed using this handle result in error since the dataset with old name cannot be found anymore. changelist_rename does update the names in dataset handles, but those are temporary handles that were created during changelist_gather. The original handle that was used to call zfs_rename is not updated. We should update the name in original ZFS handle after the IOCTL for rename returns success for the operation. Signed-off-by: Umer Saleem Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- lib/libzfs/libzfs_dataset.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 231bbbd92dbf..06fa52b00e05 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -4673,6 +4673,7 @@ zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags) changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } + (void) strlcpy(zhp->zfs_name, target, sizeof (zhp->zfs_name)); } error: From c4fa9c296248316dbabee8274d2531393b7dd110 Mon Sep 17 00:00:00 2001 From: vandanarungta <46906819+vandanarungta@users.noreply.github.com> Date: Tue, 11 Feb 2025 17:37:17 -0800 Subject: [PATCH 30/44] Free memory in an error path in spl-kmem-cache.c skc->skc_name also needs to be freed in an error path. Reviewed-by: Alexander Motin Reviewed-by: Tino Reichardt Signed-off-by: Vandana Rungta Closes #17041 --- module/os/linux/spl/spl-kmem-cache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 33c7d0879741..da5513c50189 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -727,6 +727,7 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, rc = percpu_counter_init(&skc->skc_linux_alloc, 0, GFP_KERNEL); if (rc != 0) { + kfree(skc->skc_name); kfree(skc); return (NULL); } From 55b21552d34a8f719e890033de2d63c7583d661b Mon Sep 17 00:00:00 2001 From: Ivan Volosyuk Date: Thu, 30 Jan 2025 21:26:49 +1100 Subject: [PATCH 31/44] Linux 6.12 compat: Rename range_tree_* to zfs_range_tree_* Linux 6.12 has conflicting range_tree_{find,destroy,clear} symbols. 
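As a rough sketch of the mechanical rename (illustrative only, not an
exhaustive list; the full mapping is visible in the diff below):

    range_tree_t         -> zfs_range_tree_t
    range_seg_type_t     -> zfs_range_seg_type_t
    RANGE_SEG64          -> ZFS_RANGE_SEG64
    range_tree_create()  -> zfs_range_tree_create()
    range_tree_add()     -> zfs_range_tree_add()
    range_tree_destroy() -> zfs_range_tree_destroy()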
Signed-off-by: Ivan Volosyuk Reviewed-by: Tony Hutter Reviewed-by: Rob Norris --- cmd/zdb/zdb.c | 102 ++++---- include/sys/dnode.h | 2 +- include/sys/metaslab.h | 2 +- include/sys/metaslab_impl.h | 20 +- include/sys/range_tree.h | 203 +++++++-------- include/sys/space_map.h | 16 +- include/sys/space_reftree.h | 4 +- include/sys/vdev_impl.h | 7 +- include/sys/vdev_rebuild.h | 3 +- include/sys/vdev_removal.h | 4 +- module/zfs/dbuf.c | 4 +- module/zfs/dnode.c | 10 +- module/zfs/dnode_sync.c | 23 +- module/zfs/dsl_pool.c | 4 +- module/zfs/dsl_scan.c | 93 +++---- module/zfs/metaslab.c | 458 ++++++++++++++++++---------------- module/zfs/range_tree.c | 442 ++++++++++++++++---------------- module/zfs/spa.c | 2 +- module/zfs/spa_checkpoint.c | 4 +- module/zfs/spa_log_spacemap.c | 16 +- module/zfs/space_map.c | 53 ++-- module/zfs/space_reftree.c | 17 +- module/zfs/vdev.c | 94 +++---- module/zfs/vdev_indirect.c | 6 +- module/zfs/vdev_initialize.c | 31 +-- module/zfs/vdev_raidz.c | 31 +-- module/zfs/vdev_rebuild.c | 29 +-- module/zfs/vdev_removal.c | 165 ++++++------ module/zfs/vdev_trim.c | 83 +++--- 29 files changed, 994 insertions(+), 934 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index aba99fabbbb9..dd521257ccb2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -122,7 +122,7 @@ static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; -static range_tree_t *mos_refd_objs; +static zfs_range_tree_t *mos_refd_objs; static spa_t *spa; static objset_t *os; static boolean_t kernel_init_done; @@ -325,7 +325,7 @@ typedef struct metaslab_verify { /* * What's currently allocated for this metaslab. */ - range_tree_t *mv_allocated; + zfs_range_tree_t *mv_allocated; } metaslab_verify_t; typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); @@ -417,7 +417,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) uint64_t txg = sme->sme_txg; if (sme->sme_type == SM_ALLOC) { - if (range_tree_contains(mv->mv_allocated, + if (zfs_range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE ALLOC: " "%llu [%llx:%llx] " @@ -426,11 +426,11 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { - range_tree_add(mv->mv_allocated, + zfs_range_tree_add(mv->mv_allocated, offset, size); } } else { - if (!range_tree_contains(mv->mv_allocated, + if (!zfs_range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE FREE: " "%llu [%llx:%llx] " @@ -439,7 +439,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); } else { - range_tree_remove(mv->mv_allocated, + zfs_range_tree_remove(mv->mv_allocated, offset, size); } } @@ -614,11 +614,11 @@ livelist_metaslab_validate(spa_t *spa) (longlong_t)vd->vdev_ms_count); uint64_t shift, start; - range_seg_type_t type = + zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; - mv.mv_allocated = range_tree_create(NULL, + mv.mv_allocated = zfs_range_tree_create(NULL, type, NULL, start, shift); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; @@ -633,8 +633,8 @@ livelist_metaslab_validate(spa_t *spa) spacemap_check_ms_sm(m->ms_sm, &mv); spacemap_check_sm_log(spa, &mv); - range_tree_vacate(mv.mv_allocated, NULL, NULL); - range_tree_destroy(mv.mv_allocated); + zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL); + 
zfs_range_tree_destroy(mv.mv_allocated); zfs_btree_clear(&mv.mv_livelist_allocs); zfs_btree_destroy(&mv.mv_livelist_allocs); } @@ -1633,9 +1633,9 @@ static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; - range_tree_t *rt = msp->ms_allocatable; + zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + int free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ _Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated"); @@ -1668,7 +1668,7 @@ dump_metaslab(metaslab_t *msp) if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); - range_tree_stat_verify(msp->ms_allocatable); + zfs_range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); @@ -2292,12 +2292,12 @@ dump_dtl(vdev_t *vd, int indent) required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { - range_tree_t *rt = vd->vdev_dtl[t]; - if (range_tree_space(rt) == 0) + zfs_range_tree_t *rt = vd->vdev_dtl[t]; + if (zfs_range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); - range_tree_walk(rt, dump_dtl_seg, prefix); + zfs_range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); @@ -6258,9 +6258,9 @@ load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, return (0); if (sme->sme_type == SM_ALLOC) - range_tree_add(svr->svr_allocd_segs, offset, size); + zfs_range_tree_add(svr->svr_allocd_segs, offset, size); else - range_tree_remove(svr->svr_allocd_segs, offset, size); + zfs_range_tree_remove(svr->svr_allocd_segs, offset, size); return (0); } @@ -6314,18 +6314,20 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - ASSERT0(range_tree_space(svr->svr_allocd_segs)); + ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); - range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; - ASSERT0(range_tree_space(allocs)); + ASSERT0(zfs_range_tree_space(allocs)); if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); - range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); + zfs_range_tree_vacate(allocs, zfs_range_tree_add, + svr->svr_allocd_segs); } - range_tree_destroy(allocs); + zfs_range_tree_destroy(allocs); iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); @@ -6334,12 +6336,12 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) * because we have not allocated mappings for * it yet. 
*/ - range_tree_clear(svr->svr_allocd_segs, + zfs_range_tree_clear(svr->svr_allocd_segs, vdev_indirect_mapping_max_offset(vim), vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); - zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); - range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); + zcb->zcb_removing_size += zfs_range_tree_space(svr->svr_allocd_segs); + zfs_range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } @@ -6442,7 +6444,8 @@ checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); - range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); + zfs_range_tree_remove(ms->ms_allocatable, sme->sme_offset, + sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; @@ -6573,9 +6576,9 @@ load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) return (0); if (*uic_maptype == sme->sme_type) - range_tree_add(ms->ms_allocatable, offset, size); + zfs_range_tree_add(ms->ms_allocatable, offset, size); else - range_tree_remove(ms->ms_allocatable, offset, size); + zfs_range_tree_remove(ms->ms_allocatable, offset, size); return (0); } @@ -6609,7 +6612,7 @@ load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); - range_tree_vacate(msp->ms_allocatable, NULL, NULL); + zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -6642,7 +6645,7 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); - range_tree_vacate(msp->ms_allocatable, NULL, NULL); + zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the @@ -6666,7 +6669,7 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); - range_tree_add(msp->ms_allocatable, ent_offset, ent_len); + zfs_range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) @@ -6812,7 +6815,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1ULL << vd->vdev_ashift) { - if (range_tree_contains(msp->ms_allocatable, + if (zfs_range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1ULL << vd->vdev_ashift)) { obsolete_bytes += 1ULL << vd->vdev_ashift; } @@ -6895,10 +6898,10 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) * not referenced, which is not a bug. */ if (vd->vdev_ops == &vdev_indirect_ops) { - range_tree_vacate(msp->ms_allocatable, + zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { - range_tree_vacate(msp->ms_allocatable, + zfs_range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { @@ -7796,7 +7799,7 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); - range_tree_verify_not_present(ms->ms_allocatable, + zfs_range_tree_verify_not_present(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); @@ -7947,8 +7950,9 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. 
*/ - range_tree_walk(ckpoint_msp->ms_allocatable, - (range_tree_func_t *)range_tree_verify_not_present, + zfs_range_tree_walk(ckpoint_msp->ms_allocatable, + (zfs_range_tree_func_t *) + zfs_range_tree_verify_not_present, current_msp->ms_allocatable); } } @@ -8088,7 +8092,7 @@ static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) - range_tree_add(mos_refd_objs, obj, 1); + zfs_range_tree_add(mos_refd_objs, obj, 1); } /* @@ -8098,8 +8102,8 @@ static void mos_obj_refd_multiple(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL && - !range_tree_contains(mos_refd_objs, obj, 1)) - range_tree_add(mos_refd_objs, obj, 1); + !zfs_range_tree_contains(mos_refd_objs, obj, 1)) + zfs_range_tree_add(mos_refd_objs, obj, 1); } static void @@ -8296,8 +8300,8 @@ dump_mos_leaks(spa_t *spa) */ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { - if (range_tree_contains(mos_refd_objs, object, 1)) { - range_tree_remove(mos_refd_objs, object, 1); + if (zfs_range_tree_contains(mos_refd_objs, object, 1)) { + zfs_range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; @@ -8315,11 +8319,11 @@ dump_mos_leaks(spa_t *spa) rv = 2; } } - (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); - if (!range_tree_is_empty(mos_refd_objs)) + (void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); + if (!zfs_range_tree_is_empty(mos_refd_objs)) rv = 2; - range_tree_vacate(mos_refd_objs, NULL, NULL); - range_tree_destroy(mos_refd_objs); + zfs_range_tree_vacate(mos_refd_objs, NULL, NULL); + zfs_range_tree_destroy(mos_refd_objs); return (rv); } @@ -8441,8 +8445,8 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; - mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, - 0); + mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 5d0f0fb26d02..b6d3e2c918c5 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -335,7 +335,7 @@ struct dnode { /* protected by dn_mtx: */ kmutex_t dn_mtx; list_t dn_dirty_records[TXG_SIZE]; - struct range_tree *dn_free_ranges[TXG_SIZE]; + struct zfs_range_tree *dn_free_ranges[TXG_SIZE]; uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 815b5d0c9cf1..0171cd0fe0f8 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -139,7 +139,7 @@ void metaslab_set_selected_txg(metaslab_t *, uint64_t); extern int metaslab_debug_load; -range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, +zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift); #ifdef __cplusplus diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 4f434291ddbf..eae543731224 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -398,8 +398,8 @@ struct metaslab { uint64_t ms_size; uint64_t ms_fragmentation; - range_tree_t *ms_allocating[TXG_SIZE]; - range_tree_t *ms_allocatable; + zfs_range_tree_t *ms_allocating[TXG_SIZE]; + zfs_range_tree_t *ms_allocatable; uint64_t ms_allocated_this_txg; uint64_t ms_allocating_total; @@ -408,10 +408,12 @@ struct metaslab { * ms_free*tree only have entries while syncing, and are empty * between syncs. 
*/ - range_tree_t *ms_freeing; /* to free this syncing txg */ - range_tree_t *ms_freed; /* already freed this syncing txg */ - range_tree_t *ms_defer[TXG_DEFER_SIZE]; - range_tree_t *ms_checkpointing; /* to add to the checkpoint */ + zfs_range_tree_t *ms_freeing; /* to free this syncing txg */ + /* already freed this syncing txg */ + zfs_range_tree_t *ms_freed; + zfs_range_tree_t *ms_defer[TXG_DEFER_SIZE]; + /* to add to the checkpoint */ + zfs_range_tree_t *ms_checkpointing; /* * The ms_trim tree is the set of allocatable segments which are @@ -421,7 +423,7 @@ struct metaslab { * is unloaded. Its purpose is to aggregate freed ranges to * facilitate efficient trimming. */ - range_tree_t *ms_trim; + zfs_range_tree_t *ms_trim; boolean_t ms_condensing; /* condensing? */ boolean_t ms_condense_wanted; @@ -542,8 +544,8 @@ struct metaslab { * Allocs and frees that are committed to the vdev log spacemap but * not yet to this metaslab's spacemap. */ - range_tree_t *ms_unflushed_allocs; - range_tree_t *ms_unflushed_frees; + zfs_range_tree_t *ms_unflushed_allocs; + zfs_range_tree_t *ms_unflushed_frees; /* * We have flushed entries up to but not including this TXG. In diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index d6f60e795288..4b0a3f2bfbb1 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -39,23 +39,23 @@ extern "C" { #define RANGE_TREE_HISTOGRAM_SIZE 64 -typedef struct range_tree_ops range_tree_ops_t; +typedef struct zfs_range_tree_ops zfs_range_tree_ops_t; -typedef enum range_seg_type { - RANGE_SEG32, - RANGE_SEG64, - RANGE_SEG_GAP, - RANGE_SEG_NUM_TYPES, -} range_seg_type_t; +typedef enum zfs_range_seg_type { + ZFS_RANGE_SEG32, + ZFS_RANGE_SEG64, + ZFS_RANGE_SEG_GAP, + ZFS_RANGE_SEG_NUM_TYPES, +} zfs_range_seg_type_t; /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. */ -typedef struct range_tree { +typedef struct zfs_range_tree { zfs_btree_t rt_root; /* offset-ordered segment b-tree */ uint64_t rt_space; /* sum of all segments in the map */ - range_seg_type_t rt_type; /* type of range_seg_t in use */ + zfs_range_seg_type_t rt_type; /* type of zfs_range_seg_t in use */ /* * All data that is stored in the range tree must have a start higher * than or equal to rt_start, and all sizes and offsets must be @@ -63,7 +63,7 @@ typedef struct range_tree { */ uint8_t rt_shift; uint64_t rt_start; - const range_tree_ops_t *rt_ops; + const zfs_range_tree_ops_t *rt_ops; void *rt_arg; uint64_t rt_gap; /* allowable inter-segment gap */ @@ -73,7 +73,7 @@ typedef struct range_tree { * 2^i <= size of range in bytes < 2^(i+1) */ uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; -} range_tree_t; +} zfs_range_tree_t; typedef struct range_seg32 { uint32_t rs_start; /* starting offset of this segment */ @@ -106,26 +106,26 @@ typedef range_seg_gap_t range_seg_max_t; * pointer is to a range seg of some type; when we need to do the actual math, * we'll figure out the real type. 
*/ -typedef void range_seg_t; - -struct range_tree_ops { - void (*rtop_create)(range_tree_t *rt, void *arg); - void (*rtop_destroy)(range_tree_t *rt, void *arg); - void (*rtop_add)(range_tree_t *rt, void *rs, void *arg); - void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg); - void (*rtop_vacate)(range_tree_t *rt, void *arg); +typedef void zfs_range_seg_t; + +struct zfs_range_tree_ops { + void (*rtop_create)(zfs_range_tree_t *rt, void *arg); + void (*rtop_destroy)(zfs_range_tree_t *rt, void *arg); + void (*rtop_add)(zfs_range_tree_t *rt, void *rs, void *arg); + void (*rtop_remove)(zfs_range_tree_t *rt, void *rs, void *arg); + void (*rtop_vacate)(zfs_range_tree_t *rt, void *arg); }; static inline uint64_t -rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) +zfs_rs_get_start_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: return (((const range_seg32_t *)rs)->rs_start); - case RANGE_SEG64: + case ZFS_RANGE_SEG64: return (((const range_seg64_t *)rs)->rs_start); - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: return (((const range_seg_gap_t *)rs)->rs_start); default: VERIFY(0); @@ -134,15 +134,15 @@ rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) } static inline uint64_t -rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) +zfs_rs_get_end_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: return (((const range_seg32_t *)rs)->rs_end); - case RANGE_SEG64: + case ZFS_RANGE_SEG64: return (((const range_seg64_t *)rs)->rs_end); - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: return (((const range_seg_gap_t *)rs)->rs_end); default: VERIFY(0); @@ -151,19 +151,19 @@ rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) } static inline uint64_t -rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) +zfs_rs_get_fill_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { - case RANGE_SEG32: { + case ZFS_RANGE_SEG32: { const range_seg32_t *r32 = (const range_seg32_t *)rs; return (r32->rs_end - r32->rs_start); } - case RANGE_SEG64: { + case ZFS_RANGE_SEG64: { const range_seg64_t *r64 = (const range_seg64_t *)rs; return (r64->rs_end - r64->rs_start); } - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: return (((const range_seg_gap_t *)rs)->rs_fill); default: VERIFY(0); @@ -173,36 +173,36 @@ rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) } static inline uint64_t -rs_get_start(const range_seg_t *rs, const range_tree_t *rt) +zfs_rs_get_start(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { - return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); + return ((zfs_rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); } static inline uint64_t -rs_get_end(const range_seg_t *rs, const range_tree_t *rt) +zfs_rs_get_end(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { - return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); + return ((zfs_rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); } static inline uint64_t -rs_get_fill(const range_seg_t *rs, const range_tree_t *rt) +zfs_rs_get_fill(const 
zfs_range_seg_t *rs, const zfs_range_tree_t *rt) { - return (rs_get_fill_raw(rs, rt) << rt->rt_shift); + return (zfs_rs_get_fill_raw(rs, rt) << rt->rt_shift); } static inline void -rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) +zfs_rs_set_start_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: ASSERT3U(start, <=, UINT32_MAX); ((range_seg32_t *)rs)->rs_start = (uint32_t)start; break; - case RANGE_SEG64: + case ZFS_RANGE_SEG64: ((range_seg64_t *)rs)->rs_start = start; break; - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: ((range_seg_gap_t *)rs)->rs_start = start; break; default: @@ -211,18 +211,18 @@ rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) } static inline void -rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end) +zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: ASSERT3U(end, <=, UINT32_MAX); ((range_seg32_t *)rs)->rs_end = (uint32_t)end; break; - case RANGE_SEG64: + case ZFS_RANGE_SEG64: ((range_seg64_t *)rs)->rs_end = end; break; - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: ((range_seg_gap_t *)rs)->rs_end = end; break; default: @@ -231,17 +231,18 @@ rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end) } static inline void -rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, + uint64_t fill) { - ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: /* fall through */ - case RANGE_SEG64: - ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, - rt)); + case ZFS_RANGE_SEG64: + ASSERT3U(fill, ==, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_get_start_raw(rs, rt)); break; - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: ((range_seg_gap_t *)rs)->rs_fill = fill; break; default: @@ -250,67 +251,73 @@ rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) } static inline void -rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start) +zfs_rs_set_start(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start) { ASSERT3U(start, >=, rt->rt_start); ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); - rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); + zfs_rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); } static inline void -rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end) +zfs_rs_set_end(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) { ASSERT3U(end, >=, rt->rt_start); ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); - rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); + zfs_rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); } static inline void -rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +zfs_rs_set_fill(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t fill) { ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); - rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); + zfs_zfs_rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); } -typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); +typedef void zfs_range_tree_func_t(void *arg, 
uint64_t start, uint64_t size); -range_tree_t *range_tree_create_gap(const range_tree_ops_t *ops, - range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, +zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap); -range_tree_t *range_tree_create(const range_tree_ops_t *ops, - range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); -void range_tree_destroy(range_tree_t *rt); -boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); -range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); -boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, - uint64_t *ostart, uint64_t *osize); -void range_tree_verify_not_present(range_tree_t *rt, +zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); +void zfs_range_tree_destroy(zfs_range_tree_t *rt); +boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, + uint64_t size); +zfs_range_seg_t *zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, + uint64_t size); +boolean_t zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, + uint64_t size, uint64_t *ostart, uint64_t *osize); +void zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, +void zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs, uint64_t newstart, uint64_t newsize); -uint64_t range_tree_space(range_tree_t *rt); -uint64_t range_tree_numsegs(range_tree_t *rt); -boolean_t range_tree_is_empty(range_tree_t *rt); -void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); -void range_tree_stat_verify(range_tree_t *rt); -uint64_t range_tree_min(range_tree_t *rt); -uint64_t range_tree_max(range_tree_t *rt); -uint64_t range_tree_span(range_tree_t *rt); - -void range_tree_add(void *arg, uint64_t start, uint64_t size); -void range_tree_remove(void *arg, uint64_t start, uint64_t size); -void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); -void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); -void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); - -void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); -void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); -range_seg_t *range_tree_first(range_tree_t *rt); - -void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, - range_tree_t *removefrom, range_tree_t *addto); -void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, - range_tree_t *addto); +uint64_t zfs_range_tree_space(zfs_range_tree_t *rt); +uint64_t zfs_range_tree_numsegs(zfs_range_tree_t *rt); +boolean_t zfs_range_tree_is_empty(zfs_range_tree_t *rt); +void zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst); +void zfs_range_tree_stat_verify(zfs_range_tree_t *rt); +uint64_t zfs_range_tree_min(zfs_range_tree_t *rt); +uint64_t zfs_range_tree_max(zfs_range_tree_t *rt); +uint64_t zfs_range_tree_span(zfs_range_tree_t *rt); + +void zfs_range_tree_add(void *arg, uint64_t start, uint64_t size); +void zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size); +void zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, + uint64_t size); +void zfs_range_tree_adjust_fill(zfs_range_tree_t 
*rt, zfs_range_seg_t *rs, + int64_t delta); +void zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size); + +void zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, + void *arg); +void zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, + void *arg); +zfs_range_seg_t *zfs_range_tree_first(zfs_range_tree_t *rt); + +void zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + zfs_range_tree_t *removefrom, zfs_range_tree_t *addto); +void zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt, + zfs_range_tree_t *removefrom, zfs_range_tree_t *addto); #ifdef __cplusplus } diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 14c5beccee55..2861b25e41ee 100644 --- a/include/sys/space_map.h +++ b/include/sys/space_map.h @@ -207,28 +207,28 @@ boolean_t sm_entry_is_double_word(uint64_t e); typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); -int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); -int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - uint64_t length); +int space_map_load(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype); +int space_map_load_length(space_map_t *sm, zfs_range_tree_t *rt, + maptype_t maptype, uint64_t length); int space_map_iterate(space_map_t *sm, uint64_t length, sm_cb_t callback, void *arg); int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx); -boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt); +boolean_t space_map_histogram_verify(space_map_t *sm, zfs_range_tree_t *rt); void space_map_histogram_clear(space_map_t *sm); -void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, +void space_map_histogram_add(space_map_t *sm, zfs_range_tree_t *rt, dmu_tx_t *tx); uint64_t space_map_object(space_map_t *sm); int64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); -uint64_t space_map_entries(space_map_t *sm, range_tree_t *rt); +uint64_t space_map_entries(space_map_t *sm, zfs_range_tree_t *rt); uint64_t space_map_nblocks(space_map_t *sm); -void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, +void space_map_write(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx); -uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, +uint64_t space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt, uint64_t vdev_id); void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h index b7a846aec624..e9a44ecf46b3 100644 --- a/include/sys/space_reftree.h +++ b/include/sys/space_reftree.h @@ -46,8 +46,8 @@ void space_reftree_create(avl_tree_t *t); void space_reftree_destroy(avl_tree_t *t); void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, int64_t refcnt); -void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt); -void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, +void space_reftree_add_map(avl_tree_t *t, zfs_range_tree_t *rt, int64_t refcnt); +void space_reftree_generate_map(avl_tree_t *t, zfs_range_tree_t *rt, int64_t minref); #ifdef __cplusplus diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index d45a5913dc0f..6840ee78915e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -299,7 +299,8 @@ struct vdev { kcondvar_t 
vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; - range_tree_t *vdev_initialize_tree; /* valid while initializing */ + /* valid while initializing */ + zfs_range_tree_t *vdev_initialize_tree; uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; uint64_t vdev_initialize_action_time; /* start and end time */ @@ -375,7 +376,7 @@ struct vdev { * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; - range_tree_t *vdev_obsolete_segments; + zfs_range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; /* @@ -388,7 +389,7 @@ struct vdev { /* * Leaf vdev state. */ - range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ + zfs_range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index 55ec6c570316..b7664a822bb3 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -65,7 +65,8 @@ typedef struct vdev_rebuild_phys { typedef struct vdev_rebuild { vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ - range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + /* scan ranges (in metaslab) */ + zfs_range_tree_t *vr_scan_tree; kmutex_t vr_io_lock; /* inflight IO lock */ kcondvar_t vr_io_cv; /* inflight IO cv */ diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h index 70b743f4ec6b..8e6005a94260 100644 --- a/include/sys/vdev_removal.h +++ b/include/sys/vdev_removal.h @@ -35,7 +35,7 @@ typedef struct spa_vdev_removal { /* Thread performing a vdev removal. */ kthread_t *svr_thread; /* Segments left to copy from the current metaslab. */ - range_tree_t *svr_allocd_segs; + zfs_range_tree_t *svr_allocd_segs; kmutex_t svr_lock; kcondvar_t svr_cv; boolean_t svr_thread_exit; @@ -49,7 +49,7 @@ typedef struct spa_vdev_removal { * Ranges that were freed while a mapping was in flight. This is * a subset of the ranges covered by vdev_im_new_segments. 
*/ - range_tree_t *svr_frees[TXG_SIZE]; + zfs_range_tree_t *svr_frees[TXG_SIZE]; /* * Number of bytes which we have finished our work for diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 90395cad6e45..5212751f9a63 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2193,7 +2193,7 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] != NULL) { - range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); + zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); } if (dn->dn_nlevels == 1) { @@ -2400,7 +2400,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { - range_tree_clear(dn->dn_free_ranges[txgoff], + zfs_range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ecc6761f8fa4..ce2c79dbfaa3 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2435,11 +2435,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = range_tree_create(NULL, - RANGE_SEG64, NULL, 0, 0); + dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL, + ZFS_RANGE_SEG64, NULL, 0, 0); } - range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); - range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); + zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); + zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); } dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", (u_longlong_t)blkid, (u_longlong_t)nblks, @@ -2482,7 +2482,7 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid) mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_free_ranges[i] != NULL && - range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) + zfs_range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 122d7d0d17d8..c82f45145d4b 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -720,7 +720,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec || - !range_tree_is_empty(dn->dn_free_ranges[txgoff])); + !zfs_range_tree_is_empty(dn->dn_free_ranges[txgoff])); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; @@ -786,21 +786,22 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dsfra.dsfra_free_indirects = freeing_dnode; mutex_enter(&dn->dn_mtx); if (freeing_dnode) { - ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], - 0, dn->dn_maxblkid + 1)); + ASSERT(zfs_range_tree_contains( + dn->dn_free_ranges[txgoff], 0, + dn->dn_maxblkid + 1)); } /* * Because dnode_sync_free_range() must drop dn_mtx during its - * processing, using it as a callback to range_tree_vacate() is - * not safe. No other operations (besides destroy) are allowed - * once range_tree_vacate() has begun, and dropping dn_mtx - * would leave a window open for another thread to observe that - * invalid (and unsafe) state. + * processing, using it as a callback to zfs_range_tree_vacate() + * is not safe. 
No other operations (besides destroy) are + * allowed once zfs_range_tree_vacate() has begun, and dropping + * dn_mtx would leave a window open for another thread to + * observe that invalid (and unsafe) state. */ - range_tree_walk(dn->dn_free_ranges[txgoff], + zfs_range_tree_walk(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); - range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); - range_tree_destroy(dn->dn_free_ranges[txgoff]); + zfs_range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); + zfs_range_tree_destroy(dn->dn_free_ranges[txgoff]); dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 5ae96882935c..aabf41bec92e 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -652,8 +652,8 @@ dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { - VERIFY(range_tree_is_empty(ms->ms_freeing)); - VERIFY(range_tree_is_empty(ms->ms_checkpointing)); + VERIFY(zfs_range_tree_is_empty(ms->ms_freeing)); + VERIFY(zfs_range_tree_is_empty(ms->ms_checkpointing)); } } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 3eba4cb35cc6..bc5c3cb9a670 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -321,7 +321,7 @@ struct dsl_scan_io_queue { zio_t *q_zio; /* scn_zio_root child for waiting on IO */ /* trees used for sorting I/Os and extents of I/Os */ - range_tree_t *q_exts_by_addr; + zfs_range_tree_t *q_exts_by_addr; zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; @@ -814,7 +814,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, NULL); - ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); + ASSERT3P(zfs_range_tree_first(q->q_exts_by_addr), ==, + NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } @@ -3277,13 +3278,14 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) /* * This function removes sios from an IO queue which reside within a given - * range_seg_t and inserts them (in offset order) into a list. Note that + * zfs_range_seg_t and inserts them (in offset order) into a list. Note that * we only ever return a maximum of 32 sios at once. If there are more sios * to process within this segment that did not make it onto the list we * return B_TRUE and otherwise B_FALSE. 
*/ static boolean_t -scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) +scan_io_queue_gather(dsl_scan_io_queue_t *queue, zfs_range_seg_t *rs, + list_t *list) { scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; @@ -3295,7 +3297,7 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; - SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); + SIO_SET_OFFSET(srch_sio, zfs_rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, @@ -3307,11 +3309,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); - while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + while (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs, queue->q_exts_by_addr) && num_sios <= 32) { - ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, + ASSERT3U(SIO_GET_OFFSET(sio), >=, zfs_rs_get_start(rs, queue->q_exts_by_addr)); - ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, zfs_rs_get_end(rs, queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); @@ -3332,19 +3334,20 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ - if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + if (sio != NULL && SIO_GET_OFFSET(sio) < zfs_rs_get_end(rs, queue->q_exts_by_addr)) { - range_tree_adjust_fill(queue->q_exts_by_addr, rs, + zfs_range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); - range_tree_resize_segment(queue->q_exts_by_addr, rs, - SIO_GET_OFFSET(sio), rs_get_end(rs, + zfs_range_tree_resize_segment(queue->q_exts_by_addr, rs, + SIO_GET_OFFSET(sio), zfs_rs_get_end(rs, queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); queue->q_last_ext_addr = SIO_GET_OFFSET(sio); return (B_TRUE); } else { - uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); - uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); - range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); + uint64_t rstart = zfs_rs_get_start(rs, queue->q_exts_by_addr); + uint64_t rend = zfs_rs_get_end(rs, queue->q_exts_by_addr); + zfs_range_tree_remove(queue->q_exts_by_addr, rstart, rend - + rstart); queue->q_last_ext_addr = -1; return (B_FALSE); } @@ -3361,11 +3364,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) * memory limit. * 3) Otherwise we don't select any extents. */ -static range_seg_t * +static zfs_range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; - range_tree_t *rt = queue->q_exts_by_addr; + zfs_range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); @@ -3384,7 +3387,7 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) */ if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) || zfs_scan_issue_strategy == 1) - return (range_tree_first(rt)); + return (zfs_range_tree_first(rt)); /* * Try to continue previous extent if it is not completed yet. 
After @@ -3393,10 +3396,10 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) */ uint64_t start; uint64_t size = 1ULL << rt->rt_shift; - range_seg_t *addr_rs; + zfs_range_seg_t *addr_rs; if (queue->q_last_ext_addr != -1) { start = queue->q_last_ext_addr; - addr_rs = range_tree_find(rt, start, size); + addr_rs = zfs_range_tree_find(rt, start, size); if (addr_rs != NULL) return (addr_rs); } @@ -3413,10 +3416,10 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) * We need to get the original entry in the by_addr tree so we can * modify it. */ - addr_rs = range_tree_find(rt, start, size); + addr_rs = zfs_range_tree_find(rt, start, size); ASSERT3P(addr_rs, !=, NULL); - ASSERT3U(rs_get_start(addr_rs, rt), ==, start); - ASSERT3U(rs_get_end(addr_rs, rt), >, start); + ASSERT3U(zfs_rs_get_start(addr_rs, rt), ==, start); + ASSERT3U(zfs_rs_get_end(addr_rs, rt), >, start); return (addr_rs); } @@ -3426,7 +3429,7 @@ scan_io_queues_run_one(void *arg) dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; - range_seg_t *rs; + zfs_range_seg_t *rs; scan_io_t *sio; zio_t *zio; list_t sio_list; @@ -4723,7 +4726,7 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) } avl_insert(&queue->q_sios_by_addr, sio, idx); queue->q_sio_memused += SIO_GET_MUSED(sio); - range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), + zfs_range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)); } @@ -4983,7 +4986,7 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, ext_size_compare) static void -ext_size_create(range_tree_t *rt, void *arg) +ext_size_create(zfs_range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; @@ -4993,7 +4996,7 @@ ext_size_create(range_tree_t *rt, void *arg) } static void -ext_size_destroy(range_tree_t *rt, void *arg) +ext_size_destroy(zfs_range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; @@ -5003,7 +5006,7 @@ ext_size_destroy(range_tree_t *rt, void *arg) } static uint64_t -ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg) +ext_size_value(zfs_range_tree_t *rt, range_seg_gap_t *rsg) { (void) rt; uint64_t size = rsg->rs_end - rsg->rs_start; @@ -5014,25 +5017,25 @@ ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg) } static void -ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg) +ext_size_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; - ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); + ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP); uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); zfs_btree_add(size_tree, &v); } static void -ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +ext_size_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; - ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); + ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP); uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); zfs_btree_remove(size_tree, &v); } static void -ext_size_vacate(range_tree_t *rt, void *arg) +ext_size_vacate(zfs_range_tree_t *rt, void *arg) { zfs_btree_t *size_tree = arg; zfs_btree_clear(size_tree); @@ -5041,7 +5044,7 @@ ext_size_vacate(range_tree_t *rt, void *arg) ext_size_create(rt, arg); } -static const range_tree_ops_t ext_size_ops = { +static const zfs_range_tree_ops_t ext_size_ops = { .rtop_create = ext_size_create, .rtop_destroy = ext_size_destroy, .rtop_add = ext_size_add, @@ -5073,8 +5076,9 @@ scan_io_queue_create(vdev_t *vd) q->q_sio_memused = 
0; q->q_last_ext_addr = -1; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); - q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP, - &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap); + q->q_exts_by_addr = zfs_range_tree_create_gap(&ext_size_ops, + ZFS_RANGE_SEG_GAP, &q->q_exts_by_size, 0, vd->vdev_ashift, + zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); @@ -5099,15 +5103,15 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) atomic_add_64(&scn->scn_queues_pending, -1); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { - ASSERT(range_tree_contains(queue->q_exts_by_addr, + ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); queue->q_sio_memused -= SIO_GET_MUSED(sio); sio_free(sio); } ASSERT0(queue->q_sio_memused); - range_tree_vacate(queue->q_exts_by_addr, NULL, queue); - range_tree_destroy(queue->q_exts_by_addr); + zfs_range_tree_vacate(queue->q_exts_by_addr, NULL, queue); + zfs_range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); cv_destroy(&queue->q_zio_cv); @@ -5184,10 +5188,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) * 1) Cold, just sitting in the queue of zio's to be issued at * some point in the future. In this case, all we do is * remove the zio from the q_sios_by_addr tree, decrement - * its data volume from the containing range_seg_t and + * its data volume from the containing zfs_range_seg_t and * resort the q_exts_by_size tree to reflect that the - * range_seg_t has lost some of its 'fill'. We don't shorten - * the range_seg_t - this is usually rare enough not to be + * zfs_range_seg_t has lost some of its 'fill'. We don't shorten + * the zfs_range_seg_t - this is usually rare enough not to be * worth the extra hassle of trying keep track of precise * extent boundaries. 
* 2) Hot, where the zio is currently in-flight in @@ -5211,8 +5215,9 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) atomic_add_64(&scn->scn_queues_pending, -1); queue->q_sio_memused -= SIO_GET_MUSED(sio); - ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); - range_tree_remove_fill(queue->q_exts_by_addr, start, size); + ASSERT(zfs_range_tree_contains(queue->q_exts_by_addr, start, + size)); + zfs_range_tree_remove_fill(queue->q_exts_by_addr, start, size); /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 353a99605913..10546798824a 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -347,7 +347,8 @@ static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); -static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); +static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, + void *arg); kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { @@ -1379,7 +1380,7 @@ typedef struct metaslab_rt_arg { } metaslab_rt_arg_t; struct mssa_arg { - range_tree_t *rt; + zfs_range_tree_t *rt; metaslab_rt_arg_t *mra; }; @@ -1387,16 +1388,16 @@ static void metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) { struct mssa_arg *mssap = arg; - range_tree_t *rt = mssap->rt; + zfs_range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; range_seg_max_t seg = {0}; - rs_set_start(&seg, rt, start); - rs_set_end(&seg, rt, start + size); + zfs_rs_set_start(&seg, rt, start); + zfs_rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); } static void -metaslab_size_tree_full_load(range_tree_t *rt) +metaslab_size_tree_full_load(zfs_range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; METASLABSTAT_BUMP(metaslabstat_reload_tree); @@ -1405,7 +1406,7 @@ metaslab_size_tree_full_load(range_tree_t *rt) struct mssa_arg arg = {0}; arg.rt = rt; arg.mra = mrap; - range_tree_walk(rt, metaslab_size_sorted_add, &arg); + zfs_range_tree_walk(rt, metaslab_size_sorted_add, &arg); } @@ -1417,10 +1418,11 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, /* * Create any block allocator specific components. The current allocators - * rely on using both a size-ordered range_tree_t and an array of uint64_t's. + * rely on using both a size-ordered zfs_range_tree_t and an array of + * uint64_t's. 
*/ static void -metaslab_rt_create(range_tree_t *rt, void *arg) +metaslab_rt_create(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; @@ -1429,12 +1431,12 @@ metaslab_rt_create(range_tree_t *rt, void *arg) int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; - case RANGE_SEG64: + case ZFS_RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; @@ -1447,7 +1449,7 @@ metaslab_rt_create(range_tree_t *rt, void *arg) } static void -metaslab_rt_destroy(range_tree_t *rt, void *arg) +metaslab_rt_destroy(zfs_range_tree_t *rt, void *arg) { (void) rt; metaslab_rt_arg_t *mrap = arg; @@ -1458,12 +1460,12 @@ metaslab_rt_destroy(range_tree_t *rt, void *arg) } static void -metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; - if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < + if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; @@ -1471,12 +1473,12 @@ metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) } static void -metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +metaslab_rt_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; - if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << + if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << mrap->mra_floor_shift)) return; @@ -1484,7 +1486,7 @@ metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) } static void -metaslab_rt_vacate(range_tree_t *rt, void *arg) +metaslab_rt_vacate(zfs_range_tree_t *rt, void *arg) { metaslab_rt_arg_t *mrap = arg; zfs_btree_t *size_tree = mrap->mra_bt; @@ -1494,7 +1496,7 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg) metaslab_rt_create(rt, arg); } -static const range_tree_ops_t metaslab_rt_ops = { +static const zfs_range_tree_ops_t metaslab_rt_ops = { .rtop_create = metaslab_rt_create, .rtop_destroy = metaslab_rt_destroy, .rtop_add = metaslab_rt_add, @@ -1515,7 +1517,7 @@ uint64_t metaslab_largest_allocatable(metaslab_t *msp) { zfs_btree_t *t = &msp->ms_allocatable_by_size; - range_seg_t *rs; + zfs_range_seg_t *rs; if (t == NULL) return (0); @@ -1526,7 +1528,7 @@ metaslab_largest_allocatable(metaslab_t *msp) if (rs == NULL) return (0); - return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, + return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs, msp->ms_allocatable)); } @@ -1544,7 +1546,7 @@ metaslab_largest_unflushed_free(metaslab_t *msp) if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) metaslab_size_tree_full_load(msp->ms_unflushed_frees); - range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, + zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, NULL); if (rs == NULL) return (0); @@ -1572,13 +1574,13 @@ metaslab_largest_unflushed_free(metaslab_t *msp) * the largest segment; there may be other usable chunks in the * largest segment, but we ignore them. 
*/ - uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); - uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; + uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees); + uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart; for (int t = 0; t < TXG_DEFER_SIZE; t++) { uint64_t start = 0; uint64_t size = 0; - boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, - rsize, &start, &size); + boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t], + rstart, rsize, &start, &size); if (found) { if (rstart == start) return (0); @@ -1588,7 +1590,7 @@ metaslab_largest_unflushed_free(metaslab_t *msp) uint64_t start = 0; uint64_t size = 0; - boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart, rsize, &start, &size); if (found) rsize = start - rstart; @@ -1596,15 +1598,15 @@ metaslab_largest_unflushed_free(metaslab_t *msp) return (rsize); } -static range_seg_t * -metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, +static zfs_range_seg_t * +metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, uint64_t size, zfs_btree_index_t *where) { - range_seg_t *rs; + zfs_range_seg_t *rs; range_seg_max_t rsearch; - rs_set_start(&rsearch, rt, start); - rs_set_end(&rsearch, rt, start + size); + zfs_rs_set_start(&rsearch, rt, start); + zfs_rs_set_end(&rsearch, rt, start + size); rs = zfs_btree_find(t, &rsearch, where); if (rs == NULL) { @@ -1620,24 +1622,25 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, * for a block that matches the specified criteria. */ static uint64_t -metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, +metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size, uint64_t max_search) { if (*cursor == 0) *cursor = rt->rt_start; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; - range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); + zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, + &where); uint64_t first_found; int count_searched = 0; if (rs != NULL) - first_found = rs_get_start(rs, rt); + first_found = zfs_rs_get_start(rs, rt); - while (rs != NULL && (rs_get_start(rs, rt) - first_found <= + while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <= max_search || count_searched < metaslab_min_search_count)) { - uint64_t offset = rs_get_start(rs, rt); - if (offset + size <= rs_get_end(rs, rt)) { + uint64_t offset = zfs_rs_get_start(rs, rt); + if (offset + size <= zfs_rs_get_end(rs, rt)) { *cursor = offset + size; return (offset); } @@ -1748,8 +1751,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) */ uint64_t align = size & -size; uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; - range_tree_t *rt = msp->ms_allocatable; - uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; + zfs_range_tree_t *rt = msp->ms_allocatable; + uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; uint64_t offset; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -1767,7 +1770,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) } if (offset == -1) { - range_seg_t *rs; + zfs_range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); @@ -1780,9 +1783,9 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } - if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs, 
- rt)) { - offset = rs_get_start(rs, rt); + if (rs != NULL && zfs_rs_get_start(rs, rt) + size <= + zfs_rs_get_end(rs, rt)) { + offset = zfs_rs_get_start(rs, rt); *cursor = offset + size; } } @@ -1802,7 +1805,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { - range_tree_t *rt = msp->ms_allocatable; + zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; uint64_t *cursor = &msp->ms_lbas[0]; uint64_t *cursor_end = &msp->ms_lbas[1]; @@ -1813,17 +1816,17 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) ASSERT3U(*cursor_end, >=, *cursor); if ((*cursor + size) > *cursor_end) { - range_seg_t *rs; + zfs_range_seg_t *rs; if (zfs_btree_numnodes(t) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); rs = zfs_btree_last(t, NULL); - if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < - size) + if (rs == NULL || (zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)) < size) return (-1ULL); - *cursor = rs_get_start(rs, rt); - *cursor_end = rs_get_end(rs, rt); + *cursor = zfs_rs_get_start(rs, rt); + *cursor_end = zfs_rs_get_end(rs, rt); } offset = *cursor; @@ -1851,9 +1854,9 @@ static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { zfs_btree_t *t = &msp->ms_allocatable->rt_root; - range_tree_t *rt = msp->ms_allocatable; + zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; - range_seg_t *rs; + zfs_range_seg_t *rs; range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; @@ -1864,15 +1867,16 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) if (max_size < size) return (-1ULL); - rs_set_start(&rsearch, rt, *cursor); - rs_set_end(&rsearch, rt, *cursor + size); + zfs_rs_set_start(&rsearch, rt, *cursor); + zfs_rs_set_end(&rsearch, rt, *cursor + size); rs = zfs_btree_find(t, &rsearch, &where); - if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { + if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < + size) { t = &msp->ms_allocatable_by_size; - rs_set_start(&rsearch, rt, 0); - rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + zfs_rs_set_start(&rsearch, rt, 0); + zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + metaslab_ndf_clump_shift))); rs = zfs_btree_find(t, &rsearch, &where); @@ -1881,9 +1885,9 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) ASSERT(rs != NULL); } - if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { - *cursor = rs_get_start(rs, rt) + size; - return (rs_get_start(rs, rt)); + if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) { + *cursor = zfs_rs_get_start(rs, rt) + size; + return (zfs_rs_get_start(rs, rt)); } return (-1ULL); } @@ -1973,12 +1977,12 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); ASSERT3U(space_map_allocated(msp->ms_sm), >=, - range_tree_space(msp->ms_unflushed_frees)); + zfs_range_tree_space(msp->ms_unflushed_frees)); ASSERT3U(metaslab_allocated_space(msp), ==, space_map_allocated(msp->ms_sm) + - range_tree_space(msp->ms_unflushed_allocs) - - range_tree_space(msp->ms_unflushed_frees)); + zfs_range_tree_space(msp->ms_unflushed_allocs) - + zfs_range_tree_space(msp->ms_unflushed_frees)); sm_free_space = msp->ms_size - metaslab_allocated_space(msp); @@ -1988,17 +1992,19 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg) */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { allocating += - range_tree_space(msp->ms_allocating[(txg + 
t) & TXG_MASK]); + zfs_range_tree_space(msp->ms_allocating[(txg + t) & + TXG_MASK]); } ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, msp->ms_allocating_total); ASSERT3U(msp->ms_deferspace, ==, - range_tree_space(msp->ms_defer[0]) + - range_tree_space(msp->ms_defer[1])); + zfs_range_tree_space(msp->ms_defer[0]) + + zfs_range_tree_space(msp->ms_defer[1])); - msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + - msp->ms_deferspace + range_tree_space(msp->ms_freed); + msp_free_space = zfs_range_tree_space(msp->ms_allocatable) + + allocating + msp->ms_deferspace + + zfs_range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); } @@ -2019,7 +2025,7 @@ metaslab_aux_histograms_clear(metaslab_t *msp) static void metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, - range_tree_t *rt) + zfs_range_tree_t *rt) { /* * This is modeled after space_map_histogram_add(), so refer to that @@ -2167,7 +2173,7 @@ metaslab_verify_weight_and_frag(metaslab_t *msp) /* some extra verification for in-core tree if you can */ if (msp->ms_loaded) { - range_tree_stat_verify(msp->ms_allocatable); + zfs_range_tree_stat_verify(msp->ms_allocatable); VERIFY(space_map_histogram_verify(msp->ms_sm, msp->ms_allocatable)); } @@ -2355,8 +2361,8 @@ metaslab_load_impl(metaslab_t *msp) struct mssa_arg arg = {0}; arg.rt = msp->ms_allocatable; arg.mra = mrap; - range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, - &arg); + zfs_range_tree_walk(msp->ms_allocatable, + metaslab_size_sorted_add, &arg); } else { /* * Add the size-sorted tree first, since we don't need to load @@ -2370,7 +2376,7 @@ metaslab_load_impl(metaslab_t *msp) * all the space in the metaslab as free and add it to the * ms_allocatable tree. */ - range_tree_add(msp->ms_allocatable, + zfs_range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); if (msp->ms_new) { @@ -2381,8 +2387,10 @@ metaslab_load_impl(metaslab_t *msp) * expect any unflushed allocs or frees from previous * TXGs. */ - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + ASSERT(zfs_range_tree_is_empty( + msp->ms_unflushed_allocs)); + ASSERT(zfs_range_tree_is_empty( + msp->ms_unflushed_frees)); } } @@ -2412,10 +2420,10 @@ metaslab_load_impl(metaslab_t *msp) * away so any manipulations we do below have a clear view * of what is allocated and what is free. */ - range_tree_walk(msp->ms_unflushed_allocs, - range_tree_remove, msp->ms_allocatable); - range_tree_walk(msp->ms_unflushed_frees, - range_tree_add, msp->ms_allocatable); + zfs_range_tree_walk(msp->ms_unflushed_allocs, + zfs_range_tree_remove, msp->ms_allocatable); + zfs_range_tree_walk(msp->ms_unflushed_frees, + zfs_range_tree_add, msp->ms_allocatable); ASSERT3P(msp->ms_group, !=, NULL); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; @@ -2443,8 +2451,8 @@ metaslab_load_impl(metaslab_t *msp) * correctly doesn't contain any segments that exist * in ms_freed [see ms_synced_length]. */ - range_tree_walk(msp->ms_freed, - range_tree_remove, msp->ms_allocatable); + zfs_range_tree_walk(msp->ms_freed, + zfs_range_tree_remove, msp->ms_allocatable); } /* @@ -2462,8 +2470,8 @@ metaslab_load_impl(metaslab_t *msp) * code path. 
*/ for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, msp->ms_allocatable); + zfs_range_tree_walk(msp->ms_defer[t], + zfs_range_tree_remove, msp->ms_allocatable); } /* @@ -2498,11 +2506,11 @@ metaslab_load_impl(metaslab_t *msp) (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, (u_longlong_t)space_map_length(msp->ms_sm), - (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), - (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), - (u_longlong_t)range_tree_space(msp->ms_freed), - (u_longlong_t)range_tree_space(msp->ms_defer[0]), - (u_longlong_t)range_tree_space(msp->ms_defer[1]), + (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs), + (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees), + (u_longlong_t)zfs_range_tree_space(msp->ms_freed), + (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]), + (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]), (longlong_t)((load_start - msp->ms_unload_time) / 1000000), (longlong_t)((load_end - load_start) / 1000000), (u_longlong_t)msp->ms_max_size, @@ -2584,7 +2592,7 @@ metaslab_unload(metaslab_t *msp) if (!msp->ms_loaded) return; - range_tree_vacate(msp->ms_allocatable, NULL, NULL); + zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; msp->ms_unload_time = gethrtime(); @@ -2640,7 +2648,7 @@ metaslab_unload(metaslab_t *msp) * the vdev_ms_shift - the vdev_ashift is less than 32, we can store * the ranges using two uint32_ts, rather than two uint64_ts. */ -range_seg_type_t +zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, uint64_t *start, uint64_t *shift) { @@ -2648,11 +2656,11 @@ metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp, !zfs_metaslab_force_large_segs) { *shift = vdev->vdev_ashift; *start = msp->ms_start; - return (RANGE_SEG32); + return (ZFS_RANGE_SEG32); } else { *shift = 0; *start = 0; - return (RANGE_SEG64); + return (ZFS_RANGE_SEG64); } } @@ -2738,32 +2746,33 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, } uint64_t shift, start; - range_seg_type_t type = + zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); - ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); + ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, + shift); for (int t = 0; t < TXG_SIZE; t++) { - ms->ms_allocating[t] = range_tree_create(NULL, type, + ms->ms_allocating[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } - ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); - ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); + ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ms->ms_defer[t] = range_tree_create(NULL, type, NULL, + ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, start, shift); } ms->ms_checkpointing = - range_tree_create(NULL, type, NULL, start, shift); + zfs_range_tree_create(NULL, type, NULL, start, shift); ms->ms_unflushed_allocs = - range_tree_create(NULL, type, NULL, start, shift); + zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; - ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + ms->ms_unflushed_frees = 
zfs_range_tree_create(&metaslab_rt_ops, type, mrap, start, shift); - ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); + ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); @@ -2817,8 +2826,8 @@ metaslab_fini_flush_data(metaslab_t *msp) uint64_t metaslab_unflushed_changes_memused(metaslab_t *ms) { - return ((range_tree_numsegs(ms->ms_unflushed_allocs) + - range_tree_numsegs(ms->ms_unflushed_frees)) * + return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) + + zfs_range_tree_numsegs(ms->ms_unflushed_frees)) * ms->ms_unflushed_allocs->rt_root.bt_elem_size); } @@ -2851,33 +2860,33 @@ metaslab_fini(metaslab_t *msp) metaslab_unload(msp); - range_tree_destroy(msp->ms_allocatable); - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); + zfs_range_tree_destroy(msp->ms_allocatable); + zfs_range_tree_destroy(msp->ms_freeing); + zfs_range_tree_destroy(msp->ms_freed); ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_allocs); - range_tree_destroy(msp->ms_checkpointing); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_frees); + zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + zfs_range_tree_destroy(msp->ms_unflushed_allocs); + zfs_range_tree_destroy(msp->ms_checkpointing); + zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + zfs_range_tree_destroy(msp->ms_unflushed_frees); for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); + zfs_range_tree_destroy(msp->ms_allocating[t]); } for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); + zfs_range_tree_destroy(msp->ms_defer[t]); } ASSERT0(msp->ms_deferspace); for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); - range_tree_vacate(msp->ms_trim, NULL, NULL); - range_tree_destroy(msp->ms_trim); + zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); + zfs_range_tree_destroy(msp->ms_trim); mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); @@ -3445,7 +3454,7 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) * lock. */ if (msp->ms_weight == 0) { - ASSERT0(range_tree_space(msp->ms_allocatable)); + ASSERT0(zfs_range_tree_space(msp->ms_allocatable)); return (SET_ERROR(ENOSPC)); } @@ -3504,7 +3513,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) */ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || size >= SPA_MINBLOCKSIZE || - range_tree_space(msp->ms_allocatable) == 0); + zfs_range_tree_space(msp->ms_allocatable) == 0); ASSERT0(weight & METASLAB_ACTIVE_MASK); ASSERT(msp->ms_activation_weight != 0); @@ -3635,7 +3644,7 @@ metaslab_should_condense(metaslab_t *msp) * We always condense metaslabs that are empty and metaslabs for * which a condense request has been made. 
*/ - if (range_tree_numsegs(msp->ms_allocatable) == 0 || + if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0 || msp->ms_condense_wanted) return (B_TRUE); @@ -3659,7 +3668,7 @@ metaslab_should_condense(metaslab_t *msp) static void metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) { - range_tree_t *condense_tree; + zfs_range_tree_t *condense_tree; space_map_t *sm = msp->ms_sm; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = msp->ms_group->mg_vd->vdev_spa; @@ -3711,41 +3720,41 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * metaslab_flush_update(). */ ASSERT3U(spa_sync_pass(spa), ==, 1); - ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ + ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, " "spa %s, smp size %llu, segments %llu, forcing condense=%s", (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, (u_longlong_t)msp->ms_group->mg_vd->vdev_id, spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), - (u_longlong_t)range_tree_numsegs(msp->ms_allocatable), + (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable), msp->ms_condense_wanted ? "TRUE" : "FALSE"); msp->ms_condense_wanted = B_FALSE; - range_seg_type_t type; + zfs_range_seg_type_t type; uint64_t shift, start; type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); - condense_tree = range_tree_create(NULL, type, NULL, start, shift); + condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_add, condense_tree); + zfs_range_tree_walk(msp->ms_defer[t], + zfs_range_tree_add, condense_tree); } for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], - range_tree_add, condense_tree); + zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], + zfs_range_tree_add, condense_tree); } ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); /* * We're about to drop the metaslab's lock thus allowing other @@ -3785,17 +3794,17 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. 
*/ - range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start, - shift); - range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); + zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL, + start, shift); + zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx); - range_tree_vacate(condense_tree, NULL, NULL); - range_tree_destroy(condense_tree); - range_tree_vacate(tmp_tree, NULL, NULL); - range_tree_destroy(tmp_tree); + zfs_range_tree_vacate(condense_tree, NULL, NULL); + zfs_range_tree_destroy(condense_tree); + zfs_range_tree_vacate(tmp_tree, NULL, NULL); + zfs_range_tree_destroy(tmp_tree); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; @@ -3808,8 +3817,8 @@ metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); mutex_enter(&spa->spa_flushed_ms_lock); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); @@ -3829,8 +3838,8 @@ metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); @@ -3950,7 +3959,7 @@ metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) space_map_histogram_clear(msp->ms_sm); space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); - ASSERT(range_tree_is_empty(msp->ms_freed)); + ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); for (int t = 0; t < TXG_DEFER_SIZE; t++) { space_map_histogram_add(msp->ms_sm, msp->ms_defer[t], tx); @@ -3992,8 +4001,10 @@ metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) spa_name(spa), (u_longlong_t)msp->ms_group->mg_vd->vdev_id, (u_longlong_t)msp->ms_id, - (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), - (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), + (u_longlong_t)zfs_range_tree_space( + msp->ms_unflushed_allocs), + (u_longlong_t)zfs_range_tree_space( + msp->ms_unflushed_frees), (u_longlong_t)(sm_len_after - sm_len_before)); } @@ -4001,8 +4012,8 @@ metaslab_flush(metaslab_t *msp, dmu_tx_t *tx) metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); metaslab_verify_space(msp, dmu_tx_get_txg(tx)); metaslab_verify_weight_and_frag(msp); @@ -4027,7 +4038,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; + zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; 
dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); @@ -4036,11 +4047,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * This metaslab has just been added so there's no work to do now. */ if (msp->ms_new) { - ASSERT0(range_tree_space(alloctree)); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_freed)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); - ASSERT0(range_tree_space(msp->ms_trim)); + ASSERT0(zfs_range_tree_space(alloctree)); + ASSERT0(zfs_range_tree_space(msp->ms_freeing)); + ASSERT0(zfs_range_tree_space(msp->ms_freed)); + ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); + ASSERT0(zfs_range_tree_space(msp->ms_trim)); return; } @@ -4055,9 +4066,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * we preserve the utility of the VERIFY statements in all other * cases. */ - if (range_tree_is_empty(alloctree) && - range_tree_is_empty(msp->ms_freeing) && - range_tree_is_empty(msp->ms_checkpointing) && + if (zfs_range_tree_is_empty(alloctree) && + zfs_range_tree_is_empty(msp->ms_freeing) && + zfs_range_tree_is_empty(msp->ms_checkpointing) && !(msp->ms_loaded && msp->ms_condense_wanted && txg <= spa_final_dirty_txg(spa))) return; @@ -4099,12 +4110,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) msp->ms_start, msp->ms_size, vd->vdev_ashift)); ASSERT(msp->ms_sm != NULL); - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); ASSERT0(metaslab_allocated_space(msp)); } - if (!range_tree_is_empty(msp->ms_checkpointing) && + if (!zfs_range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); @@ -4166,9 +4177,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_unflushed_changes_memused(msp)); spa->spa_unflushed_stats.sus_memused -= metaslab_unflushed_changes_memused(msp); - range_tree_remove_xor_add(alloctree, + zfs_range_tree_remove_xor_add(alloctree, msp->ms_unflushed_frees, msp->ms_unflushed_allocs); - range_tree_remove_xor_add(msp->ms_freeing, + zfs_range_tree_remove_xor_add(msp->ms_freeing, msp->ms_unflushed_allocs, msp->ms_unflushed_frees); spa->spa_unflushed_stats.sus_memused += metaslab_unflushed_changes_memused(msp); @@ -4182,12 +4193,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); } - msp->ms_allocated_space += range_tree_space(alloctree); + msp->ms_allocated_space += zfs_range_tree_space(alloctree); ASSERT3U(msp->ms_allocated_space, >=, - range_tree_space(msp->ms_freeing)); - msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); + zfs_range_tree_space(msp->ms_freeing)); + msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing); - if (!range_tree_is_empty(msp->ms_checkpointing)) { + if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) { ASSERT(spa_has_checkpoint(spa)); ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); @@ -4203,13 +4214,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) mutex_enter(&msp->ms_lock); spa->spa_checkpoint_info.sci_dspace += - range_tree_space(msp->ms_checkpointing); + zfs_range_tree_space(msp->ms_checkpointing); vd->vdev_stat.vs_checkpoint_space += - range_tree_space(msp->ms_checkpointing); + zfs_range_tree_space(msp->ms_checkpointing); ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, -space_map_allocated(vd->vdev_checkpoint_sm)); - range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL); } if 
(msp->ms_loaded) { @@ -4269,20 +4280,20 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * get appended to the ms_sm) so their ranges can be reused as usual. */ if (spa_sync_pass(spa) == 1) { - range_tree_swap(&msp->ms_freeing, &msp->ms_freed); + zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed); ASSERT0(msp->ms_allocated_this_txg); } else { - range_tree_vacate(msp->ms_freeing, - range_tree_add, msp->ms_freed); + zfs_range_tree_vacate(msp->ms_freeing, + zfs_range_tree_add, msp->ms_freed); } - msp->ms_allocated_this_txg += range_tree_space(alloctree); - range_tree_vacate(alloctree, NULL, NULL); + msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree); + zfs_range_tree_vacate(alloctree, NULL, NULL); - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) + ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); + ASSERT0(zfs_range_tree_space(msp->ms_freeing)); + ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); mutex_exit(&msp->ms_lock); @@ -4306,7 +4317,7 @@ metaslab_evict(metaslab_t *msp, uint64_t txg) return; for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { - VERIFY0(range_tree_space( + VERIFY0(zfs_range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } if (msp->ms_allocator != -1) @@ -4326,7 +4337,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; - range_tree_t **defer_tree; + zfs_range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; boolean_t defer_allowed = B_TRUE; @@ -4340,11 +4351,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* there should be no allocations nor frees at this point */ VERIFY0(msp->ms_allocated_this_txg); - VERIFY0(range_tree_space(msp->ms_freed)); + VERIFY0(zfs_range_tree_space(msp->ms_freed)); } - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); + ASSERT0(zfs_range_tree_space(msp->ms_freeing)); + ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; @@ -4357,13 +4368,13 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = 0; alloc_delta = msp->ms_allocated_this_txg - - range_tree_space(msp->ms_freed); + zfs_range_tree_space(msp->ms_freed); if (defer_allowed) { - defer_delta = range_tree_space(msp->ms_freed) - - range_tree_space(*defer_tree); + defer_delta = zfs_range_tree_space(msp->ms_freed) - + zfs_range_tree_space(*defer_tree); } else { - defer_delta -= range_tree_space(*defer_tree); + defer_delta -= zfs_range_tree_space(*defer_tree); } metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, defer_delta, 0); @@ -4390,13 +4401,14 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * frees not being trimmed. 
*/ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { - range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); + zfs_range_tree_walk(*defer_tree, zfs_range_tree_add, + msp->ms_trim); if (!defer_allowed) { - range_tree_walk(msp->ms_freed, range_tree_add, + zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add, msp->ms_trim); } } else { - range_tree_vacate(msp->ms_trim, NULL, NULL); + zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); } /* @@ -4405,13 +4417,13 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * the defer_tree -- this is safe to do because we've * just emptied out the defer_tree. */ - range_tree_vacate(*defer_tree, - msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); + zfs_range_tree_vacate(*defer_tree, + msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable); if (defer_allowed) { - range_tree_swap(&msp->ms_freed, defer_tree); + zfs_range_tree_swap(&msp->ms_freed, defer_tree); } else { - range_tree_vacate(msp->ms_freed, - msp->ms_loaded ? range_tree_add : NULL, + zfs_range_tree_vacate(msp->ms_freed, + msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable); } @@ -4442,10 +4454,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) */ metaslab_recalculate_weight_and_sort(msp); - ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); - ASSERT0(range_tree_space(msp->ms_freeing)); - ASSERT0(range_tree_space(msp->ms_freed)); - ASSERT0(range_tree_space(msp->ms_checkpointing)); + ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(zfs_range_tree_space(msp->ms_freeing)); + ASSERT0(zfs_range_tree_space(msp->ms_freed)); + ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); msp->ms_allocating_total -= msp->ms_allocated_this_txg; msp->ms_allocated_this_txg = 0; mutex_exit(&msp->ms_lock); @@ -4653,7 +4665,7 @@ static uint64_t metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) { uint64_t start; - range_tree_t *rt = msp->ms_allocatable; + zfs_range_tree_t *rt = msp->ms_allocatable; metaslab_class_t *mc = msp->ms_group->mg_class; ASSERT(MUTEX_HELD(&msp->ms_lock)); @@ -4668,14 +4680,15 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); - range_tree_remove(rt, start, size); - range_tree_clear(msp->ms_trim, start, size); + VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size); + zfs_range_tree_remove(rt, start, size); + zfs_range_tree_clear(msp->ms_trim, start, size); - if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); + zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, + size); msp->ms_allocating_total += size; /* Track the last successful allocation */ @@ -5395,16 +5408,16 @@ metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, metaslab_check_free_impl(vd, offset, asize); mutex_enter(&msp->ms_lock); - if (range_tree_is_empty(msp->ms_freeing) && - range_tree_is_empty(msp->ms_checkpointing)) { + if (zfs_range_tree_is_empty(msp->ms_freeing) && + zfs_range_tree_is_empty(msp->ms_checkpointing)) { vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); } if (checkpoint) { ASSERT(spa_has_checkpoint(spa)); - range_tree_add(msp->ms_checkpointing, offset, asize); + zfs_range_tree_add(msp->ms_checkpointing, offset, asize); } 
else { - range_tree_add(msp->ms_freeing, offset, asize); + zfs_range_tree_add(msp->ms_freeing, offset, asize); } mutex_exit(&msp->ms_lock); } @@ -5628,18 +5641,18 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; mutex_enter(&msp->ms_lock); - range_tree_remove(msp->ms_allocating[txg & TXG_MASK], + zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total -= size; VERIFY(!msp->ms_condensing); VERIFY3U(offset, >=, msp->ms_start); VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); - VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, + VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=, msp->ms_size); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - range_tree_add(msp->ms_allocatable, offset, size); + zfs_range_tree_add(msp->ms_allocatable, offset, size); mutex_exit(&msp->ms_lock); } @@ -5735,7 +5748,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, } if (error == 0 && - !range_tree_contains(msp->ms_allocatable, offset, size)) + !zfs_range_tree_contains(msp->ms_allocatable, offset, size)) error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ @@ -5746,10 +5759,10 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, VERIFY(!msp->ms_condensing); VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); - VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, + VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=, msp->ms_size); - range_tree_remove(msp->ms_allocatable, offset, size); - range_tree_clear(msp->ms_trim, offset, size); + zfs_range_tree_remove(msp->ms_allocatable, offset, size); + zfs_range_tree_clear(msp->ms_trim, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; @@ -5761,9 +5774,9 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, } multilist_sublist_unlock(mls); - if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) vdev_dirty(vd, VDD_METASLAB, msp, txg); - range_tree_add(msp->ms_allocating[txg & TXG_MASK], + zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], offset, size); msp->ms_allocating_total += size; } @@ -6020,7 +6033,7 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) mutex_enter(&msp->ms_lock); if (msp->ms_loaded) { - range_tree_verify_not_present(msp->ms_allocatable, + zfs_range_tree_verify_not_present(msp->ms_allocatable, offset, size); } @@ -6032,15 +6045,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) * allocated and freed in the same sync pass within the same txg. * Unfortunately there are places (e.g. the ZIL) where we allocate a * segment but then we free part of it within the same txg - * [see zil_sync()]. Thus, we don't call range_tree_verify() in the + * [see zil_sync()]. Thus, we don't call zfs_range_tree_verify() in the * current allocating tree. 
*/ - range_tree_verify_not_present(msp->ms_freeing, offset, size); - range_tree_verify_not_present(msp->ms_checkpointing, offset, size); - range_tree_verify_not_present(msp->ms_freed, offset, size); + zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size); + zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size); + zfs_range_tree_verify_not_present(msp->ms_freed, offset, size); for (int j = 0; j < TXG_DEFER_SIZE; j++) - range_tree_verify_not_present(msp->ms_defer[j], offset, size); - range_tree_verify_not_present(msp->ms_trim, offset, size); + zfs_range_tree_verify_not_present(msp->ms_defer[j], offset, + size); + zfs_range_tree_verify_not_present(msp->ms_trim, offset, size); mutex_exit(&msp->ms_lock); } diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 5174e2c46633..3cbd5712e1d3 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -42,11 +42,11 @@ * splitting in response to range add/remove requests. * * A range tree starts out completely empty, with no segments in it. - * Adding an allocation via range_tree_add to the range tree can either: + * Adding an allocation via zfs_range_tree_add to the range tree can either: * 1) create a new extent * 2) extend an adjacent extent * 3) merge two adjacent extents - * Conversely, removing an allocation via range_tree_remove can: + * Conversely, removing an allocation via zfs_range_tree_remove can: * 1) completely remove an extent * 2) shorten an extent (if the allocation was near one of its ends) * 3) split an extent into two extents, in effect punching a hole @@ -54,16 +54,16 @@ * A range tree is also capable of 'bridging' gaps when adding * allocations. This is useful for cases when close proximity of * allocations is an important detail that needs to be represented - * in the range tree. See range_tree_set_gap(). The default behavior + * in the range tree. See zfs_range_tree_set_gap(). The default behavior * is not to bridge gaps (i.e. the maximum allowed gap size is 0). * - * In order to traverse a range tree, use either the range_tree_walk() - * or range_tree_vacate() functions. + * In order to traverse a range tree, use either the zfs_range_tree_walk() + * or zfs_range_tree_vacate() functions. * * To obtain more accurate information on individual segment * operations that the range tree performs "under the hood", you can - * specify a set of callbacks by passing a range_tree_ops_t structure - * to the range_tree_create function. Any callbacks that are non-NULL + * specify a set of callbacks by passing a zfs_range_tree_ops_t structure + * to the zfs_range_tree_create function. Any callbacks that are non-NULL * are then called at the appropriate times. 
* * The range tree code also supports a special variant of range trees @@ -76,18 +76,18 @@ */ static inline void -rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) +zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt) { - ASSERT3U(rt->rt_type, <, RANGE_SEG_NUM_TYPES); + ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES); size_t size = 0; switch (rt->rt_type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: size = sizeof (range_seg32_t); break; - case RANGE_SEG64: + case ZFS_RANGE_SEG64: size = sizeof (range_seg64_t); break; - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); break; default: @@ -97,16 +97,17 @@ rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) } void -range_tree_stat_verify(range_tree_t *rt) +zfs_range_tree_stat_verify(zfs_range_tree_t *rt) { - range_seg_t *rs; + zfs_range_seg_t *rs; zfs_btree_index_t where; uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { - uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + uint64_t size = zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt); int idx = highbit64(size) - 1; hist[idx]++; @@ -124,9 +125,9 @@ range_tree_stat_verify(range_tree_t *rt) } static void -range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) +zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs) { - uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -138,9 +139,9 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) } static void -range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) +zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs) { - uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt); int idx = highbit64(size) - 1; ASSERT(size != 0); @@ -153,7 +154,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) __attribute__((always_inline)) inline static int -range_tree_seg32_compare(const void *x1, const void *x2) +zfs_range_tree_seg32_compare(const void *x1, const void *x2) { const range_seg32_t *r1 = x1; const range_seg32_t *r2 = x2; @@ -166,7 +167,7 @@ range_tree_seg32_compare(const void *x1, const void *x2) __attribute__((always_inline)) inline static int -range_tree_seg64_compare(const void *x1, const void *x2) +zfs_range_tree_seg64_compare(const void *x1, const void *x2) { const range_seg64_t *r1 = x1; const range_seg64_t *r2 = x2; @@ -179,7 +180,7 @@ range_tree_seg64_compare(const void *x1, const void *x2) __attribute__((always_inline)) inline static int -range_tree_seg_gap_compare(const void *x1, const void *x2) +zfs_range_tree_seg_gap_compare(const void *x1, const void *x2) { const range_seg_gap_t *r1 = x1; const range_seg_gap_t *r2 = x2; @@ -190,41 +191,42 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } -ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, - range_tree_seg32_compare) +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, range_seg32_t, + zfs_range_tree_seg32_compare) -ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, - range_tree_seg64_compare) +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, range_seg64_t, + zfs_range_tree_seg64_compare) 
-ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, - range_tree_seg_gap_compare) +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, range_seg_gap_t, + zfs_range_tree_seg_gap_compare) -range_tree_t * -range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, - void *arg, uint64_t start, uint64_t shift, uint64_t gap) +zfs_range_tree_t * +zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t gap) { - range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); + zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP); ASSERT3U(shift, <, 64); - ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); + ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); bt_find_in_buf_f bt_find; switch (type) { - case RANGE_SEG32: + case ZFS_RANGE_SEG32: size = sizeof (range_seg32_t); - compare = range_tree_seg32_compare; - bt_find = range_tree_seg32_find_in_buf; + compare = zfs_range_tree_seg32_compare; + bt_find = zfs_range_tree_seg32_find_in_buf; break; - case RANGE_SEG64: + case ZFS_RANGE_SEG64: size = sizeof (range_seg64_t); - compare = range_tree_seg64_compare; - bt_find = range_tree_seg64_find_in_buf; + compare = zfs_range_tree_seg64_compare; + bt_find = zfs_range_tree_seg64_find_in_buf; break; - case RANGE_SEG_GAP: + case ZFS_RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); - compare = range_tree_seg_gap_compare; - bt_find = range_tree_seg_gap_find_in_buf; + compare = zfs_range_tree_seg_gap_compare; + bt_find = zfs_range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); @@ -244,15 +246,15 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, return (rt); } -range_tree_t * -range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type, - void *arg, uint64_t start, uint64_t shift) +zfs_range_tree_t * +zfs_range_tree_create(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift) { - return (range_tree_create_gap(ops, type, arg, start, shift, 0)); + return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0)); } void -range_tree_destroy(range_tree_t *rt) +zfs_range_tree_destroy(zfs_range_tree_t *rt) { VERIFY0(rt->rt_space); @@ -264,35 +266,36 @@ range_tree_destroy(range_tree_t *rt) } void -range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) +zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs, + int64_t delta) { - if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) { + if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) { zfs_panic_recover("zfs: attempting to decrease fill to or " "below 0; probable double remove in segment [%llx:%llx]", - (longlong_t)rs_get_start(rs, rt), - (longlong_t)rs_get_end(rs, rt)); + (longlong_t)zfs_rs_get_start(rs, rt), + (longlong_t)zfs_rs_get_end(rs, rt)); } - if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) - - rs_get_start(rs, rt)) { + if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)) { zfs_panic_recover("zfs: attempting to increase fill beyond " "max; probable double add in segment [%llx:%llx]", - (longlong_t)rs_get_start(rs, rt), - (longlong_t)rs_get_end(rs, rt)); + (longlong_t)zfs_rs_get_start(rs, rt), + (longlong_t)zfs_rs_get_end(rs, rt)); } if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs_set_fill(rs, rt, 
rs_get_fill(rs, rt) + delta); + zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } static void -range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) +zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { - range_tree_t *rt = arg; + zfs_range_tree_t *rt = arg; zfs_btree_index_t where; - range_seg_t *rs_before, *rs_after, *rs; + zfs_range_seg_t *rs_before, *rs_after, *rs; range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; @@ -302,8 +305,8 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) ASSERT3U(fill, <=, size); ASSERT3U(start + size, >, start); - rs_set_start(&rsearch, rt, start); - rs_set_end(&rsearch, rt, end); + zfs_rs_set_start(&rsearch, rt, start); + zfs_rs_set_end(&rsearch, rt, end); rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* @@ -321,26 +324,26 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) (longlong_t)start, (longlong_t)size); return; } - uint64_t rstart = rs_get_start(rs, rt); - uint64_t rend = rs_get_end(rs, rt); + uint64_t rstart = zfs_rs_get_start(rs, rt); + uint64_t rend = zfs_rs_get_end(rs, rt); if (rstart <= start && rend >= end) { - range_tree_adjust_fill(rt, rs, fill); + zfs_range_tree_adjust_fill(rt, rs, fill); return; } if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - range_tree_stat_decr(rt, rs); + zfs_range_tree_stat_decr(rt, rs); rt->rt_space -= rend - rstart; - fill += rs_get_fill(rs, rt); + fill += zfs_rs_get_fill(rs, rt); start = MIN(start, rstart); end = MAX(end, rend); size = end - start; zfs_btree_remove(&rt->rt_root, rs); - range_tree_add_impl(rt, start, size, fill); + zfs_range_tree_add_impl(rt, start, size, fill); return; } @@ -355,15 +358,15 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); - merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= + merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >= start - gap); - merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + - gap); + merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <= + end + gap); if (merge_before && gap != 0) - bridge_size += start - rs_get_end(rs_before, rt); + bridge_size += start - zfs_rs_get_end(rs_before, rt); if (merge_after && gap != 0) - bridge_size += rs_get_start(rs_after, rt) - end; + bridge_size += zfs_rs_get_start(rs_after, rt) - end; if (merge_before && merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { @@ -371,13 +374,13 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } - range_tree_stat_decr(rt, rs_before); - range_tree_stat_decr(rt, rs_after); + zfs_range_tree_stat_decr(rt, rs_before); + zfs_range_tree_stat_decr(rt, rs_after); - rs_copy(rs_after, &tmp, rt); - uint64_t before_start = rs_get_start_raw(rs_before, rt); - uint64_t before_fill = rs_get_fill(rs_before, rt); - uint64_t after_fill = rs_get_fill(rs_after, rt); + zfs_rs_copy(rs_after, &tmp, rt); + uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt); + uint64_t before_fill = zfs_rs_get_fill(rs_before, rt); + uint64_t after_fill = zfs_rs_get_fill(rs_after, 
rt); zfs_btree_remove_idx(&rt->rt_root, &where_before); /* @@ -386,76 +389,76 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) */ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); ASSERT3P(rs_after, !=, NULL); - rs_set_start_raw(rs_after, rt, before_start); - rs_set_fill(rs_after, rt, after_fill + before_fill + fill); + zfs_rs_set_start_raw(rs_after, rt, before_start); + zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; } else if (merge_before) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); - range_tree_stat_decr(rt, rs_before); + zfs_range_tree_stat_decr(rt, rs_before); - uint64_t before_fill = rs_get_fill(rs_before, rt); - rs_set_end(rs_before, rt, end); - rs_set_fill(rs_before, rt, before_fill + fill); + uint64_t before_fill = zfs_rs_get_fill(rs_before, rt); + zfs_rs_set_end(rs_before, rt, end); + zfs_rs_set_fill(rs_before, rt, before_fill + fill); rs = rs_before; } else if (merge_after) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); - range_tree_stat_decr(rt, rs_after); + zfs_range_tree_stat_decr(rt, rs_after); - uint64_t after_fill = rs_get_fill(rs_after, rt); - rs_set_start(rs_after, rt, start); - rs_set_fill(rs_after, rt, after_fill + fill); + uint64_t after_fill = zfs_rs_get_fill(rs_after, rt); + zfs_rs_set_start(rs_after, rt, start); + zfs_rs_set_fill(rs_after, rt, after_fill + fill); rs = rs_after; } else { rs = &tmp; - rs_set_start(rs, rt, start); - rs_set_end(rs, rt, end); - rs_set_fill(rs, rt, fill); + zfs_rs_set_start(rs, rt, start); + zfs_rs_set_end(rs, rt, end); + zfs_rs_set_fill(rs, rt, fill); zfs_btree_add_idx(&rt->rt_root, rs, &where); } if (gap != 0) { - ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - - rs_get_start(rs, rt)); + ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)); } else { - ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - - rs_get_start(rs, rt)); + ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)); } if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); - range_tree_stat_incr(rt, rs); + zfs_range_tree_stat_incr(rt, rs); rt->rt_space += size + bridge_size; } void -range_tree_add(void *arg, uint64_t start, uint64_t size) +zfs_range_tree_add(void *arg, uint64_t start, uint64_t size) { - range_tree_add_impl(arg, start, size, size); + zfs_range_tree_add_impl(arg, start, size, size); } static void -range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, +zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, boolean_t do_fill) { zfs_btree_index_t where; - range_seg_t *rs; + zfs_range_seg_t *rs; range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); VERIFY3U(size, <=, rt->rt_space); - if (rt->rt_type == RANGE_SEG64) + if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); - rs_set_start(&rsearch, rt, start); - rs_set_end(&rsearch, rt, end); + zfs_rs_set_start(&rsearch, rt, start); + zfs_rs_set_end(&rsearch, rt, end); rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); /* Make sure we completely overlap with someone */ @@ -474,49 +477,49 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, */ if (rt->rt_gap != 0) { if (do_fill) { - if (rs_get_fill(rs, rt) == size) { - start = 
rs_get_start(rs, rt); - end = rs_get_end(rs, rt); + if (zfs_rs_get_fill(rs, rt) == size) { + start = zfs_rs_get_start(rs, rt); + end = zfs_rs_get_end(rs, rt); size = end - start; } else { - range_tree_adjust_fill(rt, rs, -size); + zfs_range_tree_adjust_fill(rt, rs, -size); return; } - } else if (rs_get_start(rs, rt) != start || - rs_get_end(rs, rt) != end) { + } else if (zfs_rs_get_start(rs, rt) != start || + zfs_rs_get_end(rs, rt) != end) { zfs_panic_recover("zfs: freeing partial segment of " "gap tree (offset=%llx size=%llx) of " "(offset=%llx size=%llx)", (longlong_t)start, (longlong_t)size, - (longlong_t)rs_get_start(rs, rt), - (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, - rt)); + (longlong_t)zfs_rs_get_start(rs, rt), + (longlong_t)zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)); return; } } - VERIFY3U(rs_get_start(rs, rt), <=, start); - VERIFY3U(rs_get_end(rs, rt), >=, end); + VERIFY3U(zfs_rs_get_start(rs, rt), <=, start); + VERIFY3U(zfs_rs_get_end(rs, rt), >=, end); - left_over = (rs_get_start(rs, rt) != start); - right_over = (rs_get_end(rs, rt) != end); + left_over = (zfs_rs_get_start(rs, rt) != start); + right_over = (zfs_rs_get_end(rs, rt) != end); - range_tree_stat_decr(rt, rs); + zfs_range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { range_seg_max_t newseg; - rs_set_start(&newseg, rt, end); - rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); - rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); - range_tree_stat_incr(rt, &newseg); + zfs_rs_set_start(&newseg, rt, end); + zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt)); + zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end); + zfs_range_tree_stat_incr(rt, &newseg); // This modifies the buffer already inside the range tree - rs_set_end(rs, rt, start); + zfs_rs_set_end(rs, rt, start); - rs_copy(rs, &rs_tmp, rt); + zfs_rs_copy(rs, &rs_tmp, rt); if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) zfs_btree_add_idx(&rt->rt_root, &newseg, &where); else @@ -526,12 +529,12 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); } else if (left_over) { // This modifies the buffer already inside the range tree - rs_set_end(rs, rt, start); - rs_copy(rs, &rs_tmp, rt); + zfs_rs_set_end(rs, rt, start); + zfs_rs_copy(rs, &rs_tmp, rt); } else if (right_over) { // This modifies the buffer already inside the range tree - rs_set_start(rs, rt, end); - rs_copy(rs, &rs_tmp, rt); + zfs_rs_set_start(rs, rt, end); + zfs_rs_copy(rs, &rs_tmp, rt); } else { zfs_btree_remove_idx(&rt->rt_root, &where); rs = NULL; @@ -543,9 +546,9 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, * the size, since we do not support removing partial segments * of range trees with gaps. 
*/ - rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - - rs_get_start_raw(rs, rt)); - range_tree_stat_incr(rt, &rs_tmp); + zfs_zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) - + zfs_rs_get_start_raw(rs, rt)); + zfs_range_tree_stat_incr(rt, &rs_tmp); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); @@ -555,76 +558,78 @@ range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, } void -range_tree_remove(void *arg, uint64_t start, uint64_t size) +zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size) { - range_tree_remove_impl(arg, start, size, B_FALSE); + zfs_range_tree_remove_impl(arg, start, size, B_FALSE); } void -range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) +zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { - range_tree_remove_impl(rt, start, size, B_TRUE); + zfs_range_tree_remove_impl(rt, start, size, B_TRUE); } void -range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, +zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs, uint64_t newstart, uint64_t newsize) { - int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); + int64_t delta = newsize - (zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)); - range_tree_stat_decr(rt, rs); + zfs_range_tree_stat_decr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); - rs_set_start(rs, rt, newstart); - rs_set_end(rs, rt, newstart + newsize); + zfs_rs_set_start(rs, rt, newstart); + zfs_rs_set_end(rs, rt, newstart + newsize); - range_tree_stat_incr(rt, rs); + zfs_range_tree_stat_incr(rt, rs); if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); rt->rt_space += delta; } -static range_seg_t * -range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) +static zfs_range_seg_t * +zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); - rs_set_start(&rsearch, rt, start); - rs_set_end(&rsearch, rt, end); + zfs_rs_set_start(&rsearch, rt, start); + zfs_rs_set_end(&rsearch, rt, end); return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); } -range_seg_t * -range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) +zfs_range_seg_t * +zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { - if (rt->rt_type == RANGE_SEG64) + if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); - range_seg_t *rs = range_tree_find_impl(rt, start, size); - if (rs != NULL && rs_get_start(rs, rt) <= start && - rs_get_end(rs, rt) >= start + size) { + zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size); + if (rs != NULL && zfs_rs_get_start(rs, rt) <= start && + zfs_rs_get_end(rs, rt) >= start + size) { return (rs); } return (NULL); } void -range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) +zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off, + uint64_t size) { - range_seg_t *rs = range_tree_find(rt, off, size); + zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size); if (rs != NULL) panic("segment already in tree; rs=%p", (void *)rs); } boolean_t -range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) +zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { - return (range_tree_find(rt, start, size) != NULL); + return (zfs_range_tree_find(rt, start, 
size) != NULL); } /* @@ -633,31 +638,32 @@ range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) * isn't. */ boolean_t -range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, +zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size, uint64_t *ostart, uint64_t *osize) { - if (rt->rt_type == RANGE_SEG64) + if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); range_seg_max_t rsearch; - rs_set_start(&rsearch, rt, start); - rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); + zfs_rs_set_start(&rsearch, rt, start); + zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) + + 1); zfs_btree_index_t where; - range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); + zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); if (rs != NULL) { *ostart = start; - *osize = MIN(size, rs_get_end(rs, rt) - start); + *osize = MIN(size, zfs_rs_get_end(rs, rt) - start); return (B_TRUE); } rs = zfs_btree_next(&rt->rt_root, &where, &where); - if (rs == NULL || rs_get_start(rs, rt) > start + size) + if (rs == NULL || zfs_rs_get_start(rs, rt) > start + size) return (B_FALSE); - *ostart = rs_get_start(rs, rt); - *osize = MIN(start + size, rs_get_end(rs, rt)) - - rs_get_start(rs, rt); + *ostart = zfs_rs_get_start(rs, rt); + *osize = MIN(start + size, zfs_rs_get_end(rs, rt)) - + zfs_rs_get_start(rs, rt); return (B_TRUE); } @@ -666,29 +672,29 @@ range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, * it is currently in the tree. */ void -range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) +zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_t *rs; + zfs_range_seg_t *rs; if (size == 0) return; - if (rt->rt_type == RANGE_SEG64) + if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); - while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { - uint64_t free_start = MAX(rs_get_start(rs, rt), start); - uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); - range_tree_remove(rt, free_start, free_end - free_start); + while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) { + uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start); + uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size); + zfs_range_tree_remove(rt, free_start, free_end - free_start); } } void -range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) +zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst) { - range_tree_t *rt; + zfs_range_tree_t *rt; - ASSERT0(range_tree_space(*rtdst)); + ASSERT0(zfs_range_tree_space(*rtdst)); ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); rt = *rtsrc; @@ -697,19 +703,20 @@ range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) } void -range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) +zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, + void *arg) { if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); if (func != NULL) { - range_seg_t *rs; + zfs_range_seg_t *rs; zfs_btree_index_t *cookie = NULL; while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { - func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - - rs_get_start(rs, rt)); + func(arg, zfs_rs_get_start(rs, rt), + zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)); } } else { zfs_btree_clear(&rt->rt_root); @@ -720,39 +727,40 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) } void 
-range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) +zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func, + void *arg) { zfs_btree_index_t where; - for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); + for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { - func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - - rs_get_start(rs, rt)); + func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)); } } -range_seg_t * -range_tree_first(range_tree_t *rt) +zfs_range_seg_t * +zfs_range_tree_first(zfs_range_tree_t *rt) { return (zfs_btree_first(&rt->rt_root, NULL)); } uint64_t -range_tree_space(range_tree_t *rt) +zfs_range_tree_space(zfs_range_tree_t *rt) { return (rt->rt_space); } uint64_t -range_tree_numsegs(range_tree_t *rt) +zfs_range_tree_numsegs(zfs_range_tree_t *rt) { return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); } boolean_t -range_tree_is_empty(range_tree_t *rt) +zfs_range_tree_is_empty(zfs_range_tree_t *rt) { ASSERT(rt != NULL); - return (range_tree_space(rt) == 0); + return (zfs_range_tree_space(rt) == 0); } /* @@ -760,46 +768,46 @@ range_tree_is_empty(range_tree_t *rt) * from removefrom. Add non-overlapping leftovers to addto. */ void -range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, - range_tree_t *removefrom, range_tree_t *addto) +zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + zfs_range_tree_t *removefrom, zfs_range_tree_t *addto) { zfs_btree_index_t where; range_seg_max_t starting_rs; - rs_set_start(&starting_rs, removefrom, start); - rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, - removefrom) + 1); + zfs_rs_set_start(&starting_rs, removefrom, start); + zfs_rs_set_end_raw(&starting_rs, removefrom, + zfs_rs_get_start_raw(&starting_rs, removefrom) + 1); - range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, + zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, &starting_rs, &where); if (curr == NULL) curr = zfs_btree_next(&removefrom->rt_root, &where, &where); - range_seg_t *next; + zfs_range_seg_t *next; for (; curr != NULL; curr = next) { if (start == end) return; VERIFY3U(start, <, end); /* there is no overlap */ - if (end <= rs_get_start(curr, removefrom)) { - range_tree_add(addto, start, end - start); + if (end <= zfs_rs_get_start(curr, removefrom)) { + zfs_range_tree_add(addto, start, end - start); return; } - uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), + uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom), start); - uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), + uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom), end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); range_seg_max_t rs; - rs_copy(curr, &rs, removefrom); + zfs_rs_copy(curr, &rs, removefrom); - range_tree_remove(removefrom, overlap_start, overlap_size); + zfs_range_tree_remove(removefrom, overlap_start, overlap_size); if (start < overlap_start) - range_tree_add(addto, start, overlap_start - start); + zfs_range_tree_add(addto, start, overlap_start - start); start = overlap_end; next = zfs_btree_find(&removefrom->rt_root, &rs, &where); @@ -814,7 +822,7 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, * area to process. 
*/ if (next != NULL) { - ASSERT(start == end || start == rs_get_end(&rs, + ASSERT(start == end || start == zfs_rs_get_end(&rs, removefrom)); } @@ -824,7 +832,7 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, if (start != end) { VERIFY3U(start, <, end); - range_tree_add(addto, start, end - start); + zfs_range_tree_add(addto, start, end - start); } else { VERIFY3U(start, ==, end); } @@ -835,33 +843,33 @@ range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, * from removefrom. Otherwise, add it to addto. */ void -range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, - range_tree_t *addto) +zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt, + zfs_range_tree_t *removefrom, zfs_range_tree_t *addto) { zfs_btree_index_t where; - for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { - range_tree_remove_xor_add_segment(rs_get_start(rs, rt), - rs_get_end(rs, rt), removefrom, addto); + zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt), + zfs_rs_get_end(rs, rt), removefrom, addto); } } uint64_t -range_tree_min(range_tree_t *rt) +zfs_range_tree_min(zfs_range_tree_t *rt) { - range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); - return (rs != NULL ? rs_get_start(rs, rt) : 0); + zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); + return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0); } uint64_t -range_tree_max(range_tree_t *rt) +zfs_range_tree_max(zfs_range_tree_t *rt) { - range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); - return (rs != NULL ? rs_get_end(rs, rt) : 0); + zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); + return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0); } uint64_t -range_tree_span(range_tree_t *rt) +zfs_range_tree_span(zfs_range_tree_t *rt) { - return (range_tree_max(rt) - range_tree_min(rt)); + return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt)); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c9dfd7ac2e4d..54830b9536d9 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9861,7 +9861,7 @@ vdev_indirect_state_sync_verify(vdev_t *vd) * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); + ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments)); } /* diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 4c3721c159be..5fbf474b0ece 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -235,9 +235,9 @@ spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) * potentially save ourselves from future headaches. 
*/ mutex_enter(&ms->ms_lock); - if (range_tree_is_empty(ms->ms_freeing)) + if (zfs_range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); + zfs_range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index a95152608578..5eb4d043be41 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1108,11 +1108,11 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) switch (sme->sme_type) { case SM_ALLOC: - range_tree_remove_xor_add_segment(offset, offset + size, + zfs_range_tree_remove_xor_add_segment(offset, offset + size, ms->ms_unflushed_frees, ms->ms_unflushed_allocs); break; case SM_FREE: - range_tree_remove_xor_add_segment(offset, offset + size, + zfs_range_tree_remove_xor_add_segment(offset, offset + size, ms->ms_unflushed_allocs, ms->ms_unflushed_frees); break; default: @@ -1251,14 +1251,14 @@ spa_ld_log_sm_data(spa_t *spa) m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { mutex_enter(&m->ms_lock); m->ms_allocated_space = space_map_allocated(m->ms_sm) + - range_tree_space(m->ms_unflushed_allocs) - - range_tree_space(m->ms_unflushed_frees); + zfs_range_tree_space(m->ms_unflushed_allocs) - + zfs_range_tree_space(m->ms_unflushed_frees); vdev_t *vd = m->ms_group->mg_vd; metaslab_space_update(vd, m->ms_group->mg_class, - range_tree_space(m->ms_unflushed_allocs), 0, 0); + zfs_range_tree_space(m->ms_unflushed_allocs), 0, 0); metaslab_space_update(vd, m->ms_group->mg_class, - -range_tree_space(m->ms_unflushed_frees), 0, 0); + -zfs_range_tree_space(m->ms_unflushed_frees), 0, 0); ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); metaslab_recalculate_weight_and_sort(m); @@ -1317,8 +1317,8 @@ spa_ld_unflushed_txgs(vdev_t *vd) ms->ms_unflushed_txg = entry.msp_unflushed_txg; ms->ms_unflushed_dirty = B_FALSE; - ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(ms->ms_unflushed_frees)); + ASSERT(zfs_range_tree_is_empty(ms->ms_unflushed_allocs)); + ASSERT(zfs_range_tree_is_empty(ms->ms_unflushed_frees)); if (ms->ms_unflushed_txg != 0) { mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, ms); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index a336ff41eadb..e9e03e05c86a 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -393,7 +393,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, typedef struct space_map_load_arg { space_map_t *smla_sm; - range_tree_t *smla_rt; + zfs_range_tree_t *smla_rt; maptype_t smla_type; } space_map_load_arg_t; @@ -402,11 +402,13 @@ space_map_load_callback(space_map_entry_t *sme, void *arg) { space_map_load_arg_t *smla = arg; if (sme->sme_type == smla->smla_type) { - VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, + VERIFY3U(zfs_range_tree_space(smla->smla_rt) + sme->sme_run, <=, smla->smla_sm->sm_size); - range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); + zfs_range_tree_add(smla->smla_rt, sme->sme_offset, + sme->sme_run); } else { - range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); + zfs_range_tree_remove(smla->smla_rt, sme->sme_offset, + sme->sme_run); } return (0); @@ -417,15 +419,15 @@ space_map_load_callback(space_map_entry_t *sme, void *arg) * read the first 'length' bytes of the spacemap. 
*/ int -space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, +space_map_load_length(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype, uint64_t length) { space_map_load_arg_t smla; - VERIFY0(range_tree_space(rt)); + VERIFY0(zfs_range_tree_space(rt)); if (maptype == SM_FREE) - range_tree_add(rt, sm->sm_start, sm->sm_size); + zfs_range_tree_add(rt, sm->sm_start, sm->sm_size); smla.smla_rt = rt; smla.smla_sm = sm; @@ -434,7 +436,7 @@ space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, space_map_load_callback, &smla); if (err != 0) - range_tree_vacate(rt, NULL, NULL); + zfs_range_tree_vacate(rt, NULL, NULL); return (err); } @@ -444,7 +446,7 @@ space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * are added to the range tree, other segment types are removed. */ int -space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +space_map_load(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype) { return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); } @@ -460,7 +462,7 @@ space_map_histogram_clear(space_map_t *sm) } boolean_t -space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) +space_map_histogram_verify(space_map_t *sm, zfs_range_tree_t *rt) { /* * Verify that the in-core range tree does not have any @@ -474,7 +476,7 @@ space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) } void -space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) +space_map_histogram_add(space_map_t *sm, zfs_range_tree_t *rt, dmu_tx_t *tx) { int idx = 0; @@ -667,7 +669,7 @@ space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, * take effect. */ static void -space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, +space_map_write_impl(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; @@ -700,12 +702,12 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, zfs_btree_t *t = &rt->rt_root; zfs_btree_index_t where; - for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; + for (zfs_range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; rs = zfs_btree_next(t, &where, &where)) { - uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> - sm->sm_shift; - uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> + uint64_t offset = (zfs_rs_get_start(rs, rt) - sm->sm_start) >> sm->sm_shift; + uint64_t length = (zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt)) >> sm->sm_shift; uint8_t words = 1; /* @@ -730,8 +732,9 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, random_in_range(100) == 0))) words = 2; - space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, - rt), maptype, vdev_id, words, &db, FTAG, tx); + space_map_write_seg(sm, zfs_rs_get_start(rs, rt), + zfs_rs_get_end(rs, rt), maptype, vdev_id, words, &db, + FTAG, tx); } dmu_buf_rele(db, FTAG); @@ -753,7 +756,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * for synchronizing writes to the space map. 
*/ void -space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, +space_map_write(space_map_t *sm, zfs_range_tree_t *rt, maptype_t maptype, uint64_t vdev_id, dmu_tx_t *tx) { ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os))); @@ -768,18 +771,18 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, */ sm->sm_phys->smp_object = sm->sm_object; - if (range_tree_is_empty(rt)) { + if (zfs_range_tree_is_empty(rt)) { VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); return; } if (maptype == SM_ALLOC) - sm->sm_phys->smp_alloc += range_tree_space(rt); + sm->sm_phys->smp_alloc += zfs_range_tree_space(rt); else - sm->sm_phys->smp_alloc -= range_tree_space(rt); + sm->sm_phys->smp_alloc -= zfs_range_tree_space(rt); uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); - uint64_t rt_space = range_tree_space(rt); + uint64_t rt_space = zfs_range_tree_space(rt); space_map_write_impl(sm, rt, maptype, vdev_id, tx); @@ -788,7 +791,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * while we were in the middle of writing it out. */ VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); - VERIFY3U(range_tree_space(rt), ==, rt_space); + VERIFY3U(zfs_range_tree_space(rt), ==, rt_space); } static int @@ -960,7 +963,7 @@ space_map_free(space_map_t *sm, dmu_tx_t *tx) * the given space map. */ uint64_t -space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, +space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt, uint64_t vdev_id) { spa_t *spa = dmu_objset_spa(sm->sm_os); diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c index ee11e162dd5b..baa741395e0c 100644 --- a/module/zfs/space_reftree.c +++ b/module/zfs/space_reftree.c @@ -107,14 +107,14 @@ space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, * Convert (or add) a range tree into a reference tree. */ void -space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) +space_reftree_add_map(avl_tree_t *t, zfs_range_tree_t *rt, int64_t refcnt) { zfs_btree_index_t where; - for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = - zfs_btree_next(&rt->rt_root, &where, &where)) { - space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, - rt), refcnt); + for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + space_reftree_add_seg(t, zfs_rs_get_start(rs, rt), + zfs_rs_get_end(rs, rt), refcnt); } } @@ -123,13 +123,13 @@ space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) * all members of the reference tree for which refcnt >= minref. 
*/ void -space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) +space_reftree_generate_map(avl_tree_t *t, zfs_range_tree_t *rt, int64_t minref) { uint64_t start = -1ULL; int64_t refcnt = 0; space_ref_t *sr; - range_tree_vacate(rt, NULL, NULL); + zfs_range_tree_vacate(rt, NULL, NULL); for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { refcnt += sr->sr_refcnt; @@ -142,7 +142,8 @@ space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) uint64_t end = sr->sr_offset; ASSERT(start <= end); if (end > start) - range_tree_add(rt, start, end - start); + zfs_range_tree_add(rt, start, end - + start); start = -1ULL; } } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 310319fdb052..92b001aabf7d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -677,8 +677,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, - 0, 0); + vd->vdev_obsolete_segments = zfs_range_tree_create(NULL, + ZFS_RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -732,8 +732,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, - 0); + vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, @@ -1155,8 +1155,8 @@ vdev_free(vdev_t *vd) mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { - range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); - range_tree_destroy(vd->vdev_dtl[t]); + zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); + zfs_range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); @@ -1173,7 +1173,7 @@ vdev_free(vdev_t *vd) space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } - range_tree_destroy(vd->vdev_obsolete_segments); + zfs_range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); @@ -1283,7 +1283,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; - range_tree_swap(&svd->vdev_obsolete_segments, + zfs_range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; @@ -2969,22 +2969,22 @@ vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - range_tree_t *rt = vd->vdev_dtl[t]; + zfs_range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_contains(rt, txg, size)) - range_tree_add(rt, txg, size); + if (!zfs_range_tree_contains(rt, txg, size)) + zfs_range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - range_tree_t *rt = vd->vdev_dtl[t]; + zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < 
DTL_TYPES); @@ -2999,8 +2999,8 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_is_empty(rt)) - dirty = range_tree_contains(rt, txg, size); + if (!zfs_range_tree_is_empty(rt)) + dirty = zfs_range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); @@ -3009,11 +3009,11 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { - range_tree_t *rt = vd->vdev_dtl[t]; + zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); - empty = range_tree_is_empty(rt); + empty = zfs_range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); @@ -3060,10 +3060,10 @@ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); - ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); + ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); + return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* @@ -3073,10 +3073,10 @@ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); - ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); + ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); - return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); + return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* @@ -3098,7 +3098,7 @@ vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) if (vd->vdev_resilver_deferred) return (B_FALSE); - if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) + if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { @@ -3187,7 +3187,7 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, } if (scrub_txg != 0 && - !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", @@ -3243,7 +3243,8 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); - if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + if (!zfs_range_tree_is_empty( + vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); @@ -3251,12 +3252,13 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, zfs_dbgmsg("DTL_MISSING is now empty"); } } - range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); - range_tree_walk(vd->vdev_dtl[DTL_MISSING], - range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); + zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], + zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) - range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); - range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, + NULL); + zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev @@ -3267,10 +3269,10 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { - 
range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { - range_tree_walk(vd->vdev_dtl[DTL_MISSING], - range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); + zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], + zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* @@ -3279,8 +3281,8 @@ vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, * the top level so that we persist the change. */ if (txg != 0 && - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && - range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { + zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); @@ -3374,7 +3376,7 @@ vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - range_tree_t *rt; + zfs_range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { @@ -3392,17 +3394,17 @@ vdev_dtl_load(vdev_t *vd) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); - range_tree_walk(rt, range_tree_add, + zfs_range_tree_walk(rt, zfs_range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } - range_tree_vacate(rt, NULL, NULL); - range_tree_destroy(rt); + zfs_range_tree_vacate(rt, NULL, NULL); + zfs_range_tree_destroy(rt); return (error); } @@ -3496,9 +3498,9 @@ static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; + zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; - range_tree_t *rtsync; + zfs_range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); @@ -3540,17 +3542,17 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); - range_tree_walk(rt, range_tree_add, rtsync); + zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(rtsync, NULL, NULL); + zfs_range_tree_vacate(rtsync, NULL, NULL); - range_tree_destroy(rtsync); + zfs_range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty @@ -3620,7 +3622,7 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); @@ -4064,7 +4066,7 @@ vdev_sync(vdev_t *vd, uint64_t txg) ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - if (range_tree_space(vd->vdev_obsolete_segments) > 0) { + if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index cd24f97ae7cd..46c1fed6d2c6 100644 --- a/module/zfs/vdev_indirect.c +++ 
b/module/zfs/vdev_indirect.c @@ -333,7 +333,7 @@ vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); - range_tree_add(vd->vdev_obsolete_segments, offset, size); + zfs_range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } @@ -816,7 +816,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); - ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); + ASSERT(zfs_range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); @@ -845,7 +845,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); - range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); + zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 0a7323f58df2..008e014ecfdc 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -330,13 +330,14 @@ vdev_initialize_block_free(abd_t *data) static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { - range_tree_t *rt = vd->vdev_initialize_tree; + zfs_range_tree_t *rt = vd->vdev_initialize_tree; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; - for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; + for (zfs_range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; rs = zfs_btree_next(bt, &where, &where)) { - uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + uint64_t size = zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ uint64_t writes_required = @@ -346,7 +347,7 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) int error; error = vdev_initialize_write(vd, - VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + + VDEV_LABEL_START_SIZE + zfs_rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); @@ -440,13 +441,13 @@ vdev_initialize_calculate_progress(vdev_t *vd) VERIFY0(metaslab_load(msp)); zfs_btree_index_t where; - range_tree_t *rt = msp->ms_allocatable; - for (range_seg_t *rs = + zfs_range_tree_t *rt = msp->ms_allocatable; + for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { - logical_rs.rs_start = rs_get_start(rs, rt); - logical_rs.rs_end = rs_get_end(rs, rt); + logical_rs.rs_start = zfs_rs_get_start(rs, rt); + logical_rs.rs_end = zfs_rs_get_end(rs, rt); vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_progress, vd); @@ -503,7 +504,7 @@ vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); - range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, + zfs_range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, physical_rs->rs_end - physical_rs->rs_start); } @@ -539,8 +540,8 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, - 0, 0); + vd->vdev_initialize_tree = zfs_range_tree_create(NULL, 
ZFS_RANGE_SEG64, + NULL, 0, 0); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { @@ -563,15 +564,15 @@ vdev_initialize_thread(void *arg) unload_when_done = B_TRUE; VERIFY0(metaslab_load(msp)); - range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, - vd); + zfs_range_tree_walk(msp->ms_allocatable, + vdev_initialize_range_add, vd); mutex_exit(&msp->ms_lock); error = vdev_initialize_ranges(vd, deadbeef); metaslab_enable(msp, B_TRUE, unload_when_done); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); + zfs_range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); if (error != 0) break; } @@ -584,7 +585,7 @@ vdev_initialize_thread(void *arg) } mutex_exit(&vd->vdev_initialize_io_lock); - range_tree_destroy(vd->vdev_initialize_tree); + zfs_range_tree_destroy(vd->vdev_initialize_tree); vdev_initialize_block_free(deadbeef); vd->vdev_initialize_tree = NULL; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 6103f780e6bc..6bac2241c6d8 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -3953,18 +3953,18 @@ vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) } static boolean_t -raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint_t ashift = vd->vdev_top->vdev_ashift; - range_seg_t *rs = range_tree_first(rt); + zfs_range_seg_t *rs = zfs_range_tree_first(rt); if (rt == NULL) return (B_FALSE); - uint64_t offset = rs_get_start(rs, rt); + uint64_t offset = zfs_rs_get_start(rs, rt); ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); - uint64_t size = rs_get_end(rs, rt) - offset; + uint64_t size = zfs_rs_get_end(rs, rt) - offset; ASSERT3U(size, >=, 1 << ashift); ASSERT(IS_P2ALIGNED(size, 1 << ashift)); @@ -4001,7 +4001,7 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid); size = (uint64_t)blocks << ashift; - range_tree_remove(rt, offset, size); + zfs_range_tree_remove(rt, offset, size); uint_t reads = MIN(blocks, old_children); uint_t writes = MIN(blocks, vd->vdev_children); @@ -4553,12 +4553,13 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) * space (e.g. in ms_defer), and it's fine to copy that too. */ uint64_t shift, start; - range_seg_type_t type = metaslab_calculate_range_tree_type( + zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( raidvd, msp, &start, &shift); - range_tree_t *rt = range_tree_create(NULL, type, NULL, + zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, start, shift); - range_tree_add(rt, msp->ms_start, msp->ms_size); - range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); + zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, + rt); mutex_exit(&msp->ms_lock); /* @@ -4572,8 +4573,8 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) int sectorsz = 1 << raidvd->vdev_ashift; uint64_t ms_last_offset = msp->ms_start + msp->ms_size - sectorsz; - if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { - range_tree_add(rt, ms_last_offset, sectorsz); + if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) { + zfs_range_tree_add(rt, ms_last_offset, sectorsz); } /* @@ -4582,12 +4583,12 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) * discard any state that we have already processed. 
*/ if (vre->vre_offset > msp->ms_start) { - range_tree_clear(rt, msp->ms_start, + zfs_range_tree_clear(rt, msp->ms_start, vre->vre_offset - msp->ms_start); } while (!zthr_iscancelled(zthr) && - !range_tree_is_empty(rt) && + !zfs_range_tree_is_empty(rt) && vre->vre_failed_offset == UINT64_MAX) { /* @@ -4649,8 +4650,8 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_enable(msp, B_FALSE, B_FALSE); - range_tree_vacate(rt, NULL, NULL); - range_tree_destroy(rt); + zfs_range_tree_vacate(rt, NULL, NULL); + zfs_range_tree_destroy(rt); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index f80ed1b401f9..7ca1b1f846b6 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -641,10 +641,10 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr) zfs_btree_index_t idx; int error; - for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; rs = zfs_btree_next(t, &idx, &idx)) { - uint64_t start = rs_get_start(rs, vr->vr_scan_tree); - uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; + uint64_t start = zfs_rs_get_start(rs, vr->vr_scan_tree); + uint64_t size = zfs_rs_get_end(rs, vr->vr_scan_tree) - start; /* * zfs_scan_suspend_progress can be set to disable rebuild @@ -786,7 +786,8 @@ vdev_rebuild_thread(void *arg) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; - vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, + 0, 0); mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); @@ -833,7 +834,7 @@ vdev_rebuild_thread(void *arg) break; } - ASSERT0(range_tree_space(vr->vr_scan_tree)); + ASSERT0(zfs_range_tree_space(vr->vr_scan_tree)); /* Disable any new allocations to this metaslab */ spa_config_exit(spa, SCL_CONFIG, FTAG); @@ -848,7 +849,7 @@ vdev_rebuild_thread(void *arg) * on disk and therefore will be rebuilt. */ for (int j = 0; j < TXG_SIZE; j++) { - if (range_tree_space(msp->ms_allocating[j])) { + if (zfs_range_tree_space(msp->ms_allocating[j])) { mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); txg_wait_synced(dsl, 0); @@ -869,21 +870,21 @@ vdev_rebuild_thread(void *arg) vr->vr_scan_tree, SM_ALLOC)); for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(range_tree_space( + ASSERT0(zfs_range_tree_space( msp->ms_allocating[i])); } - range_tree_walk(msp->ms_unflushed_allocs, - range_tree_add, vr->vr_scan_tree); - range_tree_walk(msp->ms_unflushed_frees, - range_tree_remove, vr->vr_scan_tree); + zfs_range_tree_walk(msp->ms_unflushed_allocs, + zfs_range_tree_add, vr->vr_scan_tree); + zfs_range_tree_walk(msp->ms_unflushed_frees, + zfs_range_tree_remove, vr->vr_scan_tree); /* * Remove ranges which have already been rebuilt based * on the last offset. This can happen when restarting * a scan after exporting and re-importing the pool. */ - range_tree_clear(vr->vr_scan_tree, 0, + zfs_range_tree_clear(vr->vr_scan_tree, 0, vrp->vrp_last_offset); } @@ -904,7 +905,7 @@ vdev_rebuild_thread(void *arg) * Walk the allocated space map and issue the rebuild I/O. 
*/ error = vdev_rebuild_ranges(vr); - range_tree_vacate(vr->vr_scan_tree, NULL, NULL); + zfs_range_tree_vacate(vr->vr_scan_tree, NULL, NULL); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); metaslab_enable(msp, B_FALSE, B_FALSE); @@ -913,7 +914,7 @@ vdev_rebuild_thread(void *arg) break; } - range_tree_destroy(vr->vr_scan_tree); + zfs_range_tree_destroy(vr->vr_scan_tree); spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 08c85a874803..e1819448a98a 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -369,12 +369,13 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, - 0, 0); + svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -389,11 +390,11 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr) for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); ASSERT0(svr->svr_max_offset_to_sync[i]); - range_tree_destroy(svr->svr_frees[i]); + zfs_range_tree_destroy(svr->svr_frees[i]); list_destroy(&svr->svr_new_segments[i]); } - range_tree_destroy(svr->svr_allocd_segs); + zfs_range_tree_destroy(svr->svr_allocd_segs); mutex_destroy(&svr->svr_lock); cv_destroy(&svr->svr_cv); kmem_free(svr, sizeof (*svr)); @@ -475,11 +476,11 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) * be copied. */ spa->spa_removing_phys.sr_to_copy -= - range_tree_space(ms->ms_freeing); + zfs_range_tree_space(ms->ms_freeing); - ASSERT0(range_tree_space(ms->ms_freed)); + ASSERT0(zfs_range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) - ASSERT0(range_tree_space(ms->ms_allocating[t])); + ASSERT0(zfs_range_tree_space(ms->ms_allocating[t])); } /* @@ -770,7 +771,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) * completed the copy and synced the mapping (see * vdev_mapping_sync). */ - range_tree_add(svr->svr_frees[txgoff], + zfs_range_tree_add(svr->svr_frees[txgoff], offset, inflight_size); size -= inflight_size; offset += inflight_size; @@ -806,7 +807,8 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) uint64_t, size); if (svr->svr_allocd_segs != NULL) - range_tree_clear(svr->svr_allocd_segs, offset, size); + zfs_range_tree_clear(svr->svr_allocd_segs, offset, + size); /* * Since we now do not need to copy this data, for @@ -915,7 +917,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) * mapping entries were in flight. 
*/ mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_frees[txg & TXG_MASK], + zfs_range_tree_vacate(svr->svr_frees[txg & TXG_MASK], free_mapped_segment_cb, vd); ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, vdev_indirect_mapping_max_offset(vim)); @@ -929,7 +931,7 @@ typedef struct vdev_copy_segment_arg { spa_t *vcsa_spa; dva_t *vcsa_dest_dva; uint64_t vcsa_txg; - range_tree_t *vcsa_obsolete_segs; + zfs_range_tree_t *vcsa_obsolete_segs; } vdev_copy_segment_arg_t; static void @@ -966,9 +968,9 @@ spa_vdev_copy_segment_done(zio_t *zio) { vdev_copy_segment_arg_t *vcsa = zio->io_private; - range_tree_vacate(vcsa->vcsa_obsolete_segs, + zfs_range_tree_vacate(vcsa->vcsa_obsolete_segs, unalloc_seg, vcsa); - range_tree_destroy(vcsa->vcsa_obsolete_segs); + zfs_range_tree_destroy(vcsa->vcsa_obsolete_segs); kmem_free(vcsa, sizeof (*vcsa)); spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); @@ -1119,7 +1121,7 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, * read from the old location and write to the new location. */ static int -spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, +spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { @@ -1128,14 +1130,14 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; - uint64_t start = range_tree_min(segs); + uint64_t start = zfs_range_tree_min(segs); ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); - uint64_t size = range_tree_span(segs); - if (range_tree_span(segs) > maxalloc) { + uint64_t size = zfs_range_tree_span(segs); + if (zfs_range_tree_span(segs) > maxalloc) { /* * We can't allocate all the segments. Prefer to end * the allocation at the end of a segment, thus avoiding @@ -1143,13 +1145,13 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, */ range_seg_max_t search; zfs_btree_index_t where; - rs_set_start(&search, segs, start + maxalloc); - rs_set_end(&search, segs, start + maxalloc); + zfs_rs_set_start(&search, segs, start + maxalloc); + zfs_rs_set_end(&search, segs, start + maxalloc); (void) zfs_btree_find(&segs->rt_root, &search, &where); - range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, + zfs_range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where); if (rs != NULL) { - size = rs_get_end(rs, segs) - start; + size = zfs_rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. @@ -1182,27 +1184,27 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). 
*/ - range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, - 0, 0); + zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL, + ZFS_RANGE_SEG64, NULL, 0, 0); zfs_btree_index_t where; - range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); - ASSERT3U(rs_get_start(rs, segs), ==, start); - uint64_t prev_seg_end = rs_get_end(rs, segs); + zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); + ASSERT3U(zfs_rs_get_start(rs, segs), ==, start); + uint64_t prev_seg_end = zfs_rs_get_end(rs, segs); while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { - if (rs_get_start(rs, segs) >= start + size) { + if (zfs_rs_get_start(rs, segs) >= start + size) { break; } else { - range_tree_add(obsolete_segs, + zfs_range_tree_add(obsolete_segs, prev_seg_end - start, - rs_get_start(rs, segs) - prev_seg_end); + zfs_rs_get_start(rs, segs) - prev_seg_end); } - prev_seg_end = rs_get_end(rs, segs); + prev_seg_end = zfs_rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); - range_tree_clear(segs, start, size); + zfs_range_tree_clear(segs, start, size); /* * We can't have any padding of the allocated size, otherwise we will @@ -1216,7 +1218,8 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { - entry->vime_obsolete_count = range_tree_space(obsolete_segs); + entry->vime_obsolete_count = + zfs_range_tree_space(obsolete_segs); } vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); @@ -1455,30 +1458,31 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ - range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); for (;;) { - range_tree_t *rt = svr->svr_allocd_segs; - range_seg_t *rs = range_tree_first(rt); + zfs_range_tree_t *rt = svr->svr_allocd_segs; + zfs_range_seg_t *rs = zfs_range_tree_first(rt); if (rs == NULL) break; uint64_t seg_length; - if (range_tree_is_empty(segs)) { + if (zfs_range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ - seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, - rt), *max_alloc); + seg_length = MIN(zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt), *max_alloc); } else { - if (rs_get_start(rs, rt) - range_tree_max(segs) > - vdev_removal_max_span) { + if (zfs_rs_get_start(rs, rt) - zfs_range_tree_max(segs) + > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. */ break; - } else if (rs_get_end(rs, rt) - range_tree_min(segs) > - *max_alloc) { + } else if (zfs_rs_get_end(rs, rt) - + zfs_range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past * max_alloc. 
Rather than splitting this @@ -1486,19 +1490,19 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, */ break; } else { - seg_length = rs_get_end(rs, rt) - - rs_get_start(rs, rt); + seg_length = zfs_rs_get_end(rs, rt) - + zfs_rs_get_start(rs, rt); } } - range_tree_add(segs, rs_get_start(rs, rt), seg_length); - range_tree_remove(svr->svr_allocd_segs, - rs_get_start(rs, rt), seg_length); + zfs_range_tree_add(segs, zfs_rs_get_start(rs, rt), seg_length); + zfs_range_tree_remove(svr->svr_allocd_segs, + zfs_rs_get_start(rs, rt), seg_length); } - if (range_tree_is_empty(segs)) { + if (zfs_range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); - range_tree_destroy(segs); + zfs_range_tree_destroy(segs); return; } @@ -1507,20 +1511,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, svr, tx); } - svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); + svr->svr_max_offset_to_sync[txg & TXG_MASK] = zfs_range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ - svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); + svr->svr_bytes_done[txg & TXG_MASK] += zfs_range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); uint64_t thismax = SPA_MAXBLOCKSIZE; - while (!range_tree_is_empty(segs)) { + while (!zfs_range_tree_is_empty(segs)) { int error = spa_vdev_copy_segment(vd, segs, thismax, txg, vca, &zal); @@ -1537,7 +1541,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); uint64_t attempted = - MIN(range_tree_span(segs), thismax); + MIN(zfs_range_tree_span(segs), thismax); thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); /* @@ -1557,7 +1561,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, } } metaslab_trace_fini(&zal); - range_tree_destroy(segs); + zfs_range_tree_destroy(segs); } /* @@ -1628,7 +1632,7 @@ spa_vdev_remove_thread(void *arg) metaslab_t *msp = vd->vdev_ms[msi]; ASSERT3U(msi, <=, vd->vdev_ms_count); - ASSERT0(range_tree_space(svr->svr_allocd_segs)); + ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); @@ -1637,7 +1641,7 @@ spa_vdev_remove_thread(void *arg) * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) { - ASSERT0(range_tree_space(msp->ms_allocating[i])); + ASSERT0(zfs_range_tree_space(msp->ms_allocating[i])); } /* @@ -1653,19 +1657,20 @@ spa_vdev_remove_thread(void *arg) VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); - range_tree_walk(msp->ms_unflushed_allocs, - range_tree_add, svr->svr_allocd_segs); - range_tree_walk(msp->ms_unflushed_frees, - range_tree_remove, svr->svr_allocd_segs); - range_tree_walk(msp->ms_freeing, - range_tree_remove, svr->svr_allocd_segs); + zfs_range_tree_walk(msp->ms_unflushed_allocs, + zfs_range_tree_add, svr->svr_allocd_segs); + zfs_range_tree_walk(msp->ms_unflushed_frees, + zfs_range_tree_remove, svr->svr_allocd_segs); + zfs_range_tree_walk(msp->ms_freeing, + zfs_range_tree_remove, svr->svr_allocd_segs); /* * When we are resuming from a paused removal (i.e. * when importing a pool with a removal in progress), * discard any state that we have already processed. 
*/ - range_tree_clear(svr->svr_allocd_segs, 0, start_offset); + zfs_range_tree_clear(svr->svr_allocd_segs, 0, + start_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); @@ -1677,7 +1682,7 @@ spa_vdev_remove_thread(void *arg) (u_longlong_t)msp->ms_id); while (!svr->svr_thread_exit && - !range_tree_is_empty(svr->svr_allocd_segs)) { + !zfs_range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); @@ -1756,7 +1761,7 @@ spa_vdev_remove_thread(void *arg) if (svr->svr_thread_exit) { mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); + zfs_range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); @@ -1776,7 +1781,7 @@ spa_vdev_remove_thread(void *arg) spa_vdev_remove_cancel_impl(spa); } } else { - ASSERT0(range_tree_space(svr->svr_allocd_segs)); + ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } @@ -1885,7 +1890,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; - ASSERT0(range_tree_space(svr->svr_allocd_segs)); + ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_lock); @@ -1893,22 +1898,22 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_allocating[i])); + ASSERT0(zfs_range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) - ASSERT0(range_tree_space(msp->ms_defer[i])); - ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(zfs_range_tree_space(msp->ms_defer[i])); + ASSERT0(zfs_range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); - range_tree_walk(msp->ms_unflushed_allocs, - range_tree_add, svr->svr_allocd_segs); - range_tree_walk(msp->ms_unflushed_frees, - range_tree_remove, svr->svr_allocd_segs); - range_tree_walk(msp->ms_freeing, - range_tree_remove, svr->svr_allocd_segs); + zfs_range_tree_walk(msp->ms_unflushed_allocs, + zfs_range_tree_add, svr->svr_allocd_segs); + zfs_range_tree_walk(msp->ms_unflushed_frees, + zfs_range_tree_remove, svr->svr_allocd_segs); + zfs_range_tree_walk(msp->ms_freeing, + zfs_range_tree_remove, svr->svr_allocd_segs); /* * Clear everything past what has been synced, @@ -1918,7 +1923,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > syncd) - range_tree_clear(svr->svr_allocd_segs, + zfs_range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); @@ -1926,7 +1931,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) mutex_exit(&msp->ms_lock); mutex_enter(&svr->svr_lock); - range_tree_vacate(svr->svr_allocd_segs, + zfs_range_tree_vacate(svr->svr_allocd_segs, free_mapped_segment_cb, vd); mutex_exit(&svr->svr_lock); } @@ -1935,7 +1940,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * Note: this must happen after we invoke free_mapped_segment_cb, * because it adds to the obsolete_segments. 
*/ - range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); + zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); ASSERT3U(vic->vic_mapping_object, ==, vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 9cf10332e8bf..d13753f81a69 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -149,7 +149,7 @@ typedef struct trim_args { */ vdev_t *trim_vdev; /* Leaf vdev to TRIM */ metaslab_t *trim_msp; /* Disabled metaslab */ - range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */ + zfs_range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */ trim_type_t trim_type; /* Manual or auto TRIM */ uint64_t trim_extent_bytes_max; /* Maximum TRIM I/O size */ uint64_t trim_extent_bytes_min; /* Minimum TRIM I/O size */ @@ -601,10 +601,10 @@ vdev_trim_ranges(trim_args_t *ta) ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; - for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; rs = zfs_btree_next(t, &idx, &idx)) { - uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, - ta->trim_tree); + uint64_t size = zfs_rs_get_end(rs, ta->trim_tree) - + zfs_rs_get_start(rs, ta->trim_tree); if (extent_bytes_min && size < extent_bytes_min) { spa_iostats_trim_add(spa, ta->trim_type, @@ -617,7 +617,7 @@ vdev_trim_ranges(trim_args_t *ta) for (uint64_t w = 0; w < writes_required; w++) { error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + - rs_get_start(rs, ta->trim_tree) + + zfs_rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { @@ -729,13 +729,13 @@ vdev_trim_calculate_progress(vdev_t *vd) */ VERIFY0(metaslab_load(msp)); - range_tree_t *rt = msp->ms_allocatable; + zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t idx; - for (range_seg_t *rs = zfs_btree_first(bt, &idx); + for (zfs_range_seg_t *rs = zfs_btree_first(bt, &idx); rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { - logical_rs.rs_start = rs_get_start(rs, rt); - logical_rs.rs_end = rs_get_end(rs, rt); + logical_rs.rs_start = zfs_rs_get_start(rs, rt); + logical_rs.rs_end = zfs_rs_get_end(rs, rt); vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_progress, vd); @@ -832,7 +832,7 @@ vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs) ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); - range_tree_add(ta->trim_tree, physical_rs->rs_start, + zfs_range_tree_add(ta->trim_tree, physical_rs->rs_start, physical_rs->rs_end - physical_rs->rs_start); } @@ -858,7 +858,8 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) metaslab_t *msp = ta->trim_msp; VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); - VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); + VERIFY(zfs_range_tree_contains(msp->ms_allocatable, start, + size)); } ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -900,7 +901,7 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -946,22 +947,23 @@ vdev_trim_thread(void *arg) } ta.trim_msp = msp; - range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta); - range_tree_vacate(msp->ms_trim, NULL, NULL); + 
zfs_range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, + &ta); + zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); mutex_exit(&msp->ms_lock); error = vdev_trim_ranges(&ta); metaslab_enable(msp, B_TRUE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - range_tree_vacate(ta.trim_tree, NULL, NULL); + zfs_range_tree_vacate(ta.trim_tree, NULL, NULL); if (error != 0) break; } spa_config_exit(spa, SCL_CONFIG, FTAG); - range_tree_destroy(ta.trim_tree); + zfs_range_tree_destroy(ta.trim_tree); mutex_enter(&vd->vdev_trim_lock); if (!vd->vdev_trim_exit_wanted) { @@ -1204,7 +1206,7 @@ vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); - VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); + VERIFY(zfs_range_tree_contains(msp->ms_allocatable, start, size)); } /* @@ -1261,7 +1263,7 @@ vdev_autotrim_thread(void *arg) for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count; i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; - range_tree_t *trim_tree; + zfs_range_tree_t *trim_tree; boolean_t issued_trim = B_FALSE; boolean_t wait_aborted = B_FALSE; @@ -1276,7 +1278,7 @@ vdev_autotrim_thread(void *arg) * or when there are no recent frees to trim. */ if (msp->ms_sm == NULL || - range_tree_is_empty(msp->ms_trim)) { + zfs_range_tree_is_empty(msp->ms_trim)) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; @@ -1302,10 +1304,10 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. */ - trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, - 0, 0); - range_tree_swap(&msp->ms_trim, &trim_tree); - ASSERT(range_tree_is_empty(msp->ms_trim)); + trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, + NULL, 0, 0); + zfs_range_tree_swap(&msp->ms_trim, &trim_tree); + ASSERT(zfs_range_tree_is_empty(msp->ms_trim)); /* * There are two cases when constructing the per-vdev @@ -1357,9 +1359,9 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = range_tree_create(NULL, - RANGE_SEG64, NULL, 0, 0); - range_tree_walk(trim_tree, + ta->trim_tree = zfs_range_tree_create(NULL, + ZFS_RANGE_SEG64, NULL, 0, 0); + zfs_range_tree_walk(trim_tree, vdev_trim_range_add, ta); } @@ -1406,13 +1408,13 @@ vdev_autotrim_thread(void *arg) mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); VERIFY3P(tap[0].trim_msp, ==, msp); - range_tree_walk(trim_tree, + zfs_range_tree_walk(trim_tree, vdev_trim_range_verify, &tap[0]); mutex_exit(&msp->ms_lock); } - range_tree_vacate(trim_tree, NULL, NULL); - range_tree_destroy(trim_tree); + zfs_range_tree_vacate(trim_tree, NULL, NULL); + zfs_range_tree_destroy(trim_tree); /* * Wait for couples of kicks, to ensure the trim io is @@ -1434,8 +1436,9 @@ vdev_autotrim_thread(void *arg) if (ta->trim_tree == NULL) continue; - range_tree_vacate(ta->trim_tree, NULL, NULL); - range_tree_destroy(ta->trim_tree); + zfs_range_tree_vacate(ta->trim_tree, NULL, + NULL); + zfs_range_tree_destroy(ta->trim_tree); } kmem_free(tap, sizeof (trim_args_t) * children); @@ -1474,7 +1477,7 @@ vdev_autotrim_thread(void *arg) metaslab_t *msp = vd->vdev_ms[i]; mutex_enter(&msp->ms_lock); - range_tree_vacate(msp->ms_trim, NULL, NULL); + zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); mutex_exit(&msp->ms_lock); } } @@ -1596,7 +1599,7 @@ vdev_trim_l2arc_thread(void *arg) vd->vdev_trim_secure = 0; ta.trim_vdev = vd; - ta.trim_tree = range_tree_create(NULL, 
RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; @@ -1606,7 +1609,7 @@ vdev_trim_l2arc_thread(void *arg) physical_rs.rs_end = vd->vdev_trim_bytes_est = vdev_get_min_asize(vd); - range_tree_add(ta.trim_tree, physical_rs.rs_start, + zfs_range_tree_add(ta.trim_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); mutex_enter(&vd->vdev_trim_lock); @@ -1622,8 +1625,8 @@ vdev_trim_l2arc_thread(void *arg) } mutex_exit(&vd->vdev_trim_io_lock); - range_tree_vacate(ta.trim_tree, NULL, NULL); - range_tree_destroy(ta.trim_tree); + zfs_range_tree_vacate(ta.trim_tree, NULL, NULL); + zfs_range_tree_destroy(ta.trim_tree); mutex_enter(&vd->vdev_trim_lock); if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { @@ -1731,7 +1734,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; - ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_SIMPLE; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; @@ -1740,7 +1743,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(ta.trim_tree, physical_rs.rs_start, + zfs_range_tree_add(ta.trim_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); } else { ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); @@ -1754,8 +1757,8 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) } mutex_exit(&vd->vdev_trim_io_lock); - range_tree_vacate(ta.trim_tree, NULL, NULL); - range_tree_destroy(ta.trim_tree); + zfs_range_tree_vacate(ta.trim_tree, NULL, NULL); + zfs_range_tree_destroy(ta.trim_tree); return (error); } From 1bdce0410c0b1afef846fe61b3a12ca2a1abdd77 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 11 Feb 2025 08:50:10 +1100 Subject: [PATCH 32/44] range_tree: convert remaining range_* defs to zfs_range_* Signed-off-by: Rob Norris Reviewed-by: Tony Hutter Reviewed-by: Rob Norris --- cmd/zdb/zdb.c | 7 ++--- include/sys/metaslab_impl.h | 4 +-- include/sys/range_tree.h | 50 ++++++++++++++++++------------------ include/sys/vdev.h | 10 ++++---- include/sys/vdev_impl.h | 8 +++--- module/zfs/dsl_scan.c | 10 ++++---- module/zfs/metaslab.c | 42 +++++++++++++++--------------- module/zfs/range_tree.c | 50 ++++++++++++++++++------------------ module/zfs/space_map.c | 6 ++--- module/zfs/vdev.c | 22 ++++++++-------- module/zfs/vdev_draid.c | 6 ++--- module/zfs/vdev_initialize.c | 10 ++++---- module/zfs/vdev_label.c | 3 ++- module/zfs/vdev_raidz.c | 6 ++--- module/zfs/vdev_removal.c | 2 +- module/zfs/vdev_trim.c | 14 +++++----- 16 files changed, 126 insertions(+), 124 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dd521257ccb2..5e8f282e96c3 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1646,7 +1646,7 @@ dump_metaslab_stats(metaslab_t *msp) "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); - dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } static void @@ -1769,7 +1769,8 @@ dump_metaslab_groups(spa_t *spa, boolean_t show_special) 
(void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } - dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + dump_histogram(mg->mg_histogram, + ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); @@ -1778,7 +1779,7 @@ dump_metaslab_groups(spa_t *spa, boolean_t show_special) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); - dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + dump_histogram(mc->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } static void diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index eae543731224..9c35f27ff0b4 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -200,7 +200,7 @@ struct metaslab_class { uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ - uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + uint64_t mc_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; /* * List of all loaded metaslabs in the class, sorted in order of most @@ -290,7 +290,7 @@ struct metaslab_group { uint64_t mg_allocations; uint64_t mg_failed_allocations; uint64_t mg_fragmentation; - uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + uint64_t mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; int mg_ms_disabled; boolean_t mg_disabled_updating; diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 4b0a3f2bfbb1..23eea3210c98 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -37,7 +37,7 @@ extern "C" { #endif -#define RANGE_TREE_HISTOGRAM_SIZE 64 +#define ZFS_RANGE_TREE_HISTOGRAM_SIZE 64 typedef struct zfs_range_tree_ops zfs_range_tree_ops_t; @@ -72,34 +72,34 @@ typedef struct zfs_range_tree { * rt_histogram[i], contains the number of ranges whose size is: * 2^i <= size of range in bytes < 2^(i+1) */ - uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + uint64_t rt_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE]; } zfs_range_tree_t; -typedef struct range_seg32 { +typedef struct zfs_range_seg32 { uint32_t rs_start; /* starting offset of this segment */ uint32_t rs_end; /* ending offset (non-inclusive) */ -} range_seg32_t; +} zfs_range_seg32_t; /* * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may * require 64-bit integers for ranges. */ -typedef struct range_seg64 { +typedef struct zfs_range_seg64 { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ -} range_seg64_t; +} zfs_range_seg64_t; -typedef struct range_seg_gap { +typedef struct zfs_range_seg_gap { uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ uint64_t rs_fill; /* actual fill if gap mode is on */ -} range_seg_gap_t; +} zfs_range_seg_gap_t; /* * This type needs to be the largest of the range segs, since it will be stack * allocated and then cast the actual type to do tree operations. 
*/ -typedef range_seg_gap_t range_seg_max_t; +typedef zfs_range_seg_gap_t zfs_range_seg_max_t; /* * This is just for clarity of code purposes, so we can make it clear that a @@ -122,11 +122,11 @@ zfs_rs_get_start_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: - return (((const range_seg32_t *)rs)->rs_start); + return (((const zfs_range_seg32_t *)rs)->rs_start); case ZFS_RANGE_SEG64: - return (((const range_seg64_t *)rs)->rs_start); + return (((const zfs_range_seg64_t *)rs)->rs_start); case ZFS_RANGE_SEG_GAP: - return (((const range_seg_gap_t *)rs)->rs_start); + return (((const zfs_range_seg_gap_t *)rs)->rs_start); default: VERIFY(0); return (0); @@ -139,11 +139,11 @@ zfs_rs_get_end_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: - return (((const range_seg32_t *)rs)->rs_end); + return (((const zfs_range_seg32_t *)rs)->rs_end); case ZFS_RANGE_SEG64: - return (((const range_seg64_t *)rs)->rs_end); + return (((const zfs_range_seg64_t *)rs)->rs_end); case ZFS_RANGE_SEG_GAP: - return (((const range_seg_gap_t *)rs)->rs_end); + return (((const zfs_range_seg_gap_t *)rs)->rs_end); default: VERIFY(0); return (0); @@ -156,15 +156,15 @@ zfs_rs_get_fill_raw(const zfs_range_seg_t *rs, const zfs_range_tree_t *rt) ASSERT3U(rt->rt_type, <=, ZFS_RANGE_SEG_NUM_TYPES); switch (rt->rt_type) { case ZFS_RANGE_SEG32: { - const range_seg32_t *r32 = (const range_seg32_t *)rs; + const zfs_range_seg32_t *r32 = (const zfs_range_seg32_t *)rs; return (r32->rs_end - r32->rs_start); } case ZFS_RANGE_SEG64: { - const range_seg64_t *r64 = (const range_seg64_t *)rs; + const zfs_range_seg64_t *r64 = (const zfs_range_seg64_t *)rs; return (r64->rs_end - r64->rs_start); } case ZFS_RANGE_SEG_GAP: - return (((const range_seg_gap_t *)rs)->rs_fill); + return (((const zfs_range_seg_gap_t *)rs)->rs_fill); default: VERIFY(0); return (0); @@ -197,13 +197,13 @@ zfs_rs_set_start_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t start) switch (rt->rt_type) { case ZFS_RANGE_SEG32: ASSERT3U(start, <=, UINT32_MAX); - ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + ((zfs_range_seg32_t *)rs)->rs_start = (uint32_t)start; break; case ZFS_RANGE_SEG64: - ((range_seg64_t *)rs)->rs_start = start; + ((zfs_range_seg64_t *)rs)->rs_start = start; break; case ZFS_RANGE_SEG_GAP: - ((range_seg_gap_t *)rs)->rs_start = start; + ((zfs_range_seg_gap_t *)rs)->rs_start = start; break; default: VERIFY(0); @@ -217,13 +217,13 @@ zfs_rs_set_end_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, uint64_t end) switch (rt->rt_type) { case ZFS_RANGE_SEG32: ASSERT3U(end, <=, UINT32_MAX); - ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + ((zfs_range_seg32_t *)rs)->rs_end = (uint32_t)end; break; case ZFS_RANGE_SEG64: - ((range_seg64_t *)rs)->rs_end = end; + ((zfs_range_seg64_t *)rs)->rs_end = end; break; case ZFS_RANGE_SEG_GAP: - ((range_seg_gap_t *)rs)->rs_end = end; + ((zfs_range_seg_gap_t *)rs)->rs_end = end; break; default: VERIFY(0); @@ -243,7 +243,7 @@ zfs_zfs_rs_set_fill_raw(zfs_range_seg_t *rs, zfs_range_tree_t *rt, zfs_rs_get_start_raw(rs, rt)); break; case ZFS_RANGE_SEG_GAP: - ((range_seg_gap_t *)rs)->rs_fill = fill; + ((zfs_range_seg_gap_t *)rs)->rs_fill = fill; break; default: VERIFY(0); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 38f62b07dc59..6ab7ac40bb07 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -106,12 
+106,12 @@ extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, const char *tag); -typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs); +typedef void vdev_xlate_func_t(void *arg, zfs_range_seg64_t *physical_rs); -extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs); -extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs); -extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, +extern boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs); +extern void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); +extern void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 6840ee78915e..315e2fc88410 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -91,8 +91,8 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, - range_seg64_t *physical, range_seg64_t *remain); +typedef void vdev_xlation_func_t(vdev_t *cvd, const zfs_range_seg64_t *logical, + zfs_range_seg64_t *physical, zfs_range_seg64_t *remain); typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, @@ -616,8 +616,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs); +extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index bc5c3cb9a670..5977f8c82b45 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1602,8 +1602,8 @@ dsl_scan_should_clear(dsl_scan_t *scn) * # of extents in exts_by_addr = # in exts_by_size. * B-tree efficiency is ~75%, but can be as low as 50%. 
*/ - mused += zfs_btree_numnodes(&queue->q_exts_by_size) * - ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) * + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * (( + sizeof (zfs_range_seg_gap_t) + sizeof (uint64_t)) * 3 / 2) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); @@ -5006,7 +5006,7 @@ ext_size_destroy(zfs_range_tree_t *rt, void *arg) } static uint64_t -ext_size_value(zfs_range_tree_t *rt, range_seg_gap_t *rsg) +ext_size_value(zfs_range_tree_t *rt, zfs_range_seg_gap_t *rsg) { (void) rt; uint64_t size = rsg->rs_end - rsg->rs_start; @@ -5021,7 +5021,7 @@ ext_size_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP); - uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs); zfs_btree_add(size_tree, &v); } @@ -5030,7 +5030,7 @@ ext_size_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg) { zfs_btree_t *size_tree = arg; ASSERT3U(rt->rt_type, ==, ZFS_RANGE_SEG_GAP); - uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); + uint64_t v = ext_size_value(rt, (zfs_range_seg_gap_t *)rs); zfs_btree_remove(size_tree, &v); } diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 10546798824a..e3c9afbd6e41 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -518,7 +518,7 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; - mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + mc_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); mutex_enter(&mc->mc_lock); @@ -538,16 +538,16 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) IMPLY(mg == mg->mg_vd->vdev_log_mg, mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); } mutex_exit(&mc->mc_lock); - kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); + kmem_free(mc_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } /* @@ -1029,10 +1029,10 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; - mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + mg_hist = kmem_zalloc(sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); - ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + ASSERT3U(ZFS_RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); mutex_enter(&mg->mg_lock); @@ -1049,12 +1049,12 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) } } - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); mutex_exit(&mg->mg_lock); - kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); + kmem_free(mg_hist, sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE); } static void @@ -1344,8 +1344,8 @@ __attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { - const range_seg32_t *r1 = x1; - const range_seg32_t *r2 = x2; + const zfs_range_seg32_t *r1 = x1; + const zfs_range_seg32_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - 
r2->rs_start; @@ -1363,8 +1363,8 @@ __attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { - const range_seg64_t *r1 = x1; - const range_seg64_t *r2 = x2; + const zfs_range_seg64_t *r1 = x1; + const zfs_range_seg64_t *r2 = x2; uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; @@ -1390,7 +1390,7 @@ metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) struct mssa_arg *mssap = arg; zfs_range_tree_t *rt = mssap->rt; metaslab_rt_arg_t *mrap = mssap->mra; - range_seg_max_t seg = {0}; + zfs_range_seg_max_t seg = {0}; zfs_rs_set_start(&seg, rt, start); zfs_rs_set_end(&seg, rt, start + size); metaslab_rt_add(rt, &seg, mrap); @@ -1411,10 +1411,10 @@ metaslab_size_tree_full_load(zfs_range_tree_t *rt) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, - range_seg32_t, metaslab_rangesize32_compare) + zfs_range_seg32_t, metaslab_rangesize32_compare) ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, - range_seg64_t, metaslab_rangesize64_compare) + zfs_range_seg64_t, metaslab_rangesize64_compare) /* * Create any block allocator specific components. The current allocators @@ -1432,12 +1432,12 @@ metaslab_rt_create(zfs_range_tree_t *rt, void *arg) bt_find_in_buf_f bt_find; switch (rt->rt_type) { case ZFS_RANGE_SEG32: - size = sizeof (range_seg32_t); + size = sizeof (zfs_range_seg32_t); compare = metaslab_rangesize32_compare; bt_find = metaslab_rt_find_rangesize32_in_buf; break; case ZFS_RANGE_SEG64: - size = sizeof (range_seg64_t); + size = sizeof (zfs_range_seg64_t); compare = metaslab_rangesize64_compare; bt_find = metaslab_rt_find_rangesize64_in_buf; break; @@ -1603,7 +1603,7 @@ metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start, uint64_t size, zfs_btree_index_t *where) { zfs_range_seg_t *rs; - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end(&rsearch, rt, start + size); @@ -1857,7 +1857,7 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_index_t where; zfs_range_seg_t *rs; - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; uint64_t hbit = highbit64(size); uint64_t *cursor = &msp->ms_lbas[hbit - 1]; uint64_t max_size = metaslab_largest_allocatable(msp); @@ -2035,7 +2035,7 @@ metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, * from the space map histogram. 
*/ int idx = 0; - for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT3U(i, >=, idx + shift); histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); @@ -3110,7 +3110,7 @@ metaslab_weight_from_range_tree(metaslab_t *msp) ASSERT(msp->ms_loaded); - for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; + for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; i--) { uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 3cbd5712e1d3..8bb9a0724e61 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -82,13 +82,13 @@ zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt) size_t size = 0; switch (rt->rt_type) { case ZFS_RANGE_SEG32: - size = sizeof (range_seg32_t); + size = sizeof (zfs_range_seg32_t); break; case ZFS_RANGE_SEG64: - size = sizeof (range_seg64_t); + size = sizeof (zfs_range_seg64_t); break; case ZFS_RANGE_SEG_GAP: - size = sizeof (range_seg_gap_t); + size = sizeof (zfs_range_seg_gap_t); break; default: __builtin_unreachable(); @@ -101,7 +101,7 @@ zfs_range_tree_stat_verify(zfs_range_tree_t *rt) { zfs_range_seg_t *rs; zfs_btree_index_t where; - uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; + uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; int i; for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; @@ -114,7 +114,7 @@ zfs_range_tree_stat_verify(zfs_range_tree_t *rt) ASSERT3U(hist[idx], !=, 0); } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { if (hist[i] != rt->rt_histogram[i]) { zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu", i, hist, (u_longlong_t)hist[i], @@ -156,8 +156,8 @@ __attribute__((always_inline)) inline static int zfs_range_tree_seg32_compare(const void *x1, const void *x2) { - const range_seg32_t *r1 = x1; - const range_seg32_t *r2 = x2; + const zfs_range_seg32_t *r1 = x1; + const zfs_range_seg32_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -169,8 +169,8 @@ __attribute__((always_inline)) inline static int zfs_range_tree_seg64_compare(const void *x1, const void *x2) { - const range_seg64_t *r1 = x1; - const range_seg64_t *r2 = x2; + const zfs_range_seg64_t *r1 = x1; + const zfs_range_seg64_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -182,8 +182,8 @@ __attribute__((always_inline)) inline static int zfs_range_tree_seg_gap_compare(const void *x1, const void *x2) { - const range_seg_gap_t *r1 = x1; - const range_seg_gap_t *r2 = x2; + const zfs_range_seg_gap_t *r1 = x1; + const zfs_range_seg_gap_t *r2 = x2; ASSERT3U(r1->rs_start, <=, r1->rs_end); ASSERT3U(r2->rs_start, <=, r2->rs_end); @@ -191,14 +191,14 @@ zfs_range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } -ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, range_seg32_t, +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t, zfs_range_tree_seg32_compare) -ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, range_seg64_t, +ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t, zfs_range_tree_seg64_compare) -ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, range_seg_gap_t, - zfs_range_tree_seg_gap_compare) 
+ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, + zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare) zfs_range_tree_t * zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, @@ -214,17 +214,17 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, bt_find_in_buf_f bt_find; switch (type) { case ZFS_RANGE_SEG32: - size = sizeof (range_seg32_t); + size = sizeof (zfs_range_seg32_t); compare = zfs_range_tree_seg32_compare; bt_find = zfs_range_tree_seg32_find_in_buf; break; case ZFS_RANGE_SEG64: - size = sizeof (range_seg64_t); + size = sizeof (zfs_range_seg64_t); compare = zfs_range_tree_seg64_compare; bt_find = zfs_range_tree_seg64_find_in_buf; break; case ZFS_RANGE_SEG_GAP: - size = sizeof (range_seg_gap_t); + size = sizeof (zfs_range_seg_gap_t); compare = zfs_range_tree_seg_gap_compare; bt_find = zfs_range_tree_seg_gap_find_in_buf; break; @@ -296,7 +296,7 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) zfs_range_tree_t *rt = arg; zfs_btree_index_t where; zfs_range_seg_t *rs_before, *rs_after, *rs; - range_seg_max_t tmp, rsearch; + zfs_range_seg_max_t tmp, rsearch; uint64_t end = start + size, gap = rt->rt_gap; uint64_t bridge_size = 0; boolean_t merge_before, merge_after; @@ -448,7 +448,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, { zfs_btree_index_t where; zfs_range_seg_t *rs; - range_seg_max_t rsearch, rs_tmp; + zfs_range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; boolean_t left_over, right_over; @@ -510,7 +510,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { - range_seg_max_t newseg; + zfs_range_seg_max_t newseg; zfs_rs_set_start(&newseg, rt, end); zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt)); zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end); @@ -593,7 +593,7 @@ zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs, static zfs_range_seg_t * zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size) { - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; uint64_t end = start + size; VERIFY(size != 0); @@ -644,7 +644,7 @@ zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size, if (rt->rt_type == ZFS_RANGE_SEG64) ASSERT3U(start + size, >, start); - range_seg_max_t rsearch; + zfs_range_seg_max_t rsearch; zfs_rs_set_start(&rsearch, rt, start); zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) + 1); @@ -772,7 +772,7 @@ zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, zfs_range_tree_t *removefrom, zfs_range_tree_t *addto) { zfs_btree_index_t where; - range_seg_max_t starting_rs; + zfs_range_seg_max_t starting_rs; zfs_rs_set_start(&starting_rs, removefrom, start); zfs_rs_set_end_raw(&starting_rs, removefrom, zfs_rs_get_start_raw(&starting_rs, removefrom) + 1); @@ -801,7 +801,7 @@ zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, end); uint64_t overlap_size = overlap_end - overlap_start; ASSERT3S(overlap_size, >, 0); - range_seg_max_t rs; + zfs_range_seg_max_t rs; zfs_rs_copy(curr, &rs, removefrom); zfs_range_tree_remove(removefrom, overlap_start, overlap_size); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index e9e03e05c86a..36e15b8d73af 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -497,7 +497,7 @@ space_map_histogram_add(space_map_t *sm, zfs_range_tree_t *rt, dmu_tx_t *tx) * map only cares 
about allocatable blocks (minimum of sm_shift) we * can safely ignore all ranges in the range tree smaller than sm_shift. */ - for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = sm->sm_shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { /* * Since the largest histogram bucket in the space map is @@ -1050,7 +1050,7 @@ space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt, size += histogram[idx] * entry_size; if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) { ASSERT3U(idx, >=, single_entry_max_bucket); entries_for_seg = 1ULL << (idx - single_entry_max_bucket); @@ -1067,7 +1067,7 @@ space_map_estimate_optimal_size(space_map_t *sm, zfs_range_tree_t *rt, for (; idx <= double_entry_max_bucket; idx++) size += histogram[idx] * 2 * sizeof (uint64_t); - for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + for (; idx < ZFS_RANGE_TREE_HISTOGRAM_SIZE; idx++) { ASSERT3U(idx, >=, double_entry_max_bucket); entries_for_seg = 1ULL << (idx - double_entry_max_bucket); size += histogram[idx] * diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 92b001aabf7d..b7def0b7eb14 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -294,8 +294,8 @@ vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) } void -vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; @@ -1677,7 +1677,7 @@ vdev_metaslab_fini(vdev_t *vd) vd->vdev_ms = NULL; vd->vdev_ms_count = 0; - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); @@ -5689,7 +5689,7 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) } boolean_t -vdev_xlate_is_empty(range_seg64_t *rs) +vdev_xlate_is_empty(zfs_range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } @@ -5703,8 +5703,8 @@ vdev_xlate_is_empty(range_seg64_t *rs) * specific translation function to do the real conversion. */ void -vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { /* * Walk up the vdev tree @@ -5736,7 +5736,7 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, * range into its physical and any remaining components by calling * the vdev specific translate function. 
*/ - range_seg64_t intermediate = { 0 }; + zfs_range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; @@ -5744,12 +5744,12 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, } void -vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, +vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { - range_seg64_t iter_rs = *logical_rs; - range_seg64_t physical_rs; - range_seg64_t remain_rs; + zfs_range_seg64_t iter_rs = *logical_rs; + zfs_range_seg64_t physical_rs; + zfs_range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 419c8ac5bb28..45f8bcfbd4ed 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1823,7 +1823,7 @@ static void vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG - range_seg64_t logical_rs, physical_rs, remain_rs; + zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_draid_asize(vd, rr->rr_size, 0); @@ -2080,8 +2080,8 @@ vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) } static void -vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_draid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_draid_ops); diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 008e014ecfdc..f6e2662bd40f 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -359,7 +359,7 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) } static void -vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +vdev_initialize_xlate_last_rs_end(void *arg, zfs_range_seg64_t *physical_rs) { uint64_t *last_rs_end = (uint64_t *)arg; @@ -368,7 +368,7 @@ vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) } static void -vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) +vdev_initialize_xlate_progress(void *arg, zfs_range_seg64_t *physical_rs) { vdev_t *vd = (vdev_t *)arg; @@ -407,7 +407,7 @@ vdev_initialize_calculate_progress(vdev_t *vd) * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs, remain_rs; + zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; @@ -481,7 +481,7 @@ vdev_initialize_load(vdev_t *vd) } static void -vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) +vdev_initialize_xlate_range_add(void *arg, zfs_range_seg64_t *physical_rs) { vdev_t *vd = arg; @@ -516,7 +516,7 @@ static void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg64_t logical_rs; + zfs_range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 9d12bc2eb0a2..2c4e0c1c4848 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -643,7 +643,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * will be combined with adjacent allocated segments * as a single mapping. 
*/ - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; + i++) { if (i + 1 < highbit64(vdev_removal_max_span) - 1) { to_alloc += diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 6bac2241c6d8..59225e766ba1 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2305,7 +2305,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) { (void) rm; #ifdef ZFS_DEBUG - range_seg64_t logical_rs, physical_rs, remain_rs; + zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, rr->rr_size, @@ -3650,8 +3650,8 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs, range_seg64_t *remain_rs) +vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs, + zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) remain_rs; diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index e1819448a98a..1970c5425854 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1143,7 +1143,7 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, * the allocation at the end of a segment, thus avoiding * additional split blocks. */ - range_seg_max_t search; + zfs_range_seg_max_t search; zfs_btree_index_t where; zfs_rs_set_start(&search, segs, start + maxalloc); zfs_rs_set_end(&search, segs, start + maxalloc); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index d13753f81a69..1ca0b23c0ee4 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -645,7 +645,7 @@ vdev_trim_ranges(trim_args_t *ta) } static void -vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +vdev_trim_xlate_last_rs_end(void *arg, zfs_range_seg64_t *physical_rs) { uint64_t *last_rs_end = (uint64_t *)arg; @@ -654,7 +654,7 @@ vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) } static void -vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs) +vdev_trim_xlate_progress(void *arg, zfs_range_seg64_t *physical_rs) { vdev_t *vd = (vdev_t *)arg; @@ -696,7 +696,7 @@ vdev_trim_calculate_progress(vdev_t *vd) * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
*/ - range_seg64_t logical_rs, physical_rs, remain_rs; + zfs_range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; @@ -807,7 +807,7 @@ vdev_trim_load(vdev_t *vd) } static void -vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs) +vdev_trim_xlate_range_add(void *arg, zfs_range_seg64_t *physical_rs) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; @@ -845,7 +845,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg64_t logical_rs; + zfs_range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -1588,7 +1588,7 @@ vdev_trim_l2arc_thread(void *arg) spa_t *spa = vd->vdev_spa; l2arc_dev_t *dev = l2arc_vdev_get(vd); trim_args_t ta = {0}; - range_seg64_t physical_rs; + zfs_range_seg64_t physical_rs; ASSERT(vdev_is_concrete(vd)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1722,7 +1722,7 @@ int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) { trim_args_t ta = {0}; - range_seg64_t physical_rs; + zfs_range_seg64_t physical_rs; int error; physical_rs.rs_start = start; physical_rs.rs_end = start + size; From f2ab5b82da20d1c49d1e8884cb79c4623916de1a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 18 Feb 2025 13:45:42 -0500 Subject: [PATCH 33/44] Fix metaslab group fragmentation math (#17037) Since we are calculating a free space fragmentation, we should weight metaslabs by the amount of their free space, not a full size. Fragmentation of full metaslabs may not matter in presence empty ones. The old algorithm did not differentiate metaslabs having only one free 4KB block from metaslabs having 50% of space free in 4KB blocks, reporting higher fragmentation. While there, move metaslab_group_alloc_update() call after setting mg_fragmentation, otherwise the effect may be delayed by one TXG. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Reviewed-by: Paul Dagnelie Reviewed-by: Tony Nguyen Reviewed-by: Tony Hutter --- module/zfs/metaslab.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index e3c9afbd6e41..35bd968f68ce 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1176,9 +1176,8 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* - * Calculate the fragmentation for a given metaslab group. We can use - * a simple average here since all metaslabs within the group must have - * the same size. The return value will be a value between 0 and 100 + * Calculate the fragmentation for a given metaslab group. Weight metaslabs + * on the amount of free space. The return value will be between 0 and 100 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this * group have a fragmentation metric. 
*/ @@ -1187,24 +1186,29 @@ metaslab_group_fragmentation(metaslab_group_t *mg) { vdev_t *vd = mg->mg_vd; uint64_t fragmentation = 0; - uint64_t valid_ms = 0; + uint64_t valid_ms = 0, total_ms = 0; + uint64_t free, total_free = 0; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp->ms_fragmentation == ZFS_FRAG_INVALID) - continue; if (msp->ms_group != mg) continue; + total_ms++; + if (msp->ms_fragmentation == ZFS_FRAG_INVALID) + continue; valid_ms++; - fragmentation += msp->ms_fragmentation; + free = (msp->ms_size - metaslab_allocated_space(msp)) / + SPA_MINBLOCKSIZE; /* To prevent overflows. */ + total_free += free; + fragmentation += msp->ms_fragmentation * free; } - if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) + if (valid_ms < (total_ms + 1) / 2 || total_free == 0) return (ZFS_FRAG_INVALID); - fragmentation /= valid_ms; + fragmentation /= total_free; ASSERT3U(fragmentation, <=, 100); return (fragmentation); } @@ -4469,8 +4473,8 @@ metaslab_sync_reassess(metaslab_group_t *mg) spa_t *spa = mg->mg_class->mc_spa; spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - metaslab_group_alloc_update(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); + metaslab_group_alloc_update(mg); /* * Preload the next potential metaslabs but only on active From af062c480cd973fd84f2f046b016ac91257b88c1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 21 Feb 2025 05:42:42 +1100 Subject: [PATCH 34/44] vdev_file: unify FreeBSD and Linux implementations (#17046) Kernel & userspace specifics are in zfs_file_os.c, so there's no particular reason these have to be separate. The one platform-specific part is in the Linux kernel part, to offload flushes to a taskq if we're already inside a filesystem transaction. This would be normally be an unsatisfying wart, but I'm intending to remove this shortly, so I'm content to leave it gated for the moment. Reviewed-by: Allan Jude Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- lib/libzpool/Makefile.am | 2 +- module/Kbuild.in | 2 +- module/Makefile.bsd | 2 +- module/os/freebsd/zfs/vdev_file.c | 342 -------------------------- module/{os/linux => }/zfs/vdev_file.c | 60 +++-- 5 files changed, 31 insertions(+), 377 deletions(-) delete mode 100644 module/os/freebsd/zfs/vdev_file.c rename module/{os/linux => }/zfs/vdev_file.c (95%) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 404b737c204d..8875393dcb22 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -46,7 +46,6 @@ nodist_libzpool_la_SOURCES = \ module/lua/lvm.c \ module/lua/lzio.c \ \ - module/os/linux/zfs/vdev_file.c \ module/os/linux/zfs/zio_crypt.c \ \ module/zcommon/cityhash.c \ @@ -143,6 +142,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/vdev.c \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ + module/zfs/vdev_file.c \ module/zfs/vdev_indirect.c \ module/zfs/vdev_indirect_births.c \ module/zfs/vdev_indirect_mapping.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 6fbc441fcaa9..7af3d98ceb81 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -386,6 +386,7 @@ ZFS_OBJS := \ vdev.o \ vdev_draid.o \ vdev_draid_rand.o \ + vdev_file.o \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ @@ -447,7 +448,6 @@ ZFS_OBJS_OS := \ spa_misc_os.o \ trace.o \ vdev_disk.o \ - vdev_file.o \ vdev_raidz.o \ vdev_label_os.o \ zfs_acl.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 61a664c5bf66..d71df1e5c00b 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -199,7 +199,6 @@ SRCS+= abd_os.c \ kmod_core.c \ spa_os.c \ sysctl_os.c \ - vdev_file.c \ vdev_geom.c \ vdev_label_os.c \ zfs_acl.c \ @@ -314,6 +313,7 @@ SRCS+= abd.c \ vdev.c \ vdev_draid.c \ vdev_draid_rand.c \ + vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ vdev_indirect_mapping.c \ diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c deleted file mode 100644 index 6719c87f82e5..000000000000 --- a/module/os/freebsd/zfs/vdev_file.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for files. 
- */ - -static taskq_t *vdev_file_taskq; - -static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; -static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; - -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), - minclsyspri, max_ncpus, INT_MAX, 0); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - -static void -vdev_file_hold(vdev_t *vd) -{ - ASSERT3P(vd->vdev_path, !=, NULL); -} - -static void -vdev_file_rele(vdev_t *vd) -{ - ASSERT3P(vd->vdev_path, !=, NULL); -} - -static mode_t -vdev_file_open_mode(spa_mode_t spa_mode) -{ - mode_t mode = 0; - - if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { - mode = O_RDWR; - } else if (spa_mode & SPA_MODE_READ) { - mode = O_RDONLY; - } else if (spa_mode & SPA_MODE_WRITE) { - mode = O_WRONLY; - } - - return (mode | O_LARGEFILE); -} - -static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - vdev_file_t *vf; - zfs_file_t *fp; - zfs_file_attr_t zfa; - int error; - - /* - * Rotational optimizations only make sense on block devices. - */ - vd->vdev_nonrot = B_TRUE; - - /* - * Allow TRIM on file based vdevs. This may not always be supported, - * since it depends on your kernel version and underlying filesystem - * type but it is always safe to attempt. - */ - vd->vdev_has_trim = B_TRUE; - - /* - * Disable secure TRIM on file based vdevs. There is no way to - * request this behavior from the underlying filesystem. - */ - vd->vdev_has_securetrim = B_FALSE; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. - */ - if (vd->vdev_tsd != NULL) { - ASSERT(vd->vdev_reopening); - vf = vd->vdev_tsd; - goto skip_open; - } - - vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); - - /* - * We always open the files from the root of the global zone, even if - * we're in a local zone. If the user has gotten to this point, the - * administrator has already decided that the pool should be available - * to local zone users, so the underlying devices should be as well. - */ - ASSERT3P(vd->vdev_path, !=, NULL); - ASSERT(vd->vdev_path[0] == '/'); - - error = zfs_file_open(vd->vdev_path, - vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - vf->vf_file = fp; - -#ifdef _KERNEL - /* - * Make sure it's a regular file. 
- */ - if (zfs_file_getattr(fp, &zfa)) { - return (SET_ERROR(ENODEV)); - } - if (!S_ISREG(zfa.zfa_mode)) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (SET_ERROR(ENODEV)); - } -#endif - -skip_open: - - error = zfs_file_getattr(vf->vf_file, &zfa); - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - *max_psize = *psize = zfa.zfa_size; - *logical_ashift = vdev_file_logical_ashift; - *physical_ashift = vdev_file_physical_ashift; - - return (0); -} - -static void -vdev_file_close(vdev_t *vd) -{ - vdev_file_t *vf = vd->vdev_tsd; - - if (vd->vdev_reopening || vf == NULL) - return; - - if (vf->vf_file != NULL) { - zfs_file_close(vf->vf_file); - } - - vd->vdev_delayed_close = B_FALSE; - kmem_free(vf, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -} - -/* - * Implements the interrupt side for file vdev types. This routine will be - * called when the I/O completes allowing us to transfer the I/O to the - * interrupt taskqs. For consistency, the code structure mimics disk vdev - * types. - */ -static void -vdev_file_io_intr(zio_t *zio) -{ - zio_delay_interrupt(zio); -} - -static void -vdev_file_io_strategy(void *arg) -{ - zio_t *zio = arg; - vdev_t *vd = zio->io_vd; - vdev_file_t *vf; - void *buf; - ssize_t resid; - loff_t off; - ssize_t size; - int err; - - off = zio->io_offset; - size = zio->io_size; - resid = 0; - - vf = vd->vdev_tsd; - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - if (zio->io_type == ZIO_TYPE_READ) { - buf = abd_borrow_buf(zio->io_abd, zio->io_size); - err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); - abd_return_buf_copy(zio->io_abd, buf, size); - } else { - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); - abd_return_buf(zio->io_abd, buf, size); - } - zio->io_error = err; - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; - - vdev_file_io_intr(zio); -} - -static void -vdev_file_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - - if (zio->io_type == ZIO_TYPE_FLUSH) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC); - - zio_execute(zio); - return; - } else if (zio->io_type == ZIO_TYPE_TRIM) { - ASSERT3U(zio->io_size, !=, 0); - zio->io_error = zfs_file_deallocate(vf->vf_file, - zio->io_offset, zio->io_size); - zio_execute(zio); - return; - } - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - zio->io_target_timestamp = zio_handle_io_delay(zio); - - VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, - TQ_SLEEP), !=, 0); -} - -static void -vdev_file_io_done(zio_t *zio) -{ - (void) zio; -} - -vdev_ops_t vdev_file_ops = { - .vdev_op_init = NULL, - .vdev_op_fini = NULL, - .vdev_op_open = vdev_file_open, - .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_min_asize = vdev_default_min_asize, - .vdev_op_min_alloc = NULL, - .vdev_op_io_start = vdev_file_io_start, - .vdev_op_io_done = vdev_file_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_file_hold, - .vdev_op_rele = vdev_file_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_rebuild_asize = NULL, - .vdev_op_metaslab_init = NULL, - .vdev_op_config_generate = NULL, - .vdev_op_nparity = NULL, - .vdev_op_ndisks = NULL, - .vdev_op_type = VDEV_TYPE_FILE, 
/* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -/* - * From userland we access disks just like files. - */ -#ifndef _KERNEL - -vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, - .vdev_op_fini = NULL, - .vdev_op_open = vdev_file_open, - .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_min_asize = vdev_default_min_asize, - .vdev_op_min_alloc = NULL, - .vdev_op_io_start = vdev_file_io_start, - .vdev_op_io_done = vdev_file_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_file_hold, - .vdev_op_rele = vdev_file_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_rebuild_asize = NULL, - .vdev_op_metaslab_init = NULL, - .vdev_op_config_generate = NULL, - .vdev_op_nparity = NULL, - .vdev_op_ndisks = NULL, - .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -#endif - -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, - "Logical ashift for file-based devices"); -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, - "Physical ashift for file-based devices"); diff --git a/module/os/linux/zfs/vdev_file.c b/module/zfs/vdev_file.c similarity index 95% rename from module/os/linux/zfs/vdev_file.c rename to module/zfs/vdev_file.c index 2cab6532487a..224340405d70 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -21,26 +21,19 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include #include -#include #include #include -#include #include #include #include #include -#include -#include -#ifdef _KERNEL -#include -#include -#else -#include -#endif +#include + /* * Virtual device vector for files. */ @@ -58,16 +51,31 @@ static taskq_t *vdev_file_taskq; static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), + minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + static void vdev_file_hold(vdev_t *vd) { - ASSERT(vd->vdev_path != NULL); + ASSERT3P(vd->vdev_path, !=, NULL); } static void vdev_file_rele(vdev_t *vd) { - ASSERT(vd->vdev_path != NULL); + ASSERT3P(vd->vdev_path, !=, NULL); } static mode_t @@ -139,7 +147,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. 
*/ - ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + ASSERT3P(vd->vdev_path, !=, NULL); + ASSERT3S(vd->vdev_path[0], ==, '/'); error = zfs_file_open(vd->vdev_path, vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); @@ -201,8 +210,8 @@ vdev_file_io_strategy(void *arg) zio_t *zio = (zio_t *)arg; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; void *buf; + ssize_t resid; loff_t off; ssize_t size; int err; @@ -211,6 +220,7 @@ vdev_file_io_strategy(void *arg) size = zio->io_size; resid = 0; + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_type == ZIO_TYPE_READ) { buf = abd_borrow_buf(zio->io_abd, zio->io_size); err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); @@ -257,6 +267,7 @@ vdev_file_io_start(zio_t *zio) return; } +#ifdef __linux__ /* * We cannot safely call vfs_fsync() when PF_FSTRANS * is set in the current context. Filesystems like @@ -270,10 +281,9 @@ vdev_file_io_start(zio_t *zio) TASKQID_INVALID); return; } +#endif - zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); - - zio_execute(zio); + vdev_file_io_fsync(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { ASSERT3U(zio->io_size, !=, 0); @@ -283,6 +293,7 @@ vdev_file_io_start(zio_t *zio) return; } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); zio->io_target_timestamp = zio_handle_io_delay(zio); VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, @@ -320,21 +331,6 @@ vdev_ops_t vdev_file_ops = { .vdev_op_leaf = B_TRUE /* leaf vdev */ }; -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), - minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); - - VERIFY(vdev_file_taskq); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - /* * From userland we access disks just like files. */ From e085d66f7ab4ec5f5cee5a16e721db852c8963f2 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Sat, 22 Feb 2025 10:50:33 -0800 Subject: [PATCH 35/44] Fix wrong free function in arc_hdr_decrypt Need to use arc_free_data_abd to free abd type buffer. Reviewed-by: Alexander Motin Signed-off-by: Chunwei Chen Closes #17079 --- module/zfs/arc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index c6383d03a4a4..119576563e86 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1882,7 +1882,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) - arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); + arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } From 7ea899be04a9f5d1ed4b977c92a1234166d26d3b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 23 Feb 2025 06:16:54 +1100 Subject: [PATCH 36/44] vdev_file: make FLUSH and TRIM asynchronous zfs_file_fsync() and zfs_file_deallocate() are both blocking ops, so the zio_taskq thread is active and blocked both while waiting for the IO call and then while calling zio_execute() for the next stage. This is a particular issue for FLUSH, as the z_flush_iss queue typically only has one thread; multiple flushes arriving at once can cause long delays if the underlying fsync() response is particularly slow. To fix this, we dispatch both FLUSH and TRIM to the z_vdev_file taskq, just as we do for reads and writes. Further, we return all results through zio_interrupt(), so neither the issue nor the file taskqs are blocked. Sponsored-by: Klara, Inc. 
Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17064 --- module/zfs/vdev_file.c | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 224340405d70..66997f0e7e8e 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -248,11 +248,22 @@ vdev_file_io_fsync(void *arg) zio_interrupt(zio); } +static void +vdev_file_io_deallocate(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_file_t *vf = zio->io_vd->vdev_tsd; + + zio->io_error = zfs_file_deallocate(vf->vf_file, + zio->io_offset, zio->io_size); + + zio_interrupt(zio); +} + static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; if (zio->io_type == ZIO_TYPE_FLUSH) { /* XXPOLICY */ @@ -263,33 +274,23 @@ vdev_file_io_start(zio_t *zio) } if (zfs_nocacheflush) { - zio_execute(zio); + zio_interrupt(zio); return; } -#ifdef __linux__ - /* - * We cannot safely call vfs_fsync() when PF_FSTRANS - * is set in the current context. Filesystems like - * XFS include sanity checks to verify it is not - * already set, see xfs_vm_writepage(). Therefore - * the sync must be dispatched to a different context. - */ - if (__spl_pf_fstrans_check()) { - VERIFY3U(taskq_dispatch(vdev_file_taskq, - vdev_file_io_fsync, zio, TQ_SLEEP), !=, - TASKQID_INVALID); - return; - } -#endif + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, TASKQID_INVALID); - vdev_file_io_fsync(zio); return; - } else if (zio->io_type == ZIO_TYPE_TRIM) { + } + + if (zio->io_type == ZIO_TYPE_TRIM) { ASSERT3U(zio->io_size, !=, 0); - zio->io_error = zfs_file_deallocate(vf->vf_file, - zio->io_offset, zio->io_size); - zio_execute(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_deallocate, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; } From 92d1686a2ad9e5ada480029d698522e1b5bdcda6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 23 Feb 2025 10:46:26 +1100 Subject: [PATCH 37/44] include: move zio_priority_t into zfs.h It's included so it's effectively already part of it, but it's not always installed as a userspace header, making zfs.h effectively useless. Might as well just combine it. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Signed-off-by: Rob Norris Close #17066 --- include/Makefile.am | 1 - include/sys/dmu.h | 1 - include/sys/fs/zfs.h | 23 +++++++++++++++++-- include/sys/zio.h | 1 - include/sys/zio_priority.h | 47 -------------------------------------- 5 files changed, 21 insertions(+), 52 deletions(-) delete mode 100644 include/sys/zio_priority.h diff --git a/include/Makefile.am b/include/Makefile.am index f173064efc99..a9258deabfd7 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -150,7 +150,6 @@ COMMON_H = \ sys/zio_compress.h \ sys/zio_crypt.h \ sys/zio_impl.h \ - sys/zio_priority.h \ sys/zrlock.h \ sys/zthr.h \ \ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 29f715039d29..2e49b290b263 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -48,7 +48,6 @@ #include #include #include -#include #include #include diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index dc474e3739f3..dc84e66c1e85 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2024 by Delphix. 
All rights reserved. + * Copyright (c) 2011, 2014, 2016, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -37,7 +37,6 @@ #define _SYS_FS_ZFS_H extern __attribute__((visibility("default"))) #include -#include #ifdef __cplusplus extern "C" { @@ -1126,6 +1125,26 @@ typedef enum zio_type { */ #define ZIO_TYPE_IOCTL ZIO_TYPE_FLUSH +/* + * ZIO priority types. Needed to interpret vdev statistics below. + * + * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * VALUE. + */ +typedef enum zio_priority { + ZIO_PRIORITY_SYNC_READ, + ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ + ZIO_PRIORITY_ASYNC_READ, /* prefetch */ + ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ + ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ + ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ + ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ + ZIO_PRIORITY_NUM_QUEUEABLE, + ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ +} zio_priority_t; + /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. diff --git a/include/sys/zio.h b/include/sys/zio.h index 46f5d68aed4a..741f34b2d871 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -35,7 +35,6 @@ #ifndef _ZIO_H #define _ZIO_H -#include #include #include #include diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h deleted file mode 100644 index bdf5f9b8ff35..000000000000 --- a/include/sys/zio_priority.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * CDDL HEADER START - * - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - */ -#ifndef _ZIO_PRIORITY_H -#define _ZIO_PRIORITY_H - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER - * VALUE. - */ -typedef enum zio_priority { - ZIO_PRIORITY_SYNC_READ, - ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ - ZIO_PRIORITY_ASYNC_READ, /* prefetch */ - ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ - ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ - ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ - ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ - ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ - ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ - ZIO_PRIORITY_NUM_QUEUEABLE, - ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ -} zio_priority_t; - -#ifdef __cplusplus -} -#endif - -#endif /* _ZIO_PRIORITY_H */ From b4ce059a7638bcc329298c8102473ebb37633dad Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 24 Feb 2025 07:22:00 +1100 Subject: [PATCH 38/44] suspend_resume_single: clear pool errors on fail If the timing is unfortunate, the pool can suspend just as we're failing because it didn't suspend. If we don't resume the pool, we hang trying to destroy it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17054 --- tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh index 0dc5584e4fd5..22831d28adaf 100755 --- a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh +++ b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh @@ -30,6 +30,7 @@ DATAFILE="$TMPDIR/datafile" function cleanup { + zpool clear $TESTPOOL destroy_pool $TESTPOOL unload_scsi_debug rm -f $DATA_FILE From c2668b2d1096091291b1c3d0caeda985c0bd80d5 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 25 Feb 2025 14:26:34 -0500 Subject: [PATCH 39/44] Better fill empty metaslabs Before this change the zfs_metaslab_switch_threshold tunable switched metaslabs each time a metaslab's index dropped by two (which means its biggest contiguous chunk shrank to 1/4). It is a good idea to balance metaslab fragmentation. But for empty metaslabs (having power-of-2 sizes) this means switching when they get just below half of their capacity. Inspection with zdb after filling a new pool to half capacity showed most of its metaslabs filled to half capacity. I consider this sub-optimal for pool fragmentation in the long run. This change blocks the metaslab switching if most of the metaslab's free space (15/16) is represented by a single contiguous range. Such a metaslab should not be considered fragmented until it actually fails some big allocation. More contiguous filling should improve data locality and increase the time before a previously filled and partially freed metaslab is touched again, giving it more time to free more contiguous chunks for lower fragmentation. It should also slightly reduce spacemap traffic. Reviewed-by: Brian Behlendorf Reviewed-by: Paul Dagnelie Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17081 --- module/zfs/metaslab.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 35bd968f68ce..c1424a81bf7b 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -3545,6 +3545,15 @@ metaslab_segment_may_passivate(metaslab_t *msp) if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) return; + /* + * As long as a single largest free segment covers majority of free + * space, don't consider the metaslab fragmented. It should allow + * us to fill new unfragmented metaslabs full before switching.
+ */ + if (metaslab_largest_allocatable(msp) > + zfs_range_tree_space(msp->ms_allocatable) * 15 / 16) + return; + /* * Since we are in the middle of a sync pass, the most accurate * information that is accessible to us is the in-core range tree From 383256c3293a5d6c91958885f632c02b4b2d28f7 Mon Sep 17 00:00:00 2001 From: aokblast Date: Wed, 26 Feb 2025 03:28:57 +0800 Subject: [PATCH 40/44] spa: fix signature mismatch for spa_boot_init as eventhandler required Reviewed-by: Ameer Hamza Reviewed-by: Alexander Motin Signed-off-by: SHENGYI HONG Closes #17088 --- include/sys/spa.h | 2 +- module/zfs/spa_misc.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 510d1119bffd..ecf3cd8a2f9d 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1213,7 +1213,7 @@ extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); -extern void spa_boot_init(void); +extern void spa_boot_init(void *); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 0550dfd4766d..2e67a27c5483 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2521,8 +2521,9 @@ spa_name_compare(const void *a1, const void *a2) } void -spa_boot_init(void) +spa_boot_init(void *unused) { + (void) unused; spa_config_load(); } From 7e72312eff201d49bc793cbb9da21903a1e39dae Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 25 Feb 2025 11:30:51 -0800 Subject: [PATCH 41/44] Don't try to get mg of hole vdev in removal Don't try to get mg of hole vdev in removal Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #17080 --- module/zfs/vdev_removal.c | 4 +-- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/removal/removal_with_hole.ksh | 34 +++++++++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_hole.ksh diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 1970c5425854..d3351555ced5 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -209,8 +209,8 @@ vdev_passivate(vdev_t *vd, uint64_t *txg) for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; - if (cvd == vd || - cvd->vdev_ops == &vdev_indirect_ops) + if (cvd == vd || !vdev_is_concrete(cvd) || + vdev_is_dead(cvd)) continue; metaslab_class_t *mc = cvd->vdev_mg->mg_class; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 2ba8a1ca4ca5..8e1ffab5b4eb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -887,7 +887,8 @@ tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', - 'remove_indirect', 'remove_attach_mirror', 'removal_reservation'] + 'remove_indirect', 'remove_attach_mirror', 'removal_reservation', + 'removal_with_hole'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index fbb6621585c3..24eeac11299f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1852,6 +1852,7 @@ 
nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ + functional/removal/removal_with_hole.ksh \ functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ diff --git a/tests/zfs-tests/tests/functional/removal/removal_with_hole.ksh b/tests/zfs-tests/tests/functional/removal/removal_with_hole.ksh new file mode 100755 index 000000000000..34175fc64394 --- /dev/null +++ b/tests/zfs-tests/tests/functional/removal/removal_with_hole.ksh @@ -0,0 +1,34 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +log_onexit default_cleanup_noexit +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +log_must zpool create $TESTPOOL $DISK1 log $DISK2 +log_must zpool add $TESTPOOL $DISK3 +log_must zpool remove $TESTPOOL $DISK2 +log_must zpool remove $TESTPOOL $DISK1 + +log_pass "Removal with a hole as the first other device doesn't panic." From 637f918211415c561246bf211f33d6102e50b114 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 26 Feb 2025 00:32:12 +0500 Subject: [PATCH 42/44] arc: avoid possible deadlock in arc_read In l2arc_evict(), the config lock may be acquired in reverse order (e.g., first the config lock (writer), then a hash lock) unlike in arc_read() during scenarios like L2ARC device removal. To avoid deadlocks, if the attempt to acquire the config lock (reader) fails in arc_read(), release the hash lock, wait for the config lock, and retry from the beginning. 
Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #17071 --- cmd/zdb/zdb.c | 4 ++-- include/sys/zio.h | 3 ++- module/zfs/arc.c | 26 ++++++++++++++++++++++---- module/zfs/dsl_scan.c | 2 +- module/zfs/spa.c | 2 +- module/zfs/zio.c | 19 ++++++++++++------- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5e8f282e96c3..fbc92833ebce 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -9043,7 +9043,7 @@ zdb_read_block(char *thing, spa_t *spa) const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, - BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { + BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; @@ -9052,7 +9052,7 @@ zdb_read_block(char *thing, spa_t *spa) b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (lsize == -1 || zfs_blkptr_verify(spa, b, - BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { printf("invalid block pointer at this DVA\n"); goto out; } diff --git a/include/sys/zio.h b/include/sys/zio.h index 741f34b2d871..225f326e5244 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -545,6 +545,7 @@ enum blk_verify_flag { enum blk_config_flag { BLK_CONFIG_HELD, // SCL_VDEV held for writer BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV }; @@ -662,7 +663,7 @@ extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); -extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, +extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 119576563e86..229870702585 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5568,6 +5568,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; + boolean_t bp_validation = B_FALSE; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -5610,7 +5611,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * should always be the case since the blkptr is protected by * a checksum. */ - if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, + if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, BLK_VERIFY_LOG)) { mutex_exit(hash_lock); rc = SET_ERROR(ECKSUM); @@ -5762,6 +5763,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + int config_lock; + int error; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) @@ -5770,16 +5773,31 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, goto done; } + if (zio_flags & ZIO_FLAG_CONFIG_WRITER) { + config_lock = BLK_CONFIG_HELD; + } else if (hash_lock != NULL) { + /* + * Prevent lock order reversal + */ + config_lock = BLK_CONFIG_NEEDED_TRY; + } else { + config_lock = BLK_CONFIG_NEEDED; + } + /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. */ - if (!zfs_blkptr_verify(spa, bp, - (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? 
- BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + if (!bp_validation && (error = zfs_blkptr_verify(spa, bp, + config_lock, BLK_VERIFY_LOG))) { if (hash_lock != NULL) mutex_exit(hash_lock); + if (error == EBUSY && !zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + bp_validation = B_TRUE; + goto top; + } rc = SET_ERROR(ECKSUM); goto done; } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 5977f8c82b45..35b56420511a 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2305,7 +2305,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); - } else if (!zfs_blkptr_verify(spa, bp, + } else if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 54830b9536d9..820a82b10323 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2778,7 +2778,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ - if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { + if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index b071ac17ed1f..36e2f5e4bba8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1164,7 +1164,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * - * If everything checks out B_TRUE is returned. The zfs_blkptr_verify + * If everything checks out 0 is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: @@ -1179,7 +1179,7 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ -boolean_t +int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { @@ -1211,7 +1211,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BPE_GET_PSIZE(bp)); } - return (errors == 0); + return (errors ? ECKSUM : 0); } if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, @@ -1229,7 +1229,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (unlikely(!spa->spa_trust_config)) - return (errors == 0); + return (errors ? ECKSUM : 0); switch (blk_config) { case BLK_CONFIG_HELD: @@ -1238,8 +1238,12 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; + case BLK_CONFIG_NEEDED_TRY: + if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER)) + return (EBUSY); + break; case BLK_CONFIG_SKIP: - return (errors == 0); + return (errors ? 
ECKSUM : 0); default: panic("invalid blk_config %u", blk_config); } @@ -1294,10 +1298,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, bp, i, (longlong_t)offset); } } - if (blk_config == BLK_CONFIG_NEEDED) + if (blk_config == BLK_CONFIG_NEEDED || blk_config == + BLK_CONFIG_NEEDED_TRY) spa_config_exit(spa, SCL_VDEV, bp); - return (errors == 0); + return (errors ? ECKSUM : 0); } boolean_t From f741c841dd69c8822a71f73180912f7fe2f6e2bb Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 25 Feb 2025 11:40:20 -0800 Subject: [PATCH 43/44] zpool: allow relative vdev paths `zpool create` won't let you use relative paths to disks. This is annoying when you want to do: zpool create tank ./diskfile But have to do.. zpool create tank `pwd`/diskfile This fixes it. Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #17042 --- lib/libzutil/zutil_device_path.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lib/libzutil/zutil_device_path.c b/lib/libzutil/zutil_device_path.c index 0425018e1022..0586c0c7c80d 100644 --- a/lib/libzutil/zutil_device_path.c +++ b/lib/libzutil/zutil_device_path.c @@ -57,6 +57,7 @@ int zfs_resolve_shortname(const char *name, char *path, size_t len) { const char *env = getenv("ZPOOL_IMPORT_PATH"); + char resolved_path[PATH_MAX]; if (env) { for (;;) { @@ -85,6 +86,20 @@ zfs_resolve_shortname(const char *name, char *path, size_t len) } } + /* + * The user can pass a relative path like ./file1 for the vdev. The path + * must contain a directory prefix like './file1' or '../file1'. Simply + * passing 'file1' is not allowed, as it may match a block device name. + */ + if ((strncmp(name, "./", 2) == 0 || strncmp(name, "../", 3) == 0) && + realpath(name, resolved_path) != NULL) { + if (access(resolved_path, F_OK) == 0) { + if (strlen(resolved_path) + 1 <= len) { + if (strlcpy(path, resolved_path, len) < len) + return (0); /* success */ + } + } + } return (errno = ENOENT); } From acfd6511cf77ae98494c2bc40491cde11417b16b Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 27 Feb 2025 11:24:36 -0800 Subject: [PATCH 44/44] Linux 6.13 compat: META (#17098) Update the META file to reflect compatibility with the 6.13 kernel. Signed-off-by: Tony Hutter Reviewed-by: Rob Norris Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 4b5605bdf226..6ece54a13f0d 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.12 +Linux-Maximum: 6.13 Linux-Minimum: 4.18
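As a quick illustration of the path rule in the "zpool: allow relative vdev paths" change above, here is a minimal standalone sketch (not part of the patch series; resolve_relative() is a hypothetical helper, not a libzutil function). It mirrors the "./" / "../" prefix check and realpath() resolution that the patch adds to zfs_resolve_shortname(), so a bare name like "file1" is still left to the normal device-name search:

/* relpath.c - standalone sketch of the relative vdev path check */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int
resolve_relative(const char *name, char *path, size_t len)
{
	char resolved[PATH_MAX];

	/* Require an explicit directory prefix, as the patch does. */
	if (strncmp(name, "./", 2) != 0 && strncmp(name, "../", 3) != 0)
		return (ENOENT);

	/* Resolve to an absolute path and confirm the file exists. */
	if (realpath(name, resolved) == NULL || access(resolved, F_OK) != 0)
		return (ENOENT);

	/* Copy out only if the caller's buffer is large enough. */
	if (strlen(resolved) + 1 > len)
		return (ENOENT);
	memcpy(path, resolved, strlen(resolved) + 1);
	return (0);
}

int
main(int argc, char **argv)
{
	char path[PATH_MAX];

	for (int i = 1; i < argc; i++) {
		if (resolve_relative(argv[i], path, sizeof (path)) == 0)
			printf("%s -> %s\n", argv[i], path);
		else
			printf("%s: not treated as a relative vdev path\n", argv[i]);
	}
	return (0);
}

Built on its own (for example, cc -o relpath relpath.c), running ./relpath ./diskfile diskfile against an existing ./diskfile should print an absolute path for the first argument and reject the second, matching the zpool create tank ./diskfile example in the commit message.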