Skip to content

Commit c5a762f

Browse files
committed
BRT: Fix FICLONE/FICLONERANGE shortened copy
On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls are expected to either fully clone the specified range or return an error. The range may be for an entire file. While internally ZFS supports cloning partial ranges there's no way to return the length cloned to the caller so we need to make this all or nothing. As part of this change support for the REMAP_FILE_CAN_SHORTEN flag has been added. When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range() will return a shortened range when encountering pending dirty records. When it's clear zfs_clone_range() will block and wait for the records to be written out allowing the blocks to be cloned. Furthermore, the file rangelock is held over the region being cloned to prevent it from being modified while cloning. This doesn't quite provide atomic semantics since if an error is encountered only a portion of the range may be cloned. This will be converted to an error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the caller. However, the destination file range is left in an undefined state. A test case has been added which exercises this functionality by verifying that `cp --reflink=never|auto|always` works correctly. Signed-off-by: Brian D Behlendorf <[email protected]> Issue #15728
1 parent 2e6b3c4 commit c5a762f

File tree

11 files changed

+222
-24
lines changed

11 files changed

+222
-24
lines changed

include/os/linux/zfs/sys/zfs_vfsops_os.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ typedef struct zfsvfs zfsvfs_t;
4646
struct znode;
4747

4848
extern int zfs_bclone_enabled;
49+
extern int zfs_bclone_wait_dirty;
4950

5051
/*
5152
* This structure emulates the vfs_t from other platforms. Its purpose

man/man4/zfs.4

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,15 @@ Enable the experimental block cloning feature.
11591159
If this setting is 0, then even if feature@block_cloning is enabled,
11601160
attempts to clone blocks will act as though the feature is disabled.
11611161
.
1162+
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
1163+
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
1164+
written to disk.
1165+
This allows the clone operation to reliably succeed when a file is
1166+
modified and then immediately cloned.
1167+
For small files this may be slower than making a copy of the file.
1168+
Therefore, this setting defaults to 0 which causes a clone operation to
1169+
immediately fail when encountering a dirty block.
1170+
.
11621171
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
11631172
Select a BLAKE3 implementation.
11641173
.Pp

module/os/freebsd/zfs/zfs_vfsops.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ int zfs_bclone_enabled = 1;
9393
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
9494
&zfs_bclone_enabled, 0, "Enable block cloning");
9595

96+
int zfs_bclone_wait_dirty = 0;
97+
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_wait_dirty, CTLFLAG_RWTUN,
98+
&zfs_bclone_wait_dirty, 0, "Wait for dirty blocks when cloning");
99+
96100
struct zfs_jailparam {
97101
int mount_snapshot;
98102
};

module/os/linux/zfs/zfs_vnops_os.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4257,7 +4257,11 @@ module_param(zfs_delete_blocks, ulong, 0644);
42574257
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
42584258

42594259
/* CSTYLED */
4260-
module_param(zfs_bclone_enabled, uint, 0644);
4260+
module_param(zfs_bclone_enabled, int, 0644);
42614261
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
42624262

4263+
/* CSTYLED */
4264+
module_param(zfs_bclone_wait_dirty, int, 0644);
4265+
MODULE_PARM_DESC(zfs_bclone_wait_dirty, "Wait for dirty blocks when cloning");
4266+
42634267
#endif

module/os/linux/zfs/zpl_file_range.c

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <sys/zfeature.h>
3333

3434
int zfs_bclone_enabled = 1;
35+
int zfs_bclone_wait_dirty = 0;
3536

3637
/*
3738
* Clone part of a file via block cloning.
@@ -40,7 +41,7 @@ int zfs_bclone_enabled = 1;
4041
* care of that depending on how it was called.
4142
*/
4243
static ssize_t
43-
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
44+
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
4445
struct file *dst_file, loff_t dst_off, size_t len)
4546
{
4647
struct inode *src_i = file_inode(src_file);
@@ -96,11 +97,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
9697
{
9798
ssize_t ret;
9899

100+
/* Flags is reserved for future extensions and must be zero. */
99101
if (flags != 0)
100102
return (-EINVAL);
101103

102-
/* Try to do it via zfs_clone_range() */
103-
ret = __zpl_clone_file_range(src_file, src_off,
104+
/* Try to do it via zfs_clone_range() and allow shortening. */
105+
ret = zpl_clone_file_range_impl(src_file, src_off,
104106
dst_file, dst_off, len);
105107

106108
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
@@ -137,6 +139,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
137139
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
138140
* range in both files and if they're the same, arrange for them to be backed
139141
* by the same storage.
142+
*
143+
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
144+
* if we want. It's designed for filesystems that may need to shorten the
145+
* length for alignment, EOF, or any other requirement. ZFS may shorten the
146+
* request when there is outstanding dirty data which hasn't been written.
140147
*/
141148
loff_t
142149
zpl_remap_file_range(struct file *src_file, loff_t src_off,
@@ -145,24 +152,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
145152
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
146153
return (-EINVAL);
147154

148-
/*
149-
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
150-
* range if we want. Its designed for filesystems that make data past
151-
* EOF available, and don't want it to be visible in both files. ZFS
152-
* doesn't do that, so we just turn the flag off.
153-
*/
154-
flags &= ~REMAP_FILE_CAN_SHORTEN;
155-
155+
/* No support for dedup yet */
156156
if (flags & REMAP_FILE_DEDUP)
157-
/* No support for dedup yet */
158157
return (-EOPNOTSUPP);
159158

160159
/* Zero length means to clone everything to the end of the file */
161160
if (len == 0)
162161
len = i_size_read(file_inode(src_file)) - src_off;
163162

164-
return (__zpl_clone_file_range(src_file, src_off,
165-
dst_file, dst_off, len));
163+
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
164+
dst_file, dst_off, len);
165+
166+
if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
167+
ret = -EINVAL;
168+
169+
return (ret);
166170
}
167171
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
168172

@@ -179,8 +183,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
179183
if (len == 0)
180184
len = i_size_read(file_inode(src_file)) - src_off;
181185

182-
return (__zpl_clone_file_range(src_file, src_off,
183-
dst_file, dst_off, len));
186+
/* The entire length must be cloned or this is an error. */
187+
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
188+
dst_file, dst_off, len);
189+
190+
if (ret >= 0 && ret != len)
191+
ret = -EINVAL;
192+
193+
return (ret);
184194
}
185195
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
186196

@@ -214,8 +224,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
214224

215225
size_t len = i_size_read(file_inode(src_file));
216226

217-
ssize_t ret =
218-
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
227+
ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
219228

220229
fput(src_file);
221230

@@ -253,7 +262,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
253262
if (len == 0)
254263
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
255264

256-
ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
265+
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
257266
dst_file, fcr.fcr_dest_offset, len);
258267

259268
fput(src_file);

module/zfs/zfs_vnops.c

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
10491049
size_t maxblocks, nbps;
10501050
uint_t inblksz;
10511051
uint64_t clear_setid_bits_txg = 0;
1052+
uint64_t last_synced_txg = 0;
10521053

10531054
inoff = *inoffp;
10541055
outoff = *outoffp;
@@ -1287,15 +1288,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
12871288
}
12881289

12891290
nbps = maxblocks;
1291+
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
12901292
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
12911293
&nbps);
12921294
if (error != 0) {
12931295
/*
12941296
* If we are trying to clone a block that was created
1295-
* in the current transaction group, error will be
1296-
* EAGAIN here, which we can just return to the caller
1297-
* so it can fallback if it likes.
1297+
* in the current transaction group, the error will be
1298+
* EAGAIN here. Based on zfs_bclone_wait_dirty either
1299+
* return a shortened range to the caller so it can
1300+
* fallback, or wait for the next TXG and check again.
12981301
*/
1302+
if (error == EAGAIN && zfs_bclone_wait_dirty) {
1303+
txg_wait_synced(dmu_objset_pool(inos),
1304+
last_synced_txg + 1);
1305+
continue;
1306+
}
1307+
12991308
break;
13001309
}
13011310

tests/runfiles/common.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
631631
tags = ['functional', 'compression']
632632

633633
[tests/functional/cp_files]
634-
tests = ['cp_files_001_pos', 'cp_stress']
634+
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
635635
tags = ['functional', 'cp_files']
636636

637637
[tests/functional/crtime]

tests/test-runner/bin/zts-report.py.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'):
176176
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
177177
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
178178
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
179+
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
179180
'link_count/link_count_001': ['SKIP', na_reason],
180181
'casenorm/mixed_create_failure': ['FAIL', 13215],
181182
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],

tests/zfs-tests/include/tunables.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode
9494
VOL_RECURSIVE vol.recursive UNSUPPORTED
9595
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
9696
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
97+
BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty
9798
XATTR_COMPAT xattr_compat zfs_xattr_compat
9899
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
99100
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
13941394
functional/compression/setup.ksh \
13951395
functional/cp_files/cleanup.ksh \
13961396
functional/cp_files/cp_files_001_pos.ksh \
1397+
functional/cp_files/cp_files_002_pos.ksh \
13971398
functional/cp_files/cp_stress.ksh \
13981399
functional/cp_files/setup.ksh \
13991400
functional/crtime/cleanup.ksh \
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
#! /bin/ksh -p
2+
#
3+
# CDDL HEADER START
4+
#
5+
# The contents of this file are subject to the terms of the
6+
# Common Development and Distribution License (the "License").
7+
# You may not use this file except in compliance with the License.
8+
#
9+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10+
# or https://opensource.org/licenses/CDDL-1.0.
11+
# See the License for the specific language governing permissions
12+
# and limitations under the License.
13+
#
14+
# When distributing Covered Code, include this CDDL HEADER in each
15+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16+
# If applicable, add the following below this CDDL HEADER, with the
17+
# fields enclosed by brackets "[]" replaced with your own identifying
18+
# information: Portions Copyright [yyyy] [name of copyright owner]
19+
#
20+
# CDDL HEADER END
21+
#
22+
23+
#
24+
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
25+
#
26+
27+
. $STF_SUITE/include/libtest.shlib
28+
29+
#
30+
# DESCRIPTION:
31+
# Verify all cp --reflink modes work with modified file.
32+
#
33+
# STRATEGY:
34+
# 1. Verify "cp --reflink=never|auto|always" behaves as expected.
35+
# Two different modes of operation are tested.
36+
#
37+
# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL
38+
# when there are dirty blocks which cannot be immediately cloned.
39+
# This is the default behavior.
40+
#
41+
# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for
42+
# dirty blocks to be written to disk allowing the clone to succeed.
43+
# The downside to this is it may be slow which depending on the
44+
# situtation may defeat the point of making a clone.
45+
#
46+
47+
verify_runnable "global"
48+
49+
if ! is_linux; then
50+
log_unsupported "cp --reflink is a GNU coreutils option"
51+
fi
52+
53+
#
# Test teardown, registered via log_onexit: destroy the scratch dataset
# if it still exists and set BCLONE_WAIT_DIRTY back to 0.
#
function cleanup
{
	# The dataset destroyed must be the same one datasetexists checked.
	# Note that "$$" would expand to the shell's PID, not to the pool name.
	datasetexists $TESTPOOL/cp-reflink && \
	    destroy_dataset $TESTPOOL/cp-reflink -f
	log_must set_tunable32 BCLONE_WAIT_DIRTY 0
}
59+
60+
function verify_copy
61+
{
62+
src_cksum=$(sha256digest $1)
63+
dst_cksum=$(sha256digest $2)
64+
65+
if [[ "$src_cksum" != "$dst_cksum" ]]; then
66+
log_must ls -l $CP_TESTDIR
67+
log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
68+
fi
69+
}
70+
71+
log_assert "Verify all cp --reflink modes work with modified file"

log_onexit cleanup

SRC_FILE=src.data
DST_FILE=dst.data
# Source file size in records (1..2047).  It must be strictly positive:
# it is used below as a dd count and as a modulus ($RANDOM % $SRC_SIZE),
# so zero would yield an empty source file and a division by zero.
SRC_SIZE=$((1 + $RANDOM % 2047))

# A smaller recordsize is used merely to speed up the test.
RECORDSIZE=4096

log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)

# Relative SRC_FILE/DST_FILE paths below resolve inside the new dataset.
log_must cd $CP_TESTDIR
87+
# Never wait on dirty blocks (zfs_bclone_wait_dirty=0)
88+
log_must set_tunable32 BCLONE_WAIT_DIRTY 0
89+
90+
for mode in "never" "auto" "always"; do
91+
log_note "Checking 'cp --reflink=$mode'"
92+
93+
# Create a new file and immediately copy it.
94+
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
95+
96+
if [[ "$mode" == "always" ]]; then
97+
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
98+
log_must ls -l $CP_TESTDIR
99+
else
100+
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
101+
verify_copy $SRC_FILE $DST_FILE
102+
fi
103+
log_must rm -f $DST_FILE
104+
105+
# Append to an existing file and immediately copy it.
106+
sync_pool $TESTPOOL
107+
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
108+
count=1 conv=notrunc
109+
if [[ "$mode" == "always" ]]; then
110+
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
111+
log_must ls -l $CP_TESTDIR
112+
else
113+
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
114+
verify_copy $SRC_FILE $DST_FILE
115+
fi
116+
log_must rm -f $DST_FILE
117+
118+
# Overwrite a random range of an existing file and immediately copy it.
119+
sync_pool $TESTPOOL
120+
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
121+
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
122+
if [[ "$mode" == "always" ]]; then
123+
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
124+
log_must ls -l $CP_TESTDIR
125+
else
126+
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
127+
verify_copy $SRC_FILE $DST_FILE
128+
fi
129+
log_must rm -f $SRC_FILE $DST_FILE
130+
done
131+
132+
# Wait on dirty blocks (zfs_bclone_wait_dirty=1)
133+
log_must set_tunable32 BCLONE_WAIT_DIRTY 1
134+
135+
for mode in "never" "auto" "always"; do
136+
log_note "Checking 'cp --reflink=$mode'"
137+
138+
# Create a new file and immediately copy it.
139+
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
140+
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
141+
verify_copy $SRC_FILE $DST_FILE
142+
log_must rm -f $DST_FILE
143+
144+
# Append to an existing file and immediately copy it.
145+
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
146+
count=1 conv=notrunc
147+
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
148+
verify_copy $SRC_FILE $DST_FILE
149+
log_must rm -f $DST_FILE
150+
151+
# Overwrite a random range of an existing file and immediately copy it.
152+
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
153+
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
154+
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
155+
verify_copy $SRC_FILE $DST_FILE
156+
log_must rm -f $SRC_FILE $DST_FILE
157+
done
158+
159+
log_pass

0 commit comments

Comments
 (0)