Skip to content

Commit 46e82de

Browse files
committed
BRT: Fix FICLONE/FICLONERANGE shortened copy
On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls are expected to either fully clone the specified range or return an error. The range may be for an entire file. While internally ZFS supports cloning partial ranges there's no way to return the length cloned to the caller so we need to make this all or nothing. As part of this change support for the REMAP_FILE_CAN_SHORTEN flag has been added. When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range() will return a shortened range when encountering pending dirty records. When it's clear zfs_clone_range() will block and wait for the records to be written out allowing the blocks to be cloned. Furthermore, the file rangelock is held over the region being cloned to prevent it from being modified while cloning. This doesn't quite provide an atomic semantics since if an error is encountered only a portion of the range may be cloned. This will be converted to an error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the caller. However, the destination file range is left in an undefined state. A test case has been added which exercises this functionality by verifying that `cp --reflink=never|auto|always` works correctly. Signed-off-by: Brian D Behlendorf <[email protected]> Issue #15728
1 parent 2e6b3c4 commit 46e82de

File tree

13 files changed

+230
-31
lines changed

13 files changed

+230
-31
lines changed

include/os/linux/zfs/sys/zfs_vfsops_os.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ typedef struct zfsvfs zfsvfs_t;
4646
struct znode;
4747

4848
extern int zfs_bclone_enabled;
49+
extern int zfs_bclone_wait_dirty;
4950

5051
/*
5152
* This structure emulates the vfs_t from other platforms. It's purpose

include/sys/zfs_vnops.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
3232
extern int zfs_holey(znode_t *, ulong_t, loff_t *);
3333
extern int zfs_access(znode_t *, int, int, cred_t *);
3434
extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
35-
uint64_t *, cred_t *);
35+
uint64_t *, cred_t *, boolean_t);
3636
extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
3737
const blkptr_t *, size_t);
3838

man/man4/zfs.4

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,14 @@ Enable the experimental block cloning feature.
11591159
If this setting is 0, then even if feature@block_cloning is enabled,
11601160
attempts to clone blocks will act as though the feature is disabled.
11611161
.
1162+
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
1163+
When set to 1 the FICLONE and FICLONERANGE ioctls will wait for
1164+
dirty blocks in the file to be written to disk.
1165+
This allows the clone operation to reliably succeed when it is
1166+
posible but may take a significant amount of time.
1167+
By default this setting is 0 which results in the clone operation
1168+
immediately failing when a dirty block is encountered.
1169+
.
11621170
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
11631171
Select a BLAKE3 implementation.
11641172
.Pp

module/os/freebsd/zfs/zfs_vfsops.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ int zfs_bclone_enabled = 1;
9393
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
9494
&zfs_bclone_enabled, 0, "Enable block cloning");
9595

96+
int zfs_bclone_wait_dirty = 0;
97+
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_wait_dirty, CTLFLAG_RWTUN,
98+
&zfs_bclone_wait_dirty, 0, "Wait for dirty blocks when cloning");
99+
96100
struct zfs_jailparam {
97101
int mount_snapshot;
98102
};

module/os/freebsd/zfs/zfs_vnops_os.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6304,7 +6304,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
63046304
goto out_locked;
63056305

63066306
error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
6307-
ap->a_outoffp, &len, ap->a_outcred);
6307+
ap->a_outoffp, &len, ap->a_outcred, zfs_bclone_wait_dirty);
63086308
if (error == EXDEV || error == EAGAIN || error == EINVAL ||
63096309
error == EOPNOTSUPP)
63106310
goto bad_locked_fallback;

module/os/linux/zfs/zfs_vnops_os.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4257,7 +4257,11 @@ module_param(zfs_delete_blocks, ulong, 0644);
42574257
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
42584258

42594259
/* CSTYLED */
4260-
module_param(zfs_bclone_enabled, uint, 0644);
4260+
module_param(zfs_bclone_enabled, int, 0644);
42614261
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
42624262

4263+
/* CSTYLED */
4264+
module_param(zfs_bclone_wait_dirty, int, 0644);
4265+
MODULE_PARM_DESC(zfs_bclone_wait_dirty, "Wait for dirty blocks when cloning");
4266+
42634267
#endif

module/os/linux/zfs/zpl_file_range.c

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,21 @@
3232
#include <sys/zfeature.h>
3333

3434
int zfs_bclone_enabled = 1;
35+
int zfs_bclone_wait_dirty = 0;
3536

3637
/*
3738
* Clone part of a file via block cloning.
3839
*
3940
* Note that we are not required to update file offsets; the kernel will take
4041
* care of that depending on how it was called.
42+
*
43+
* When the caller cannot handle a shortened range wait_dirty may be set
44+
* to request that zfs_clone_range() wait on transaction groups as needed
45+
* to allow the clone to succeed if possible.
4146
*/
4247
static ssize_t
43-
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
44-
struct file *dst_file, loff_t dst_off, size_t len)
48+
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
49+
struct file *dst_file, loff_t dst_off, size_t len, boolean_t wait_dirty)
4550
{
4651
struct inode *src_i = file_inode(src_file);
4752
struct inode *dst_i = file_inode(dst_file);
@@ -67,7 +72,7 @@ __zpl_clone_file_range(struct file *src_file, loff_t src_off,
6772
cookie = spl_fstrans_mark();
6873

6974
err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i),
70-
&dst_off_o, &len_o, cr);
75+
&dst_off_o, &len_o, cr, wait_dirty);
7176

7277
spl_fstrans_unmark(cookie);
7378
crfree(cr);
@@ -96,12 +101,13 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
96101
{
97102
ssize_t ret;
98103

104+
/* Flags is reserved for future extensions and must be zero. */
99105
if (flags != 0)
100106
return (-EINVAL);
101107

102-
/* Try to do it via zfs_clone_range() */
103-
ret = __zpl_clone_file_range(src_file, src_off,
104-
dst_file, dst_off, len);
108+
/* Try to do it via zfs_clone_range() and allow shortening. */
109+
ret = zpl_clone_file_range_impl(src_file, src_off,
110+
dst_file, dst_off, len, zfs_bclone_wait_dirty);
105111

106112
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
107113
/*
@@ -137,6 +143,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
137143
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
138144
* range in both files and if they're the same, arrange for them to be backed
139145
* by the same storage.
146+
*
147+
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
148+
* if we want. It's designed for filesystems that may need to shorten the
149+
* length for alignment, EOF, or any other requirement. ZFS may shorten the
150+
* request when there is outstanding dirty data which hasn't been written.
140151
*/
141152
loff_t
142153
zpl_remap_file_range(struct file *src_file, loff_t src_off,
@@ -145,24 +156,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
145156
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
146157
return (-EINVAL);
147158

148-
/*
149-
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
150-
* range if we want. Its designed for filesystems that make data past
151-
* EOF available, and don't want it to be visible in both files. ZFS
152-
* doesn't do that, so we just turn the flag off.
153-
*/
154-
flags &= ~REMAP_FILE_CAN_SHORTEN;
155-
159+
/* No support for dedup yet */
156160
if (flags & REMAP_FILE_DEDUP)
157-
/* No support for dedup yet */
158161
return (-EOPNOTSUPP);
159162

160163
/* Zero length means to clone everything to the end of the file */
161164
if (len == 0)
162165
len = i_size_read(file_inode(src_file)) - src_off;
163166

164-
return (__zpl_clone_file_range(src_file, src_off,
165-
dst_file, dst_off, len));
167+
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
168+
dst_file, dst_off, len, zfs_bclone_wait_dirty);
169+
170+
if (!(flags & REMAP_FILE_CAN_SHORTEN) && (ret != len))
171+
ret = -EINVAL;
172+
173+
return (ret);
166174
}
167175
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
168176

@@ -179,8 +187,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
179187
if (len == 0)
180188
len = i_size_read(file_inode(src_file)) - src_off;
181189

182-
return (__zpl_clone_file_range(src_file, src_off,
183-
dst_file, dst_off, len));
190+
/* The entire length must be cloned or this is an error. */
191+
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
192+
dst_file, dst_off, len, zfs_bclone_wait_dirty);
193+
194+
if (len != ret)
195+
ret = -EINVAL;
196+
197+
return (ret);
184198
}
185199
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
186200

@@ -214,8 +228,8 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
214228

215229
size_t len = i_size_read(file_inode(src_file));
216230

217-
ssize_t ret =
218-
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
231+
ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len,
232+
zfs_bclone_wait_dirty);
219233

220234
fput(src_file);
221235

@@ -253,8 +267,8 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
253267
if (len == 0)
254268
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
255269

256-
ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
257-
dst_file, fcr.fcr_dest_offset, len);
270+
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
271+
dst_file, fcr.fcr_dest_offset, len, zfs_bclone_wait_dirty);
258272

259273
fput(src_file);
260274

module/zfs/zfs_vnops.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,7 +1030,7 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
10301030
*/
10311031
int
10321032
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
1033-
uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
1033+
uint64_t *outoffp, uint64_t *lenp, cred_t *cr, boolean_t wait_dirty)
10341034
{
10351035
zfsvfs_t *inzfsvfs, *outzfsvfs;
10361036
objset_t *inos, *outos;
@@ -1049,6 +1049,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
10491049
size_t maxblocks, nbps;
10501050
uint_t inblksz;
10511051
uint64_t clear_setid_bits_txg = 0;
1052+
uint64_t last_synced_txg = 0;
10521053

10531054
inoff = *inoffp;
10541055
outoff = *outoffp;
@@ -1287,15 +1288,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
12871288
}
12881289

12891290
nbps = maxblocks;
1291+
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
12901292
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
12911293
&nbps);
12921294
if (error != 0) {
12931295
/*
12941296
* If we are trying to clone a block that was created
1295-
* in the current transaction group, error will be
1296-
* EAGAIN here, which we can just return to the caller
1297-
* so it can fallback if it likes.
1297+
* in the current transaction group, the error will be
1298+
* EAGAIN here. Based on sync_wait either return a
1299+
* shortened range to the caller so it can fallback,
1300+
* or wait for the next sync and check again.
12981301
*/
1302+
if (error == EAGAIN && wait_dirty) {
1303+
txg_wait_synced(dmu_objset_pool(inos),
1304+
last_synced_txg + 1);
1305+
continue;
1306+
}
1307+
12991308
break;
13001309
}
13011310

tests/runfiles/common.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
631631
tags = ['functional', 'compression']
632632

633633
[tests/functional/cp_files]
634-
tests = ['cp_files_001_pos', 'cp_stress']
634+
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
635635
tags = ['functional', 'cp_files']
636636

637637
[tests/functional/crtime]

tests/test-runner/bin/zts-report.py.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'):
176176
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
177177
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
178178
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
179+
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
179180
'link_count/link_count_001': ['SKIP', na_reason],
180181
'casenorm/mixed_create_failure': ['FAIL', 13215],
181182
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],

tests/zfs-tests/include/tunables.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode
9494
VOL_RECURSIVE vol.recursive UNSUPPORTED
9595
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
9696
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
97+
BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty
9798
XATTR_COMPAT xattr_compat zfs_xattr_compat
9899
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
99100
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
13941394
functional/compression/setup.ksh \
13951395
functional/cp_files/cleanup.ksh \
13961396
functional/cp_files/cp_files_001_pos.ksh \
1397+
functional/cp_files/cp_files_002_pos.ksh \
13971398
functional/cp_files/cp_stress.ksh \
13981399
functional/cp_files/setup.ksh \
13991400
functional/crtime/cleanup.ksh \

0 commit comments

Comments
 (0)