Skip to content

Commit 6dccdf5

Browse files
authored
BRT: Fix FICLONE/FICLONERANGE shortened copy
On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls are expected to either fully clone the specified range or return an error. The range may be for an entire file. While internally ZFS supports cloning partial ranges there's no way to return the length cloned to the caller so we need to make this all or nothing. As part of this change support for the REMAP_FILE_CAN_SHORTEN flag has been added. When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range() will return a shortened range when encountering pending dirty records. When it's clear zfs_clone_range() will block and wait for the records to be written out allowing the blocks to be cloned. Furthermore, the file range lock is held over the region being cloned to prevent it from being modified while cloning. This doesn't quite provide an atomic semantics since if an error is encountered only a portion of the range may be cloned. This will be converted to an error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the caller. However, the destination file range is left in an undefined state. A test case has been added which exercises this functionality by verifying that `cp --reflink=never|auto|always` works correctly. Reviewed-by: Alexander Motin <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #15728 Closes #15842
1 parent a0d3fe7 commit 6dccdf5

File tree

13 files changed

+243
-39
lines changed

13 files changed

+243
-39
lines changed

include/os/freebsd/zfs/sys/zfs_vfsops_os.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,6 @@ typedef struct zfid_long {
285285
#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
286286

287287
extern int zfs_super_owner;
288-
extern int zfs_bclone_enabled;
289288

290289
extern void zfs_init(void);
291290
extern void zfs_fini(void);

include/os/linux/zfs/sys/zfs_vfsops_os.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,6 @@ extern "C" {
4545
typedef struct zfsvfs zfsvfs_t;
4646
struct znode;
4747

48-
extern int zfs_bclone_enabled;
49-
5048
/*
5149
* This structure emulates the vfs_t from other platforms. It's purpose
5250
* is to facilitate the handling of mount options and minimize structural

include/sys/zfs_vnops.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@
2424

2525
#ifndef _SYS_FS_ZFS_VNOPS_H
2626
#define _SYS_FS_ZFS_VNOPS_H
27+
2728
#include <sys/zfs_vnops_os.h>
2829

30+
extern int zfs_bclone_enabled;
31+
2932
extern int zfs_fsync(znode_t *, int, cred_t *);
3033
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
3134
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);

man/man4/zfs.4

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,6 +1159,15 @@ Enable the experimental block cloning feature.
11591159
If this setting is 0, then even if feature@block_cloning is enabled,
11601160
attempts to clone blocks will act as though the feature is disabled.
11611161
.
1162+
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
1163+
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
1164+
written to disk.
1165+
This allows the clone operation to reliably succeed when a file is
1166+
modified and then immediately cloned.
1167+
For small files this may be slower than making a copy of the file.
1168+
Therefore, this setting defaults to 0 which causes a clone operation to
1169+
immediately fail when encountering a dirty block.
1170+
.
11621171
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
11631172
Select a BLAKE3 implementation.
11641173
.Pp

module/os/freebsd/zfs/zfs_vfsops.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,6 @@ int zfs_debug_level;
8989
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
9090
"Debug level");
9191

92-
int zfs_bclone_enabled = 1;
93-
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
94-
&zfs_bclone_enabled, 0, "Enable block cloning");
95-
9692
struct zfs_jailparam {
9793
int mount_snapshot;
9894
};

module/os/linux/zfs/zfs_vnops_os.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4255,9 +4255,4 @@ EXPORT_SYMBOL(zfs_map);
42554255
/* CSTYLED */
42564256
module_param(zfs_delete_blocks, ulong, 0644);
42574257
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
4258-
4259-
/* CSTYLED */
4260-
module_param(zfs_bclone_enabled, uint, 0644);
4261-
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
4262-
42634258
#endif

module/os/linux/zfs/zpl_file_range.c

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,14 @@
3131
#include <sys/zfs_vnops.h>
3232
#include <sys/zfeature.h>
3333

34-
int zfs_bclone_enabled = 1;
35-
3634
/*
3735
* Clone part of a file via block cloning.
3836
*
3937
* Note that we are not required to update file offsets; the kernel will take
4038
* care of that depending on how it was called.
4139
*/
4240
static ssize_t
43-
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
41+
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
4442
struct file *dst_file, loff_t dst_off, size_t len)
4543
{
4644
struct inode *src_i = file_inode(src_file);
@@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
9694
{
9795
ssize_t ret;
9896

97+
/* Flags is reserved for future extensions and must be zero. */
9998
if (flags != 0)
10099
return (-EINVAL);
101100

102-
/* Try to do it via zfs_clone_range() */
103-
ret = __zpl_clone_file_range(src_file, src_off,
101+
/* Try to do it via zfs_clone_range() and allow shortening. */
102+
ret = zpl_clone_file_range_impl(src_file, src_off,
104103
dst_file, dst_off, len);
105104

106105
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
@@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
137136
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
138137
* range in both files and if they're the same, arrange for them to be backed
139138
* by the same storage.
139+
*
140+
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
141+
* if we want. It's designed for filesystems that may need to shorten the
142+
* length for alignment, EOF, or any other requirement. ZFS may shorten the
143+
* request when there is outstanding dirty data which hasn't been written.
140144
*/
141145
loff_t
142146
zpl_remap_file_range(struct file *src_file, loff_t src_off,
@@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
145149
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
146150
return (-EINVAL);
147151

148-
/*
149-
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
150-
* range if we want. Its designed for filesystems that make data past
151-
* EOF available, and don't want it to be visible in both files. ZFS
152-
* doesn't do that, so we just turn the flag off.
153-
*/
154-
flags &= ~REMAP_FILE_CAN_SHORTEN;
155-
152+
/* No support for dedup yet */
156153
if (flags & REMAP_FILE_DEDUP)
157-
/* No support for dedup yet */
158154
return (-EOPNOTSUPP);
159155

160156
/* Zero length means to clone everything to the end of the file */
161157
if (len == 0)
162158
len = i_size_read(file_inode(src_file)) - src_off;
163159

164-
return (__zpl_clone_file_range(src_file, src_off,
165-
dst_file, dst_off, len));
160+
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
161+
dst_file, dst_off, len);
162+
163+
if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
164+
ret = -EINVAL;
165+
166+
return (ret);
166167
}
167168
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
168169

@@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
179180
if (len == 0)
180181
len = i_size_read(file_inode(src_file)) - src_off;
181182

182-
return (__zpl_clone_file_range(src_file, src_off,
183-
dst_file, dst_off, len));
183+
/* The entire length must be cloned or this is an error. */
184+
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
185+
dst_file, dst_off, len);
186+
187+
if (ret >= 0 && ret != len)
188+
ret = -EINVAL;
189+
190+
return (ret);
184191
}
185192
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
186193

@@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
214221

215222
size_t len = i_size_read(file_inode(src_file));
216223

217-
ssize_t ret =
218-
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
224+
ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
219225

220226
fput(src_file);
221227

@@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
253259
if (len == 0)
254260
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
255261

256-
ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
262+
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
257263
dst_file, fcr.fcr_dest_offset, len);
258264

259265
fput(src_file);

module/zfs/zfs_vnops.c

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,26 @@
5858
#include <sys/zfs_vfsops.h>
5959
#include <sys/zfs_znode.h>
6060

61+
/*
62+
* Enable the experimental block cloning feature. If this setting is 0, then
63+
* even if feature@block_cloning is enabled, attempts to clone blocks will act
64+
* as though the feature is disabled.
65+
*/
66+
int zfs_bclone_enabled = 1;
67+
68+
/*
69+
* When set zfs_clone_range() waits for dirty data to be written to disk.
70+
* This allows the clone operation to reliably succeed when a file is modified
71+
* and then immediately cloned. For small files this may be slower than making
72+
* a copy of the file and is therefore not the default. However, in certain
73+
* scenarios this behavior may be desirable so a tunable is provided.
74+
*/
75+
static int zfs_bclone_wait_dirty = 0;
76+
77+
/*
78+
* Maximum bytes to read per chunk in zfs_read().
79+
*/
80+
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
6181

6282
int
6383
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
@@ -182,8 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
182202
return (error);
183203
}
184204

185-
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
186-
187205
/*
188206
* Read bytes from specified file into supplied buffer.
189207
*
@@ -1049,6 +1067,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
10491067
size_t maxblocks, nbps;
10501068
uint_t inblksz;
10511069
uint64_t clear_setid_bits_txg = 0;
1070+
uint64_t last_synced_txg = 0;
10521071

10531072
inoff = *inoffp;
10541073
outoff = *outoffp;
@@ -1287,15 +1306,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
12871306
}
12881307

12891308
nbps = maxblocks;
1309+
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
12901310
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
12911311
&nbps);
12921312
if (error != 0) {
12931313
/*
12941314
* If we are trying to clone a block that was created
1295-
* in the current transaction group, error will be
1296-
* EAGAIN here, which we can just return to the caller
1297-
* so it can fallback if it likes.
1315+
* in the current transaction group, the error will be
1316+
* EAGAIN here. Based on zfs_bclone_wait_dirty either
1317+
* return a shortened range to the caller so it can
1318+
* fallback, or wait for the next TXG and check again.
12981319
*/
1320+
if (error == EAGAIN && zfs_bclone_wait_dirty) {
1321+
txg_wait_synced(dmu_objset_pool(inos),
1322+
last_synced_txg + 1);
1323+
continue;
1324+
}
1325+
12991326
break;
13001327
}
13011328

@@ -1517,3 +1544,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);
15171544

15181545
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
15191546
"Bytes to read per chunk");
1547+
1548+
ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
1549+
"Enable block cloning");
1550+
1551+
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
1552+
"Wait for dirty blocks when cloning");

tests/runfiles/common.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
631631
tags = ['functional', 'compression']
632632

633633
[tests/functional/cp_files]
634-
tests = ['cp_files_001_pos', 'cp_stress']
634+
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
635635
tags = ['functional', 'cp_files']
636636

637637
[tests/functional/crtime]

tests/test-runner/bin/zts-report.py.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'):
176176
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
177177
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
178178
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
179+
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
179180
'link_count/link_count_001': ['SKIP', na_reason],
180181
'casenorm/mixed_create_failure': ['FAIL', 13215],
181182
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
@@ -312,6 +313,7 @@ elif sys.platform.startswith('linux'):
312313
['SKIP', cfr_reason],
313314
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
314315
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
316+
'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
315317
'fault/auto_online_002_pos': ['FAIL', 11889],
316318
'fault/auto_replace_001_pos': ['FAIL', 14851],
317319
'fault/auto_spare_002_pos': ['FAIL', 11889],

tests/zfs-tests/include/tunables.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode
9494
VOL_RECURSIVE vol.recursive UNSUPPORTED
9595
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
9696
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
97+
BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty
9798
XATTR_COMPAT xattr_compat zfs_xattr_compat
9899
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
99100
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
13941394
functional/compression/setup.ksh \
13951395
functional/cp_files/cleanup.ksh \
13961396
functional/cp_files/cp_files_001_pos.ksh \
1397+
functional/cp_files/cp_files_002_pos.ksh \
13971398
functional/cp_files/cp_stress.ksh \
13981399
functional/cp_files/setup.ksh \
13991400
functional/crtime/cleanup.ksh \

0 commit comments

Comments
 (0)