Skip to content

Commit e6d3a84

Browse files
pcd1193182 authored and behlendorf committed
OpenZFS 6393 - zfs receive a full send as a clone
Authored by: Paul Dagnelie <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Approved by: Dan McDonald <[email protected]>
Ported-by: Brian Behlendorf <[email protected]>
OpenZFS-issue: https://www.illumos.org/issues/6394
OpenZFS-commit: openzfs/openzfs@68ecb2e
1 parent fd41e93 commit e6d3a84

File tree

5 files changed

+131
-64
lines changed

5 files changed

+131
-64
lines changed

include/sys/dmu_impl.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
*/
2525
/*
2626
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
27-
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
27+
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
2828
*/
2929

3030
#ifndef _SYS_DMU_IMPL_H
@@ -268,7 +268,6 @@ typedef struct dmu_sendarg {
268268
uint64_t dsa_toguid;
269269
int dsa_err;
270270
dmu_pendop_t dsa_pending_op;
271-
boolean_t dsa_incremental;
272271
uint64_t dsa_featureflags;
273272
uint64_t dsa_last_data_object;
274273
uint64_t dsa_last_data_offset;

include/sys/zfs_ioctl.h

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2424
*/
2525

2626
#ifndef _SYS_ZFS_IOCTL_H
@@ -138,6 +138,16 @@ typedef enum dmu_send_resume_token_version {
138138

139139
#define DRR_FLAG_CLONE (1<<0)
140140
#define DRR_FLAG_CI_DATA (1<<1)
141+
/*
142+
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
143+
* records that are created by the sending process. This means that the send
144+
* stream can be received as a clone, even though it is not an incremental.
145+
* This is not implemented as a feature flag, because the receiving side does
146+
* not need to have implemented it to receive this stream; it is fully backwards
147+
* compatible. We need a flag, though, because full send streams without it
148+
* cannot necessarily be received as a clone correctly.
149+
*/
150+
#define DRR_FLAG_FREERECORDS (1<<2)
141151

142152
/*
143153
* flags in the drr_checksumflags field in the DRR_WRITE and

man/man8/zfs.8

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
.\"
2323
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
2424
.\" Copyright 2011 Joshua M. Clulow <[email protected]>
25-
.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25+
.\" Copyright (c) 2011, 2015 by Delphix. All rights reserved.
2626
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
2727
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
2828
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@@ -2991,7 +2991,12 @@ Discard all but the last element of the sent snapshot's file system name, using
29912991
.ad
29922992
.sp .6
29932993
.RS 4n
2994-
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
2994+
Forces the stream to be received as a clone of the given snapshot.
2995+
If the stream is a full send stream, this will create the filesystem
2996+
described by the stream as a clone of the specified snapshot. Which
2997+
snapshot was specified will not affect the success or failure of the
2998+
receive, as long as the snapshot does exist. If the stream is an
2999+
incremental send stream, all the normal verification will be performed.
29953000
.RE
29963001

29973002
.RE

module/zfs/dmu_send.c

Lines changed: 110 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,10 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
2423
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24+
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
2525
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
2626
* Copyright 2014 HybridCluster. All rights reserved.
27-
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
2827
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
2928
*/
3029

@@ -173,6 +172,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
173172
return (0);
174173
}
175174

175+
/*
176+
* Fill in the drr_free struct, or perform aggregation if the previous record is
177+
* also a free record, and the two are adjacent.
178+
*
179+
* Note that we send free records even for a full send, because we want to be
180+
* able to receive a full send as a clone, which requires a list of all the free
181+
* and freeobject records that were generated on the source.
182+
*/
176183
static int
177184
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
178185
uint64_t length)
@@ -196,15 +203,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
196203
(object == dsp->dsa_last_data_object &&
197204
offset > dsp->dsa_last_data_offset));
198205

199-
/*
200-
* If we are doing a non-incremental send, then there can't
201-
* be any data in the dataset we're receiving into. Therefore
202-
* a free record would simply be a no-op. Save space by not
203-
* sending it to begin with.
204-
*/
205-
if (!dsp->dsa_incremental)
206-
return (0);
207-
208206
if (length != -1ULL && offset + length < offset)
209207
length = -1ULL;
210208

@@ -382,10 +380,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
382380
{
383381
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
384382

385-
/* See comment in dump_free(). */
386-
if (!dsp->dsa_incremental)
387-
return (0);
388-
389383
/*
390384
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
391385
* push it out, since free block aggregation can only be done for
@@ -796,6 +790,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
796790
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
797791
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
798792
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
793+
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
799794

800795
if (ancestor_zb != NULL) {
801796
drr->drr_u.drr_begin.drr_fromguid =
@@ -818,7 +813,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
818813
dsp->dsa_off = off;
819814
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
820815
dsp->dsa_pending_op = PENDING_NONE;
821-
dsp->dsa_incremental = (ancestor_zb != NULL);
822816
dsp->dsa_featureflags = featureflags;
823817
dsp->dsa_resume_object = resumeobj;
824818
dsp->dsa_resume_offset = resumeoff;
@@ -1336,7 +1330,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
13361330
/* target fs already exists; recv into temp clone */
13371331

13381332
/* Can't recv a clone into an existing fs */
1339-
if (flags & DRR_FLAG_CLONE) {
1333+
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
13401334
dsl_dataset_rele(ds, FTAG);
13411335
return (SET_ERROR(EINVAL));
13421336
}
@@ -1355,6 +1349,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
13551349
drba->drba_origin))
13561350
return (SET_ERROR(ENOENT));
13571351

1352+
/*
1353+
* If we're receiving a full send as a clone, and it doesn't
1354+
* contain all the necessary free records and freeobject
1355+
* records, reject it.
1356+
*/
1357+
if (fromguid == 0 && drba->drba_origin &&
1358+
!(flags & DRR_FLAG_FREERECORDS))
1359+
return (SET_ERROR(EINVAL));
1360+
13581361
/* Open the parent of tofs */
13591362
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
13601363
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@@ -1394,7 +1397,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
13941397
dsl_dataset_rele(ds, FTAG);
13951398
return (SET_ERROR(EINVAL));
13961399
}
1397-
if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
1400+
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
1401+
fromguid != 0) {
13981402
dsl_dataset_rele(origin, FTAG);
13991403
dsl_dataset_rele(ds, FTAG);
14001404
return (SET_ERROR(ENODEV));
@@ -1724,6 +1728,20 @@ struct receive_writer_arg {
17241728
uint64_t bytes_read; /* bytes read when current record created */
17251729
};
17261730

1731+
struct objlist {
1732+
list_t list; /* List of struct receive_objnode. */
1733+
/*
1734+
* Last object looked up. Used to assert that objects are being looked
1735+
* up in ascending order.
1736+
*/
1737+
uint64_t last_lookup;
1738+
};
1739+
1740+
struct receive_objnode {
1741+
list_node_t node;
1742+
uint64_t object;
1743+
};
1744+
17271745
struct receive_arg {
17281746
objset_t *os;
17291747
vnode_t *vp; /* The vnode to read the stream from */
@@ -1741,12 +1759,7 @@ struct receive_arg {
17411759
int err;
17421760
boolean_t byteswap;
17431761
/* Sorted list of objects not to issue prefetches for. */
1744-
list_t ignore_obj_list;
1745-
};
1746-
1747-
struct receive_ign_obj_node {
1748-
list_node_t node;
1749-
uint64_t object;
1762+
struct objlist ignore_objlist;
17501763
};
17511764

17521765
typedef struct guid_map_entry {
@@ -2063,13 +2076,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
20632076
struct drr_freeobjects *drrfo)
20642077
{
20652078
uint64_t obj;
2079+
int next_err = 0;
20662080

20672081
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
20682082
return (SET_ERROR(EINVAL));
20692083

20702084
for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
2071-
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
2072-
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
2085+
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
2086+
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
20732087
dmu_object_info_t doi;
20742088
int err;
20752089

@@ -2085,7 +2099,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
20852099
if (err != 0)
20862100
return (err);
20872101
}
2088-
2102+
if (next_err != ESRCH)
2103+
return (next_err);
20892104
return (0);
20902105
}
20912106

@@ -2415,6 +2430,70 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
24152430
return (0);
24162431
}
24172432

2433+
static void
2434+
objlist_create(struct objlist *list)
2435+
{
2436+
list_create(&list->list, sizeof (struct receive_objnode),
2437+
offsetof(struct receive_objnode, node));
2438+
list->last_lookup = 0;
2439+
}
2440+
2441+
static void
2442+
objlist_destroy(struct objlist *list)
2443+
{
2444+
struct receive_objnode *n;
2445+
2446+
for (n = list_remove_head(&list->list);
2447+
n != NULL; n = list_remove_head(&list->list)) {
2448+
kmem_free(n, sizeof (*n));
2449+
}
2450+
list_destroy(&list->list);
2451+
}
2452+
2453+
/*
2454+
* This function looks through the objlist to see if the specified object number
2455+
* is contained in the objlist. In the process, it will remove all object
2456+
* numbers in the list that are smaller than the specified object number. Thus,
2457+
* any lookup of an object number smaller than a previously looked up object
2458+
* number will always return false; therefore, all lookups should be done in
2459+
* ascending order.
2460+
*/
2461+
static boolean_t
2462+
objlist_exists(struct objlist *list, uint64_t object)
2463+
{
2464+
struct receive_objnode *node = list_head(&list->list);
2465+
ASSERT3U(object, >=, list->last_lookup);
2466+
list->last_lookup = object;
2467+
while (node != NULL && node->object < object) {
2468+
VERIFY3P(node, ==, list_remove_head(&list->list));
2469+
kmem_free(node, sizeof (*node));
2470+
node = list_head(&list->list);
2471+
}
2472+
return (node != NULL && node->object == object);
2473+
}
2474+
2475+
/*
2476+
* The objlist is a list of object numbers stored in ascending order. However,
2477+
* the insertion of new object numbers does not seek out the correct location to
2478+
* store a new object number; instead, it appends it to the list for simplicity.
2479+
* Thus, any users must take care to only insert new object numbers in ascending
2480+
* order.
2481+
*/
2482+
static void
2483+
objlist_insert(struct objlist *list, uint64_t object)
2484+
{
2485+
struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
2486+
node->object = object;
2487+
#ifdef ZFS_DEBUG
2488+
{
2489+
struct receive_objnode *last_object = list_tail(&list->list);
2490+
uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
2491+
ASSERT3U(node->object, >, last_objnum);
2492+
}
2493+
#endif
2494+
list_insert_tail(&list->list, node);
2495+
}
2496+
24182497
/*
24192498
* Issue the prefetch reads for any necessary indirect blocks.
24202499
*
@@ -2437,13 +2516,7 @@ static void
24372516
receive_read_prefetch(struct receive_arg *ra,
24382517
uint64_t object, uint64_t offset, uint64_t length)
24392518
{
2440-
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
2441-
while (node != NULL && node->object < object) {
2442-
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
2443-
kmem_free(node, sizeof (*node));
2444-
node = list_head(&ra->ignore_obj_list);
2445-
}
2446-
if (node == NULL || node->object > object) {
2519+
if (!objlist_exists(&ra->ignore_objlist, object)) {
24472520
dmu_prefetch(ra->os, object, 1, offset, length,
24482521
ZIO_PRIORITY_SYNC_READ);
24492522
}
@@ -2476,20 +2549,7 @@ receive_read_record(struct receive_arg *ra)
24762549
*/
24772550
if (err == ENOENT ||
24782551
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
2479-
struct receive_ign_obj_node *node =
2480-
kmem_zalloc(sizeof (*node),
2481-
KM_SLEEP);
2482-
node->object = drro->drr_object;
2483-
#ifdef ZFS_DEBUG
2484-
{
2485-
struct receive_ign_obj_node *last_object =
2486-
list_tail(&ra->ignore_obj_list);
2487-
uint64_t last_objnum = (last_object != NULL ?
2488-
last_object->object : 0);
2489-
ASSERT3U(node->object, >, last_objnum);
2490-
}
2491-
#endif
2492-
list_insert_tail(&ra->ignore_obj_list, node);
2552+
objlist_insert(&ra->ignore_objlist, drro->drr_object);
24932553
err = 0;
24942554
}
24952555
return (err);
@@ -2706,7 +2766,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
27062766
return (0);
27072767
}
27082768

2709-
27102769
/*
27112770
* Read in the stream's records, one by one, and apply them to the pool. There
27122771
* are two threads involved; the thread that calls this function will spin up a
@@ -2727,7 +2786,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
27272786
struct receive_arg *ra;
27282787
struct receive_writer_arg *rwa;
27292788
int featureflags;
2730-
struct receive_ign_obj_node *n;
27312789
uint32_t payloadlen;
27322790
void *payload;
27332791
nvlist_t *begin_nvl = NULL;
@@ -2746,8 +2804,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
27462804
sizeof (ra->bytes_read), 1, &ra->bytes_read);
27472805
}
27482806

2749-
list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node),
2750-
offsetof(struct receive_ign_obj_node, node));
2807+
objlist_create(&ra->ignore_objlist);
27512808

27522809
/* these were verified in dmu_recv_begin */
27532810
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2901,12 +2958,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
29012958
}
29022959

29032960
*voffp = ra->voff;
2904-
2905-
for (n = list_remove_head(&ra->ignore_obj_list); n != NULL;
2906-
n = list_remove_head(&ra->ignore_obj_list)) {
2907-
kmem_free(n, sizeof (*n));
2908-
}
2909-
list_destroy(&ra->ignore_obj_list);
2961+
objlist_destroy(&ra->ignore_objlist);
29102962
kmem_free(ra, sizeof (*ra));
29112963
kmem_free(rwa, sizeof (*rwa));
29122964
return (err);

tests/runfiles/linux.run

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -152,7 +152,8 @@ tests = []
152152
[tests/functional/cli_root/zfs_receive]
153153
tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
154154
'zfs_receive_005_neg', 'zfs_receive_006_pos',
155-
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg']
155+
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
156+
'zfs_receive_010_pos']
156157

157158
# DISABLED:
158159
# zfs_rename_002_pos - needs investigation

0 commit comments

Comments
 (0)