Skip to content

Commit 45cff1d

Browse files
pcd1193182kernelOfTruth
authored and committed
Illumos 6393 zfs receive a full send as a clone
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Richard Elling <[email protected]>
Approved by: Dan McDonald <[email protected]>

References:
https://www.illumos.org/issues/6393
illumos/illumos-gate@68ecb2e

Diverged code base from Illumos:

[module/zfs/dmu_send.c]
fcff0f3 Illumos 5960, 5925 (due to ISO C90 & braces around #ifdef ZFS_DEBUG; now equal to upstream again)

ISO C90 - mixed declarations and code:
for (struct receive_objnode *n = list_remove_head(&list->list);
struct receive_objnode *last_object;

Reorder code in function 'objlist_insert' around the #ifdef ZFS_DEBUG statement to account for unused variable & ISO C90 warnings.

Remove 'struct receive_ign_obj_node *n;' from Illumos 5960, 5925, which is no longer needed (ISO C90).

Ported-by: kernelOfTruth [email protected]
1 parent 8d0c7dd commit 45cff1d

File tree

4 files changed

+125
-60
lines changed

4 files changed

+125
-60
lines changed

include/sys/dmu_impl.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
*/
2525
/*
2626
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
27-
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
27+
* Copyright (c) 2013, 2015 by Delphix. All rights reserved.
2828
*/
2929

3030
#ifndef _SYS_DMU_IMPL_H
@@ -268,7 +268,6 @@ typedef struct dmu_sendarg {
268268
uint64_t dsa_toguid;
269269
int dsa_err;
270270
dmu_pendop_t dsa_pending_op;
271-
boolean_t dsa_incremental;
272271
uint64_t dsa_featureflags;
273272
uint64_t dsa_last_data_object;
274273
uint64_t dsa_last_data_offset;

include/sys/zfs_ioctl.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2424
*/
2525

2626
#ifndef _SYS_ZFS_IOCTL_H
@@ -137,6 +137,16 @@ typedef enum dmu_send_resume_token_version {
137137

138138
#define DRR_FLAG_CLONE (1<<0)
139139
#define DRR_FLAG_CI_DATA (1<<1)
140+
/*
141+
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
142+
* records that are created by the sending process. This means that the send
143+
* stream can be received as a clone, even though it is not an incremental.
144+
* This is not implemented as a feature flag, because the receiving side does
145+
* not need to have implemented it to receive this stream; it is fully backwards
146+
* compatible. We need a flag, though, because full send streams without it
147+
* cannot necessarily be received as a clone correctly.
148+
*/
149+
#define DRR_FLAG_FREERECORDS (1<<2)
140150

141151
/*
142152
* flags in the drr_checksumflags field in the DRR_WRITE and

man/man8/zfs.8

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
.\"
2323
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
2424
.\" Copyright 2011 Joshua M. Clulow <[email protected]>
25-
.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25+
.\" Copyright (c) 2011, 2015 by Delphix. All rights reserved.
2626
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
2727
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
2828
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@@ -3067,7 +3067,7 @@ Discard all but the last element of the sent snapshot's file system name, using
30673067
.ad
30683068
.sp .6
30693069
.RS 4n
3070-
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
3070+
Forces the stream to be received as a clone of the given snapshot. If the stream is a full send stream, this will create the filesystem described by the stream as a clone of the specified snapshot. The choice of snapshot does not affect whether the receive succeeds or fails, as long as the snapshot exists. If the stream is an incremental send stream, all the normal verification will be performed.
30713071
.RE
30723072

30733073
.RE

module/zfs/dmu_send.c

Lines changed: 111 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
169169
return (0);
170170
}
171171

172+
/*
173+
* Fill in the drr_free struct, or perform aggregation if the previous record is
174+
* also a free record, and the two are adjacent.
175+
*
176+
* Note that we send free records even for a full send, because we want to be
177+
* able to receive a full send as a clone, which requires a list of all the free
178+
* and freeobject records that were generated on the source.
179+
*/
172180
static int
173181
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
174182
uint64_t length)
@@ -192,15 +200,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
192200
(object == dsp->dsa_last_data_object &&
193201
offset > dsp->dsa_last_data_offset));
194202

195-
/*
196-
* If we are doing a non-incremental send, then there can't
197-
* be any data in the dataset we're receiving into. Therefore
198-
* a free record would simply be a no-op. Save space by not
199-
* sending it to begin with.
200-
*/
201-
if (!dsp->dsa_incremental)
202-
return (0);
203-
204203
if (length != -1ULL && offset + length < offset)
205204
length = -1ULL;
206205

@@ -378,10 +377,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
378377
{
379378
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
380379

381-
/* See comment in dump_free(). */
382-
if (!dsp->dsa_incremental)
383-
return (0);
384-
385380
/*
386381
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
387382
* push it out, since free block aggregation can only be done for
@@ -787,6 +782,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
787782
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
788783
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
789784
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
785+
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
790786

791787
if (ancestor_zb != NULL) {
792788
drr->drr_u.drr_begin.drr_fromguid =
@@ -809,7 +805,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
809805
dsp->dsa_off = off;
810806
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
811807
dsp->dsa_pending_op = PENDING_NONE;
812-
dsp->dsa_incremental = (ancestor_zb != NULL);
813808
dsp->dsa_featureflags = featureflags;
814809
dsp->dsa_resume_object = resumeobj;
815810
dsp->dsa_resume_offset = resumeoff;
@@ -1319,7 +1314,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
13191314
/* target fs already exists; recv into temp clone */
13201315

13211316
/* Can't recv a clone into an existing fs */
1322-
if (flags & DRR_FLAG_CLONE) {
1317+
if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
13231318
dsl_dataset_rele(ds, FTAG);
13241319
return (SET_ERROR(EINVAL));
13251320
}
@@ -1338,6 +1333,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
13381333
drba->drba_origin))
13391334
return (SET_ERROR(ENOENT));
13401335

1336+
/*
1337+
* If we're receiving a full send as a clone, and it doesn't
1338+
* contain all the necessary free records and freeobject
1339+
* records, reject it.
1340+
*/
1341+
if (fromguid == 0 && drba->drba_origin &&
1342+
!(flags & DRR_FLAG_FREERECORDS))
1343+
return (SET_ERROR(EINVAL));
1344+
13411345
/* Open the parent of tofs */
13421346
ASSERT3U(strlen(tofs), <, MAXNAMELEN);
13431347
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@@ -1377,7 +1381,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
13771381
dsl_dataset_rele(ds, FTAG);
13781382
return (SET_ERROR(EINVAL));
13791383
}
1380-
if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
1384+
if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
1385+
fromguid != 0) {
13811386
dsl_dataset_rele(origin, FTAG);
13821387
dsl_dataset_rele(ds, FTAG);
13831388
return (SET_ERROR(ENODEV));
@@ -1706,6 +1711,20 @@ struct receive_writer_arg {
17061711
uint64_t bytes_read; /* bytes read when current record created */
17071712
};
17081713

1714+
/*
 * A list of object numbers together with the most recent object number that
 * was looked up in it. Entries are struct receive_objnode.
 */
struct objlist {
1715+
list_t list; /* Entries are struct receive_objnode, linked via their node member. */
1716+
/*
1717+
* Most recent object number passed to a lookup. Used to assert that
1718+
* lookups are performed in ascending object-number order.
1719+
*/
1720+
uint64_t last_lookup;
1721+
};
1722+
1723+
/* One entry of an objlist: a single object number plus its list linkage. */
struct receive_objnode {
1724+
list_node_t node; /* Linkage into the objlist's list_t. */
1725+
uint64_t object; /* Object number recorded by this entry. */
1726+
};
1727+
17091728
struct receive_arg {
17101729
objset_t *os;
17111730
vnode_t *vp; /* The vnode to read the stream from */
@@ -1723,12 +1742,7 @@ struct receive_arg {
17231742
int err;
17241743
boolean_t byteswap;
17251744
/* Sorted list of objects not to issue prefetches for. */
1726-
list_t ignore_obj_list;
1727-
};
1728-
1729-
struct receive_ign_obj_node {
1730-
list_node_t node;
1731-
uint64_t object;
1745+
struct objlist ignore_objlist;
17321746
};
17331747

17341748
typedef struct guid_map_entry {
@@ -2042,13 +2056,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
20422056
struct drr_freeobjects *drrfo)
20432057
{
20442058
uint64_t obj;
2059+
int next_err = 0;
20452060

20462061
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
20472062
return (SET_ERROR(EINVAL));
20482063

20492064
for (obj = drrfo->drr_firstobj;
2050-
obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
2051-
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
2065+
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
2066+
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
20522067
int err;
20532068

20542069
if (dmu_object_info(rwa->os, obj, NULL) != 0)
@@ -2058,7 +2073,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
20582073
if (err != 0)
20592074
return (err);
20602075
}
2061-
2076+
if (next_err != ESRCH)
2077+
return (next_err);
20622078
return (0);
20632079
}
20642080

@@ -2389,6 +2405,72 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
23892405
return (0);
23902406
}
23912407

2408+
static void
2409+
objlist_create(struct objlist *list)
2410+
{
2411+
list_create(&list->list, sizeof (struct receive_objnode),
2412+
offsetof(struct receive_objnode, node));
2413+
list->last_lookup = 0;
2414+
}
2415+
2416+
static void
2417+
objlist_destroy(struct objlist *list)
2418+
{
2419+
struct receive_objnode *n;
2420+
2421+
for (n = list_remove_head(&list->list);
2422+
n != NULL; n = list_remove_head(&list->list)) {
2423+
kmem_free(n, sizeof (*n));
2424+
}
2425+
list_destroy(&list->list);
2426+
}
2427+
2428+
/*
2429+
* This function looks through the objlist to see if the specified object number
2430+
* is contained in the objlist. In the process, it will remove all object
2431+
* numbers in the list that are smaller than the specified object number. Thus,
2432+
* any lookup of an object number smaller than a previously looked up object
2433+
* number will always return false; therefore, all lookups should be done in
2434+
* ascending order.
2435+
*/
2436+
static boolean_t
2437+
objlist_exists(struct objlist *list, uint64_t object)
2438+
{
2439+
struct receive_objnode *node = list_head(&list->list);
2440+
ASSERT3U(object, >=, list->last_lookup);
2441+
list->last_lookup = object;
2442+
while (node != NULL && node->object < object) {
2443+
VERIFY3P(node, ==, list_remove_head(&list->list));
2444+
kmem_free(node, sizeof (*node));
2445+
node = list_head(&list->list);
2446+
}
2447+
return (node != NULL && node->object == object);
2448+
}
2449+
2450+
/*
2451+
* The objlist is a list of object numbers stored in ascending order. However,
2452+
* the insertion of new object numbers does not seek out the correct location to
2453+
* store a new object number; instead, it appends it to the list for simplicity.
2454+
* Thus, any users must take care to only insert new object numbers in ascending
2455+
* order.
2456+
*/
2457+
static void
2458+
objlist_insert(struct objlist *list, uint64_t object)
2459+
{
2460+
struct receive_objnode *node;
2461+
node = kmem_zalloc(sizeof (node), KM_SLEEP);
2462+
node->object = object;
2463+
#ifdef ZFS_DEBUG
2464+
struct receive_objnode *last_object;
2465+
uint64_t last_objnum;
2466+
2467+
last_object = list_tail(&list->list);
2468+
last_objnum = (last_object != NULL ? last_object->object : 0);
2469+
ASSERT3U(node->object, >, last_objnum);
2470+
#endif
2471+
list_insert_tail(&list->list, node);
2472+
}
2473+
23922474
/*
23932475
* Issue the prefetch reads for any necessary indirect blocks.
23942476
*
@@ -2411,13 +2493,7 @@ static void
24112493
receive_read_prefetch(struct receive_arg *ra,
24122494
uint64_t object, uint64_t offset, uint64_t length)
24132495
{
2414-
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
2415-
while (node != NULL && node->object < object) {
2416-
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
2417-
kmem_free(node, sizeof (*node));
2418-
node = list_head(&ra->ignore_obj_list);
2419-
}
2420-
if (node == NULL || node->object > object) {
2496+
if (!objlist_exists(&ra->ignore_objlist, object)) {
24212497
dmu_prefetch(ra->os, object, 1, offset, length,
24222498
ZIO_PRIORITY_SYNC_READ);
24232499
}
@@ -2450,20 +2526,7 @@ receive_read_record(struct receive_arg *ra)
24502526
*/
24512527
if (err == ENOENT ||
24522528
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
2453-
struct receive_ign_obj_node *node =
2454-
kmem_zalloc(sizeof (*node),
2455-
KM_SLEEP);
2456-
node->object = drro->drr_object;
2457-
#ifdef ZFS_DEBUG
2458-
{
2459-
struct receive_ign_obj_node *last_object =
2460-
list_tail(&ra->ignore_obj_list);
2461-
uint64_t last_objnum = (last_object != NULL ?
2462-
last_object->object : 0);
2463-
ASSERT3U(node->object, >, last_objnum);
2464-
}
2465-
#endif
2466-
list_insert_tail(&ra->ignore_obj_list, node);
2529+
objlist_insert(&ra->ignore_objlist, drro->drr_object);
24672530
err = 0;
24682531
}
24692532
return (err);
@@ -2680,7 +2743,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
26802743
return (0);
26812744
}
26822745

2683-
26842746
/*
26852747
* Read in the stream's records, one by one, and apply them to the pool. There
26862748
* are two threads involved; the thread that calls this function will spin up a
@@ -2701,7 +2763,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
27012763
struct receive_arg ra = { 0 };
27022764
struct receive_writer_arg rwa = { 0 };
27032765
int featureflags;
2704-
struct receive_ign_obj_node *n;
27052766
uint32_t payloadlen;
27062767
void *payload;
27072768
nvlist_t *begin_nvl = NULL;
@@ -2717,8 +2778,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
27172778
sizeof (ra.bytes_read), 1, &ra.bytes_read);
27182779
}
27192780

2720-
list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
2721-
offsetof(struct receive_ign_obj_node, node));
2781+
objlist_create(&ra.ignore_objlist);
27222782

27232783
/* these were verified in dmu_recv_begin */
27242784
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -2873,11 +2933,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
28732933

28742934
*voffp = ra.voff;
28752935

2876-
for (n = list_remove_head(&ra.ignore_obj_list); n != NULL;
2877-
n = list_remove_head(&ra.ignore_obj_list)) {
2878-
kmem_free(n, sizeof (*n));
2879-
}
2880-
list_destroy(&ra.ignore_obj_list);
2936+
objlist_destroy(&ra.ignore_objlist);
28812937
return (err);
28822938
}
28832939

0 commit comments

Comments
 (0)