Skip to content

Commit 0d783c8

Browse files
committed
ZFS send should use spill block prefetched from send_reader_thread
Currently, even though send_reader_thread prefetches the spill block, do_dump() does not use it and issues its own blocking arc_read. This causes significant performance degradation when sending datasets with lots of spill blocks. For unmodified spill blocks, we also create a send_range struct in send_reader_thread and issue prefetches for them. We piggyback them on the dnode send_range instead of enqueueing them so we don't break the send_range_after check. Signed-off-by: Chunwei Chen <[email protected]>
1 parent ae93aeb commit 0d783c8

File tree

1 file changed

+63
-61
lines changed

1 file changed

+63
-61
lines changed

module/zfs/dmu_send.c

Lines changed: 63 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@ struct send_range {
180180
*/
181181
dnode_phys_t *dnp;
182182
blkptr_t bp;
183+
/* Piggyback unmodified spill block */
184+
struct send_range *spill_range;
183185
} object;
184186
struct srr {
185187
uint32_t datablksz;
@@ -231,6 +233,8 @@ range_free(struct send_range *range)
231233
size_t size = sizeof (dnode_phys_t) *
232234
(range->sru.object.dnp->dn_extra_slots + 1);
233235
kmem_free(range->sru.object.dnp, size);
236+
if (range->sru.object.spill_range)
237+
range_free(range->sru.object.spill_range);
234238
} else if (range->type == DATA) {
235239
mutex_enter(&range->sru.data.lock);
236240
while (range->sru.data.io_outstanding)
@@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
617621
drrs->drr_length = blksz;
618622
drrs->drr_toguid = dscp->dsc_toguid;
619623

620-
/* See comment in dump_dnode() for full details */
624+
/* See comment in piggyback_unmodified_spill() for full details */
621625
if (zfs_send_unmodified_spill_blocks &&
622626
(BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
623627
drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
@@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
793797
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
794798
return (SET_ERROR(EINTR));
795799

796-
/*
797-
* Send DRR_SPILL records for unmodified spill blocks. This is useful
798-
* because changing certain attributes of the object (e.g. blocksize)
799-
* can cause old versions of ZFS to incorrectly remove a spill block.
800-
* Including these records in the stream forces an up to date version
801-
* to always be written ensuring they're never lost. Current versions
802-
* of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
803-
* ignore these unmodified spill blocks.
804-
*/
805-
if (zfs_send_unmodified_spill_blocks &&
806-
(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
807-
(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
808-
struct send_range record;
809-
blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
810-
811-
memset(&record, 0, sizeof (struct send_range));
812-
record.type = DATA;
813-
record.object = object;
814-
record.eos_marker = B_FALSE;
815-
record.start_blkid = DMU_SPILL_BLKID;
816-
record.end_blkid = record.start_blkid + 1;
817-
record.sru.data.bp = *bp;
818-
record.sru.data.obj_type = dnp->dn_type;
819-
record.sru.data.datablksz = BP_GET_LSIZE(bp);
820-
821-
if (do_dump(dscp, &record) != 0)
822-
return (SET_ERROR(EINTR));
823-
}
824-
825800
if (dscp->dsc_err != 0)
826801
return (SET_ERROR(EINTR));
827802

@@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
911886
case OBJECT:
912887
err = dump_dnode(dscp, &range->sru.object.bp, range->object,
913888
range->sru.object.dnp);
889+
/* Dump piggybacked unmodified spill block */
890+
if (!err && range->sru.object.spill_range)
891+
err = do_dump(dscp, range->sru.object.spill_range);
914892
return (err);
915893
case OBJECT_RANGE: {
916894
ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
@@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
939917

940918
ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
941919
ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
942-
if (BP_GET_TYPE(bp) == DMU_OT_SA) {
943-
arc_flags_t aflags = ARC_FLAG_WAIT;
944-
zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
945-
946-
if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
947-
ASSERT(BP_IS_PROTECTED(bp));
948-
zioflags |= ZIO_FLAG_RAW;
949-
}
950920

951-
zbookmark_phys_t zb;
952-
ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
953-
zb.zb_objset = dmu_objset_id(dscp->dsc_os);
954-
zb.zb_object = range->object;
955-
zb.zb_level = 0;
956-
zb.zb_blkid = range->start_blkid;
957-
958-
arc_buf_t *abuf = NULL;
959-
if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
960-
bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
961-
zioflags, &aflags, &zb) != 0)
962-
return (SET_ERROR(EIO));
963-
964-
err = dump_spill(dscp, bp, zb.zb_object,
965-
(abuf == NULL ? NULL : abuf->b_data));
966-
if (abuf != NULL)
967-
arc_buf_destroy(abuf, &abuf);
968-
return (err);
969-
}
970921
if (send_do_embed(bp, dscp->dsc_featureflags)) {
971922
err = dump_write_embedded(dscp, range->object,
972923
range->start_blkid * srdp->datablksz,
@@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
975926
}
976927
ASSERT(range->object > dscp->dsc_resume_object ||
977928
(range->object == dscp->dsc_resume_object &&
929+
(range->start_blkid == DMU_SPILL_BLKID ||
978930
range->start_blkid * srdp->datablksz >=
979-
dscp->dsc_resume_offset));
931+
dscp->dsc_resume_offset)));
980932
/* it's a level-0 block of a regular object */
981933

982934
mutex_enter(&srdp->lock);
@@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
1006958
ASSERT(dscp->dsc_dso->dso_dryrun ||
1007959
srdp->abuf != NULL || srdp->abd != NULL);
1008960

1009-
uint64_t offset = range->start_blkid * srdp->datablksz;
1010-
1011961
char *data = NULL;
1012962
if (srdp->abd != NULL) {
1013963
data = abd_to_buf(srdp->abd);
@@ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
1016966
data = srdp->abuf->b_data;
1017967
}
1018968

969+
if (BP_GET_TYPE(bp) == DMU_OT_SA) {
970+
ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
971+
err = dump_spill(dscp, bp, range->object, data);
972+
return (err);
973+
}
974+
975+
uint64_t offset = range->start_blkid * srdp->datablksz;
976+
1019977
/*
1020978
* If we have large blocks stored on disk but the send flags
1021979
* don't allow us to send large blocks, we split the data from
@@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
10981056
range->sru.data.io_outstanding = 0;
10991057
range->sru.data.io_err = 0;
11001058
range->sru.data.io_compressed = B_FALSE;
1059+
} else if (type == OBJECT) {
1060+
range->sru.object.spill_range = NULL;
11011061
}
11021062
return (range);
11031063
}
@@ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
17421702
bqueue_enqueue(q, range, datablksz);
17431703
}
17441704

1705+
/*
1706+
* Send DRR_SPILL records for unmodified spill blocks. This is useful
1707+
* because changing certain attributes of the object (e.g. blocksize)
1708+
* can cause old versions of ZFS to incorrectly remove a spill block.
1709+
* Including these records in the stream forces an up to date version
1710+
* to always be written ensuring they're never lost. Current versions
1711+
* of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
1712+
* ignore these unmodified spill blocks.
*
* When 'range' is an OBJECT range whose dnode has DNODE_FLAG_SPILL_BLKPTR
* set and whose spill block pointer has a logical birth <= fromtxg, this
* allocates a DATA range covering DMU_SPILL_BLKID, hands it to
* issue_data_read(), and stores it in range->sru.object.spill_range
* instead of enqueueing it as a separate range.
*
* Returns BP_GET_LSIZE() of the spill block pointer when a spill range was
* attached, or 0 when nothing was attached.
1713+
*/
1714+
static uint64_t
1715+
piggyback_unmodified_spill(struct send_reader_thread_arg *srta,
1716+
struct send_range *range)
1717+
{
1718+
if (range->type != OBJECT ||
1719+
!zfs_send_unmodified_spill_blocks)
1720+
return (0); /* not an OBJECT range, or zfs_send_unmodified_spill_blocks is unset */
1721+
1722+
struct send_range *spill_range = NULL;
1723+
dnode_phys_t *dnp = range->sru.object.dnp;
1724+
uint64_t fromtxg = srta->smta->to_arg->fromtxg;
1725+
1726+
if (!(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ||
1727+
!(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg))
1728+
return (0); /* no spill block pointer, or it was born after fromtxg */
1729+
1730+
blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
1731+
1732+
spill_range = range_alloc(DATA, range->object, DMU_SPILL_BLKID,
1733+
DMU_SPILL_BLKID+1, B_FALSE); /* single-block DATA range, not an eos marker */
1734+
spill_range->sru.data.bp = *bp; /* copy the block pointer by value */
1735+
spill_range->sru.data.obj_type = dnp->dn_type;
1736+
spill_range->sru.data.datablksz = BP_GET_LSIZE(bp);
1737+
1738+
issue_data_read(srta, spill_range); /* NOTE(review): presumably starts the prefetch of the spill block; confirm */
1739+
range->sru.object.spill_range = spill_range; /* attached to the OBJECT range rather than enqueued */
1740+
1741+
return (BP_GET_LSIZE(bp)); /* non-zero only when a spill range was attached */
1742+
}
1743+
17451744
/*
17461745
* This thread is responsible for two things: First, it retrieves the correct
17471746
* blkptr in the to ds if we need to send the data because of something from
@@ -1760,6 +1759,7 @@ send_reader_thread(void *arg)
17601759
fstrans_cookie_t cookie = spl_fstrans_mark();
17611760
struct send_range *range = bqueue_dequeue(inq);
17621761
int err = 0;
1762+
uint64_t spill = 0;
17631763

17641764
/*
17651765
* If the record we're analyzing is from a redaction bookmark from the
@@ -1783,7 +1783,9 @@ send_reader_thread(void *arg)
17831783
case OBJECT:
17841784
case OBJECT_RANGE:
17851785
case REDACT: // Redacted blocks must exist
1786-
bqueue_enqueue(outq, range, sizeof (*range));
1786+
/* For OBJECT */
1787+
spill = piggyback_unmodified_spill(srta, range);
1788+
bqueue_enqueue(outq, range, sizeof (*range) + spill);
17871789
range = get_next_range_nofree(inq, range);
17881790
break;
17891791
case PREVIOUSLY_REDACTED: {

0 commit comments

Comments
 (0)