Skip to content

Commit e3cfa1f

Browse files
committed
fix prefetching of indirect blocks while destroying
When traversing a tree of block pointers (e.g. for `zfs destroy <fs>` or `zfs send`), we prefetch the indirect blocks that will be needed, in `traverse_prefetch_metadata()`. In the case of `zfs destroy <fs>`, we do a little traversing each txg, and resume the traversal the next txg. So the indirect blocks that will be needed, and thus are candidates for prefetching, does not include blocks that are before the resume point. The problem is that the logic for determining if the indirect blocks are before the resume point is incorrect, causing the (up to 1024) L1 indirect blocks that are inside the first L2 to not be prefetched. In practice, if we are able to read many more than 1024 blocks per txg, then this will be inconsequential. But if i/o latency is more than a few milliseconds, almost no L1's will be prefetched, so they will be read serially, and thus the destroying will be very slow. This can be observed as `zpool get freeing` decreasing very slowly. Specifically: When we first examine the L2 that contains the block we'll be resuming from, we have not yet resumed, so `td_resume` is nonzero. At this point, all calls to `traverse_prefetch_metadata()` will fail, even if the L1 in question is after the resume point. It isn't until the callback is issued for the resume point that we zero out `td_resume`, but by this point we've already attempted and failed to prefetch everything under this L2 indirect block. This commit addresses the issue by reusing the existing `resume_skip_check()` to determine if the L1's bookmark is before or after the resume point. To do so, this function is made non-mutating (the caller now zeros `td_resume`). Note, this bug likely predates (was not introduced by) #11803. Signed-off-by: Matthew Ahrens <[email protected]>
1 parent 5f42d1d commit e3cfa1f

File tree

1 file changed

+14
-14
lines changed

1 file changed

+14
-14
lines changed

module/zfs/dmu_traverse.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ typedef enum resume_skip {
154154
* Otherwise returns RESUME_SKIP_NONE.
155155
*/
156156
static resume_skip_t
157-
resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
157+
resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
158158
const zbookmark_phys_t *zb)
159159
{
160160
if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
@@ -165,12 +165,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
165165
if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
166166
return (RESUME_SKIP_ALL);
167167

168-
/*
169-
* If we found the block we're trying to resume from, zero
170-
* the bookmark out to indicate that we have resumed.
171-
*/
172168
if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
173-
memset(td->td_resume, 0, sizeof (*zb));
174169
if (td->td_flags & TRAVERSE_POST)
175170
return (RESUME_SKIP_CHILDREN);
176171
}
@@ -182,7 +177,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
182177
* Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE.
183178
*/
184179
static boolean_t
185-
traverse_prefetch_metadata(traverse_data_t *td,
180+
traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
186181
const blkptr_t *bp, const zbookmark_phys_t *zb)
187182
{
188183
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
@@ -192,11 +187,10 @@ traverse_prefetch_metadata(traverse_data_t *td,
192187
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
193188
return (B_FALSE);
194189
/*
195-
* If we are in the process of resuming, don't prefetch, because
196-
* some children will not be needed (and in fact may have already
197-
* been freed).
190+
* If this bp is before the resume point, it may have already been
191+
* freed.
198192
*/
199-
if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
193+
if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
200194
return (B_FALSE);
201195
if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
202196
return (B_FALSE);
@@ -241,6 +235,12 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
241235
ASSERT(0);
242236
}
243237

238+
if (td->td_resume != NULL &&
239+
memcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
240+
/* found the resume point; no longer resuming */
241+
memset(td->td_resume, 0, sizeof (*zb));
242+
}
243+
244244
if (bp->blk_birth == 0) {
245245
/*
246246
* Since this block has a birth time of 0 it must be one of
@@ -344,7 +344,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
344344
SET_BOOKMARK(czb, zb->zb_objset,
345345
zb->zb_object, zb->zb_level - 1,
346346
zb->zb_blkid * epb + pidx);
347-
if (traverse_prefetch_metadata(td,
347+
if (traverse_prefetch_metadata(td, dnp,
348348
&((blkptr_t *)buf->b_data)[pidx],
349349
czb) == B_TRUE) {
350350
prefetched++;
@@ -506,12 +506,12 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
506506

507507
for (j = 0; j < dnp->dn_nblkptr; j++) {
508508
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
509-
traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
509+
traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb);
510510
}
511511

512512
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
513513
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
514-
traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
514+
traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
515515
}
516516
}
517517

0 commit comments

Comments
 (0)