Commit e7681da

Optimize RAIDZ expansion
- Instead of copying one ashift-sized block per ZIO, copy as much as we
  have contiguous data, up to 16MB per old vdev.  To avoid data moves,
  use gang ABDs, so that read ZIOs can directly fill buffers for write
  ZIOs.  ABDs have much smaller overhead than ZIOs in both memory usage
  and processing time, plus big I/Os do not depend on I/O aggregation
  and scheduling to reach decent performance on HDDs.
- Reduce raidz_expand_max_copy_bytes to 16MB on 32-bit platforms.
- Use a 32-bit range tree when possible (practically always now) to
  slightly reduce memory usage.
- Use ZIO_PRIORITY_REMOVAL for the early stages of expansion, same as
  for the main ones.
- Fix rate overflows in `zpool status` reporting.

With these changes, expanding a RAIDZ1 from 4 to 5 children, I am able
to reach a 6-12GB/s rate on SSDs and ~500MB/s on HDDs, both limited by
the devices rather than the CPU.

Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
1 parent 0ffa6f3 commit e7681da
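The heart of the change is that read ZIOs scatter their data straight into the write ZIOs' buffers through gang ABDs, eliminating intermediate copies. As a rough, portable analogy only (this is not ZFS code; `preadv()` and the fixed iovec cap are stand-ins for the gang-ABD machinery shown in the diff below):

    /*
     * Analogy: one vectored read fills many destination buffers in a
     * single I/O, the way a gang ABD lets a single read ZIO fill slices
     * of several write buffers without a memcpy().  Not ZFS code.
     */
    #define _GNU_SOURCE     /* for preadv() on Linux */
    #include <sys/uio.h>
    #include <unistd.h>

    #define MAX_BUFS 16     /* arbitrary cap for this sketch */

    ssize_t
    read_into_write_bufs(int fd, off_t off, char **wbuf, int nbufs,
        size_t blksz)
    {
        struct iovec iov[MAX_BUFS];
        if (nbufs > MAX_BUFS)
            return (-1);
        for (int i = 0; i < nbufs; i++) {
            iov[i].iov_base = wbuf[i];  /* alias the write buffers */
            iov[i].iov_len = blksz;
        }
        return (preadv(fd, iov, nbufs, off));
    }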

File tree: 2 files changed, +121 −61 lines

cmd/zpool/zpool_main.c

Lines changed: 2 additions & 4 deletions
@@ -10034,9 +10034,8 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
         (void) printf(gettext("Removal of %s canceled on %s"),
             vdev_name, ctime(&end));
     } else {
-        uint64_t copied, total, elapsed, mins_left, hours_left;
+        uint64_t copied, total, elapsed, rate, mins_left, hours_left;
         double fraction_done;
-        uint_t rate;
 
         assert(prs->prs_state == DSS_SCANNING);
@@ -10132,9 +10131,8 @@ print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres)
             copied_buf, time_buf, ctime((time_t *)&end));
     } else {
         char examined_buf[7], total_buf[7], rate_buf[7];
-        uint64_t copied, total, elapsed, secs_left;
+        uint64_t copied, total, elapsed, rate, secs_left;
         double fraction_done;
-        uint_t rate;
 
         assert(pres->pres_state == DSS_SCANNING);
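For context on the `uint_t rate` fix: at the 6-12GB/s rates the commit message reports, a 32-bit bytes-per-second value wraps (UINT32_MAX is about 4.3GB). A stand-alone illustration with hypothetical numbers, not the zpool code itself:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t copied = 12ULL << 30;      /* 12 GiB copied... */
        uint64_t elapsed = 1;               /* ...in one second */
        uint32_t rate32 = copied / elapsed; /* 12 GiB is exactly 3 * 2^32,
                                               so this wraps to 0 */
        uint64_t rate64 = copied / elapsed; /* the fixed computation */
        printf("32-bit: %u B/s, 64-bit: %llu B/s\n", rate32,
            (unsigned long long)rate64);
        return (0);
    }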

module/zfs/vdev_raidz.c

Lines changed: 119 additions & 57 deletions
@@ -357,7 +357,11 @@ uint_t raidz_expand_pause_point = 0;
 /*
  * Maximum amount of copy io's outstanding at once.
  */
+#ifdef _ILP32
+static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
+#else
 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
+#endif
 
 /*
  * Apply raidz map abds aggregation if the number of rows in the map is equal
@@ -3817,16 +3821,21 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 }
 
 /*
- * Struct for one copy zio.
+ * State of one copy batch.
  */
 typedef struct raidz_reflow_arg {
-    vdev_raidz_expand_t *rra_vre;
-    zfs_locked_range_t *rra_lr;
-    uint64_t rra_txg;
+    vdev_raidz_expand_t *rra_vre;   /* Global expansion state. */
+    zfs_locked_range_t *rra_lr;     /* Range lock of this batch. */
+    uint64_t rra_txg;               /* TXG of this batch. */
+    uint_t rra_ashift;              /* Ashift of the vdev. */
+    uint32_t rra_tbd;               /* Number of in-flight ZIOs. */
+    uint32_t rra_writes;            /* Number of write ZIOs. */
+    zio_t *rra_zio[];               /* Write ZIO pointers. */
 } raidz_reflow_arg_t;
 
 /*
- * The write of the new location is done.
+ * Write of the new location on one child is done.  Once all of them are
+ * done we can unlock and free everything.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
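Note the flexible array member `rra_zio[]`: it is why every allocation and free of a batch in this diff sizes the buffer as `sizeof (*rra) + sizeof (zio_t *) * writes`. A minimal stand-alone sketch of the same pattern, using plain libc instead of `kmem_zalloc()`/`kmem_free()`:

    /* Sketch of the flexible-array allocation pattern used by the diff. */
    #include <stdlib.h>

    typedef struct batch {
        unsigned int nzio;  /* like rra_writes */
        void *zio[];        /* like rra_zio[]: sized at allocation time */
    } batch_t;

    batch_t *
    batch_alloc(unsigned int writes)
    {
        batch_t *b = calloc(1, sizeof (*b) + sizeof (void *) * writes);
        if (b != NULL)
            b->nzio = writes;
        return (b);
    }

    void
    batch_free(batch_t *b)
    {
        free(b);    /* one allocation covers the struct and the array */
    }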
@@ -3850,24 +3859,30 @@ raidz_reflow_write_done(zio_t *zio)
             zio->io_size;
     }
     cv_signal(&vre->vre_cv);
+    boolean_t done = (--rra->rra_tbd == 0);
     mutex_exit(&vre->vre_lock);
 
-    zfs_rangelock_exit(rra->rra_lr);
-
-    kmem_free(rra, sizeof (*rra));
+    if (!done)
+        return;
     spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+    zfs_rangelock_exit(rra->rra_lr);
+    kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
 }
 
 /*
- * The read of the old location is done.  The parent zio is the write to
- * the new location.  Allow it to start.
+ * Read of the old location on one child is done.  Once all of them are
+ * done, the writes have all the data and we can issue them.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
     raidz_reflow_arg_t *rra = zio->io_private;
     vdev_raidz_expand_t *vre = rra->rra_vre;
 
+    /* One-block reads use the write ABDs directly; free bigger gangs. */
+    if (zio->io_size > (1 << rra->rra_ashift))
+        abd_free(zio->io_abd);
+
     /*
      * If the read failed, or if it was done on a vdev that is not fully
      * healthy (e.g. a child that has a resilver in progress), we may not
@@ -3891,7 +3906,11 @@ raidz_reflow_read_done(zio_t *zio)
         mutex_exit(&vre->vre_lock);
     }
 
-    zio_nowait(zio_unique_parent(zio));
+    if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
+        return;
+    rra->rra_tbd = rra->rra_writes;
+    for (uint64_t i = 0; i < rra->rra_writes; i++)
+        zio_nowait(rra->rra_zio[i]);
 }
 
 static void
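`rra_tbd` is a countdown shared by both callbacks: it starts at the number of reads, the final read re-arms it to `rra_writes` and issues the queued writes, and the final write frees the batch. A hedged sketch of that last-one-out pattern with C11 atomics (the diff itself uses the kernel's `atomic_dec_32_nv()` for reads and decrements under `vre_lock` for writes):

    /* Sketch: last-one-out completion counting, as in read_done above. */
    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct {
        atomic_uint tbd;        /* like rra_tbd */
        unsigned int writes;    /* like rra_writes */
    } batch_state_t;

    /* Returns true only for the final read, which re-arms the count. */
    bool
    read_complete(batch_state_t *b)
    {
        if (atomic_fetch_sub(&b->tbd, 1) > 1)
            return (false);     /* other reads still in flight */
        atomic_store(&b->tbd, b->writes);   /* re-arm for the writes */
        return (true);          /* caller now issues all the writes */
    }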
@@ -3932,21 +3951,19 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     dmu_tx_t *tx)
 {
     spa_t *spa = vd->vdev_spa;
-    int ashift = vd->vdev_top->vdev_ashift;
-    uint64_t offset, size;
+    uint_t ashift = vd->vdev_top->vdev_ashift;
 
-    if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
-        &offset, &size)) {
+    range_seg_t *rs = range_tree_first(rt);
+    if (rs == NULL)
         return (B_FALSE);
-    }
+    uint64_t offset = rs_get_start(rs, rt);
     ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
+    uint64_t size = rs_get_end(rs, rt) - offset;
     ASSERT3U(size, >=, 1 << ashift);
-    uint64_t length = 1 << ashift;
-    int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+    ASSERT(IS_P2ALIGNED(size, 1 << ashift));
 
     uint64_t blkid = offset >> ashift;
-
-    int old_children = vd->vdev_children - 1;
+    uint_t old_children = vd->vdev_children - 1;
 
     /*
      * We can only progress to the point that writes will not overlap
@@ -3965,26 +3982,34 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     uint64_t next_overwrite_blkid = ubsync_blkid +
         ubsync_blkid / old_children - old_children;
     VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
-
     if (blkid >= next_overwrite_blkid) {
         raidz_reflow_record_progress(vre,
             next_overwrite_blkid << ashift, tx);
         return (B_TRUE);
     }
 
-    range_tree_remove(rt, offset, length);
+    size = MIN(size, raidz_expand_max_copy_bytes);
+    size = MIN(size, (uint64_t)old_children *
+        MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
+    size = MAX(size, 1 << ashift);
+    uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
+    size = (uint64_t)blocks << ashift;
+
+    range_tree_remove(rt, offset, size);
 
-    raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
+    uint_t reads = MIN(blocks, old_children);
+    uint_t writes = MIN(blocks, vd->vdev_children);
+    raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
+        sizeof (zio_t *) * writes, KM_SLEEP);
     rra->rra_vre = vre;
     rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
-        offset, length, RL_WRITER);
+        offset, size, RL_WRITER);
     rra->rra_txg = dmu_tx_get_txg(tx);
+    rra->rra_ashift = ashift;
+    rra->rra_tbd = reads;
+    rra->rra_writes = writes;
 
-    raidz_reflow_record_progress(vre, offset + length, tx);
-
-    mutex_enter(&vre->vre_lock);
-    vre->vre_outstanding_bytes += length;
-    mutex_exit(&vre->vre_lock);
+    raidz_reflow_record_progress(vre, offset + size, tx);
 
     /*
      * SCL_STATE will be released when the read and write are done,
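Worked example of the sizing above, with hypothetical numbers: a 4→5 child expansion (`old_children = 4`), `ashift = 12`, a 100MB contiguous segment, a 16MB per-old-child record-size cap, and the 64-bit default `raidz_expand_max_copy_bytes` of 160MB (the `next_overwrite_blkid` limit is assumed not to bind here):

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t size = 100ULL << 20;       /* contiguous segment */
        unsigned int ashift = 12, old_children = 4, vdev_children = 5;
        uint64_t max_copy = 160ULL << 20;   /* 10 * SPA_MAXBLOCKSIZE */
        uint64_t max_rs = 16ULL << 20;      /* record-size cap, assumed */

        size = MIN(size, max_copy);                             /* 100MB */
        size = MIN(size, (uint64_t)old_children * max_rs);      /* 64MB */
        uint64_t blocks = size >> ashift;                       /* 16384 */
        unsigned int reads = MIN(blocks, old_children);         /* 4 */
        unsigned int writes = MIN(blocks, vdev_children);       /* 5 */
        printf("blocks=%llu reads=%u writes=%u\n",
            (unsigned long long)blocks, reads, writes);
        return (0);
    }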
@@ -4006,29 +4031,61 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
         mutex_exit(&vre->vre_lock);
 
         /* drop everything we acquired */
-        zfs_rangelock_exit(rra->rra_lr);
-        kmem_free(rra, sizeof (*rra));
         spa_config_exit(spa, SCL_STATE, spa);
+        zfs_rangelock_exit(rra->rra_lr);
+        kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
         return (B_TRUE);
     }
 
+    mutex_enter(&vre->vre_lock);
+    vre->vre_outstanding_bytes += size;
+    mutex_exit(&vre->vre_lock);
+
+    /* Allocate an ABD and ZIO for each child we write. */
+    int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
     zio_t *pio = spa->spa_txg_zio[txgoff];
-    abd_t *abd = abd_alloc_for_io(length, B_FALSE);
-    zio_t *write_zio = zio_vdev_child_io(pio, NULL,
-        vd->vdev_child[blkid % vd->vdev_children],
-        (blkid / vd->vdev_children) << ashift,
-        abd, length,
-        ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
-        ZIO_FLAG_CANFAIL,
-        raidz_reflow_write_done, rra);
-
-    zio_nowait(zio_vdev_child_io(write_zio, NULL,
-        vd->vdev_child[blkid % old_children],
-        (blkid / old_children) << ashift,
-        abd, length,
-        ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
-        ZIO_FLAG_CANFAIL,
-        raidz_reflow_read_done, rra));
+    uint_t b = blocks / vd->vdev_children;
+    uint_t bb = blocks % vd->vdev_children;
+    for (uint_t i = 0; i < writes; i++) {
+        uint_t n = b + (i < bb);
+        abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
+        rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
+            vd->vdev_child[(blkid + i) % vd->vdev_children],
+            ((blkid + i) / vd->vdev_children) << ashift,
+            abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+            ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
+    }
+
+    /*
+     * Allocate and issue a ZIO for each child we read.  Reads of only
+     * one block can use the respective write ABDs directly, since those
+     * also hold just one block.  Bigger reads create gang ABDs and fill
+     * them with the respective blocks of the write ABDs.
+     */
+    b = blocks / old_children;
+    bb = blocks % old_children;
+    for (uint_t i = 0; i < reads; i++) {
+        uint_t n = b + (i < bb);
+        abd_t *abd;
+        if (n > 1) {
+            abd = abd_alloc_gang();
+            for (uint_t j = 0; j < n; j++) {
+                uint_t b = j * old_children + i;
+                abd_t *cabd = abd_get_offset_size(
+                    rra->rra_zio[b % vd->vdev_children]->io_abd,
+                    (b / vd->vdev_children) << ashift,
+                    1 << ashift);
+                abd_gang_add(abd, cabd, B_TRUE);
+            }
+        } else {
+            abd = rra->rra_zio[i]->io_abd;
+        }
+        zio_nowait(zio_vdev_child_io(pio, NULL,
+            vd->vdev_child[(blkid + i) % old_children],
+            ((blkid + i) / old_children) << ashift, abd,
+            n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+            ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
+    }
 
     return (B_FALSE);
 }
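Both loops above use the same round-robin layout: batch-logical block `b` lives on child `b % children` at row `b / children`. A quick stand-alone check of that mapping for the first few blocks of a hypothetical 4→5 expansion:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned int old_children = 4, vdev_children = 5;
        for (unsigned int b = 0; b < 10; b++) {
            printf("block %u: old child %u row %u -> new child %u row %u\n",
                b, b % old_children, b / old_children,
                b % vdev_children, b / vdev_children);
        }
        return (0);
    }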
@@ -4122,7 +4179,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
         zio_nowait(zio_vdev_child_io(pio, NULL,
             raidvd->vdev_child[i],
             VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-            write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
+            write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
             ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
     }
     error = zio_wait(pio);
@@ -4142,7 +4199,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
         ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
         zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
             0, abds[i], read_size, ZIO_TYPE_READ,
-            ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+            ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
             raidz_scratch_child_done, pio));
     }
     error = zio_wait(pio);
@@ -4197,7 +4254,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
          */
         zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
             VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-            write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+            write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
             ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
     }
     error = zio_wait(pio);
@@ -4246,7 +4303,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
     for (int i = 0; i < raidvd->vdev_children; i++) {
         zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
             0, abds[i], write_size, ZIO_TYPE_WRITE,
-            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+            ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
             raidz_scratch_child_done, pio));
     }
     error = zio_wait(pio);
@@ -4355,8 +4412,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
          */
         zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
             VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-            write_size, ZIO_TYPE_READ,
-            ZIO_PRIORITY_ASYNC_READ, 0,
+            write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
             raidz_scratch_child_done, pio));
     }
     zio_wait(pio);
@@ -4368,7 +4424,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
     for (int i = 0; i < raidvd->vdev_children; i++) {
         zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
             0, abds[i], write_size, ZIO_TYPE_WRITE,
-            ZIO_PRIORITY_ASYNC_WRITE, 0,
+            ZIO_PRIORITY_REMOVAL, 0,
             raidz_scratch_child_done, pio));
     }
     zio_wait(pio);
@@ -4490,8 +4546,11 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
          * space. Note that there may be a little bit more free
          * space (e.g. in ms_defer), and it's fine to copy that too.
          */
-        range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
-            NULL, 0, 0);
+        uint64_t shift, start;
+        range_seg_type_t type = metaslab_calculate_range_tree_type(
+            raidvd, msp, &start, &shift);
+        range_tree_t *rt = range_tree_create(NULL, type, NULL,
+            start, shift);
         range_tree_add(rt, msp->ms_start, msp->ms_size);
         range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
         mutex_exit(&msp->ms_lock);
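`metaslab_calculate_range_tree_type()` lets the tree store segment bounds as 32-bit values relative to the metaslab start, scaled down by a shift, which halves per-segment memory versus `RANGE_SEG64`. A sketch of that encoding (struct layouts abridged from the idea in `range_tree.h`, not copied from it):

    #include <stdint.h>

    typedef struct { uint32_t rs_start, rs_end; } seg32_t;  /* 8 bytes */
    typedef struct { uint64_t rs_start, rs_end; } seg64_t;  /* 16 bytes */

    /* Encode an absolute [off, off + sz) range into a relative seg32. */
    static seg32_t
    seg32_encode(uint64_t start, uint64_t shift, uint64_t off, uint64_t sz)
    {
        seg32_t s;
        s.rs_start = (uint32_t)((off - start) >> shift);
        s.rs_end = (uint32_t)((off + sz - start) >> shift);
        return (s);
    }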
@@ -4516,7 +4575,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
          * when importing a pool with a expansion in progress),
          * discard any state that we have already processed.
          */
-        range_tree_clear(rt, 0, vre->vre_offset);
+        if (vre->vre_offset > msp->ms_start) {
+            range_tree_clear(rt, msp->ms_start,
+                vre->vre_offset - msp->ms_start);
+        }
 
         while (!zthr_iscancelled(zthr) &&
             !range_tree_is_empty(rt) &&
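The clamping follows from the encoding above: a tree created with `start = ms_start` cannot represent offsets below `ms_start`, so clearing from absolute offset 0, as the old code did, would underflow the relative conversion. A one-line sketch of the invariant (illustrative, not ZFS code):

    #include <stdint.h>
    #include <assert.h>

    /* Relative encoding used by narrow range trees (see sketch above). */
    static uint32_t
    rel32(uint64_t start, uint64_t shift, uint64_t off)
    {
        assert(off >= start);   /* off = 0 with start > 0 would underflow */
        return ((uint32_t)((off - start) >> shift));
    }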
