Skip to content

Commit 546bd92

Browse files
ahrens and don-brady
authored and committed
raidz expansion feature
This feature allows disks to be added one at a time to a RAID-Z group, expanding its capacity incrementally. This feature is especially useful for small pools (typically with only one RAID-Z group), where there isn't sufficient hardware to add capacity by adding a whole new RAID-Z group (typically doubling the number of disks). == Initiating expansion == A new device (disk) can be attached to an existing RAIDZ vdev, by running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank raidz2-0 sda`. The new device will become part of the RAIDZ group. A "raidz expansion" will be initiated, and the new device will contribute additional space to the RAIDZ group once the expansion completes. The `feature@raidz_expansion` on-disk feature flag must be `enabled` to initiate an expansion, and it remains `active` for the life of the pool. In other words, pools with expanded RAIDZ vdevs can not be imported by older releases of the ZFS software. == During expansion == The expansion entails reading all allocated space from existing disks in the RAIDZ group, and rewriting it to the new disks in the RAIDZ group (including the newly added device). The expansion progress can be monitored with `zpool status`. Data redundancy is maintained during (and after) the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). The pool remains accessible during expansion. Following a reboot or export/import, the expansion resumes where it left off. == After expansion == When the expansion completes, the additional space is available for use, and is reflected in the `available` zfs property (as seen in `zfs list`, `df`, etc). Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). A RAIDZ vdev can be expanded multiple times. 
After the expansion completes, old blocks remain with their old data-to-parity ratio (e.g. 5-wide RAIDZ2, has 3 data to 2 parity), but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been expanded once to 6-wide, has 4 data to 2 parity). However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly less space than is expected may be reported for newly-written blocks, according to `zfs list`, `df`, `ls -s`, and similar tools. Sponsored-by: The FreeBSD Foundation Sponsored by: iXsystems, Inc. Authored-by: Matthew Ahrens <[email protected]> Contributions-by: Fedor Uporov <[email protected]> Contributions-by: Stuart Maybee <[email protected]> Contributions-by: Thorsten Behrens <[email protected]> Contributions-by: Fmstrat <[email protected]> Contributions-by: Don Brady <[email protected]> Signed-off-by: Don Brady <[email protected]>
1 parent a9d6b06 commit 546bd92

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+4582
-519
lines changed

cmd/raidz_test/raidz_bench.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl)
8484

8585
if (rto_opts.rto_expand) {
8686
rm_bench = vdev_raidz_map_alloc_expanded(
87-
zio_bench.io_abd,
88-
zio_bench.io_size, zio_bench.io_offset,
87+
&zio_bench,
8988
rto_opts.rto_ashift, ncols+1, ncols,
90-
fn+1, rto_opts.rto_expand_offset);
89+
fn+1, rto_opts.rto_expand_offset,
90+
0, B_FALSE);
9191
} else {
9292
rm_bench = vdev_raidz_map_alloc(&zio_bench,
9393
BENCH_ASHIFT, ncols, fn+1);
@@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl)
172172

173173
if (rto_opts.rto_expand) {
174174
rm_bench = vdev_raidz_map_alloc_expanded(
175-
zio_bench.io_abd,
176-
zio_bench.io_size, zio_bench.io_offset,
175+
&zio_bench,
177176
BENCH_ASHIFT, ncols+1, ncols,
178-
PARITY_PQR, rto_opts.rto_expand_offset);
177+
PARITY_PQR,
178+
rto_opts.rto_expand_offset, 0, B_FALSE);
179179
} else {
180180
rm_bench = vdev_raidz_map_alloc(&zio_bench,
181181
BENCH_ASHIFT, ncols, PARITY_PQR);

cmd/raidz_test/raidz_test.c

Lines changed: 6 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
327327

328328
if (opts->rto_expand) {
329329
opts->rm_golden =
330-
vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
331-
opts->zio_golden->io_size, opts->zio_golden->io_offset,
330+
vdev_raidz_map_alloc_expanded(opts->zio_golden,
332331
opts->rto_ashift, total_ncols+1, total_ncols,
333-
parity, opts->rto_expand_offset);
334-
rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
335-
zio_test->io_size, zio_test->io_offset,
332+
parity, opts->rto_expand_offset, 0, B_FALSE);
333+
rm_test = vdev_raidz_map_alloc_expanded(zio_test,
336334
opts->rto_ashift, total_ncols+1, total_ncols,
337-
parity, opts->rto_expand_offset);
335+
parity, opts->rto_expand_offset, 0, B_FALSE);
338336
} else {
339337
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
340338
opts->rto_ashift, total_ncols, parity);
@@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
361359
return (err);
362360
}
363361

364-
/*
365-
* If reflow is not in progress, reflow_offset should be UINT64_MAX.
366-
* For each row, if the row is entirely before reflow_offset, it will
367-
* come from the new location. Otherwise this row will come from the
368-
* old location. Therefore, rows that straddle the reflow_offset will
369-
* come from the old location.
370-
*
371-
* NOTE: Until raidz expansion is implemented this function is only
372-
* needed by raidz_test.c to test the multi-row raidz_map_t functionality.
373-
*/
374-
raidz_map_t *
375-
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
376-
uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
377-
uint64_t nparity, uint64_t reflow_offset)
378-
{
379-
/* The zio's size in units of the vdev's minimum sector size. */
380-
uint64_t s = size >> ashift;
381-
uint64_t q, r, bc, devidx, asize = 0, tot;
382-
383-
/*
384-
* "Quotient": The number of data sectors for this stripe on all but
385-
* the "big column" child vdevs that also contain "remainder" data.
386-
* AKA "full rows"
387-
*/
388-
q = s / (logical_cols - nparity);
389-
390-
/*
391-
* "Remainder": The number of partial stripe data sectors in this I/O.
392-
* This will add a sector to some, but not all, child vdevs.
393-
*/
394-
r = s - q * (logical_cols - nparity);
395-
396-
/* The number of "big columns" - those which contain remainder data. */
397-
bc = (r == 0 ? 0 : r + nparity);
398-
399-
/*
400-
* The total number of data and parity sectors associated with
401-
* this I/O.
402-
*/
403-
tot = s + nparity * (q + (r == 0 ? 0 : 1));
404-
405-
/* How many rows contain data (not skip) */
406-
uint64_t rows = howmany(tot, logical_cols);
407-
int cols = MIN(tot, logical_cols);
408-
409-
raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
410-
KM_SLEEP);
411-
rm->rm_nrows = rows;
412-
413-
for (uint64_t row = 0; row < rows; row++) {
414-
raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
415-
rr_col[cols]), KM_SLEEP);
416-
rm->rm_row[row] = rr;
417-
418-
/* The starting RAIDZ (parent) vdev sector of the row. */
419-
uint64_t b = (offset >> ashift) + row * logical_cols;
420-
421-
/*
422-
* If we are in the middle of a reflow, and any part of this
423-
* row has not been copied, then use the old location of
424-
* this row.
425-
*/
426-
int row_phys_cols = physical_cols;
427-
if (b + (logical_cols - nparity) > reflow_offset >> ashift)
428-
row_phys_cols--;
429-
430-
/* starting child of this row */
431-
uint64_t child_id = b % row_phys_cols;
432-
/* The starting byte offset on each child vdev. */
433-
uint64_t child_offset = (b / row_phys_cols) << ashift;
434-
435-
/*
436-
* We set cols to the entire width of the block, even
437-
* if this row is shorter. This is needed because parity
438-
* generation (for Q and R) needs to know the entire width,
439-
* because it treats the short row as though it was
440-
* full-width (and the "phantom" sectors were zero-filled).
441-
*
442-
* Another approach to this would be to set cols shorter
443-
* (to just the number of columns that we might do i/o to)
444-
* and have another mechanism to tell the parity generation
445-
* about the "entire width". Reconstruction (at least
446-
* vdev_raidz_reconstruct_general()) would also need to
447-
* know about the "entire width".
448-
*/
449-
rr->rr_cols = cols;
450-
rr->rr_bigcols = bc;
451-
rr->rr_missingdata = 0;
452-
rr->rr_missingparity = 0;
453-
rr->rr_firstdatacol = nparity;
454-
rr->rr_abd_empty = NULL;
455-
rr->rr_nempty = 0;
456-
457-
for (int c = 0; c < rr->rr_cols; c++, child_id++) {
458-
if (child_id >= row_phys_cols) {
459-
child_id -= row_phys_cols;
460-
child_offset += 1ULL << ashift;
461-
}
462-
rr->rr_col[c].rc_devidx = child_id;
463-
rr->rr_col[c].rc_offset = child_offset;
464-
rr->rr_col[c].rc_orig_data = NULL;
465-
rr->rr_col[c].rc_error = 0;
466-
rr->rr_col[c].rc_tried = 0;
467-
rr->rr_col[c].rc_skipped = 0;
468-
rr->rr_col[c].rc_need_orig_restore = B_FALSE;
469-
470-
uint64_t dc = c - rr->rr_firstdatacol;
471-
if (c < rr->rr_firstdatacol) {
472-
rr->rr_col[c].rc_size = 1ULL << ashift;
473-
rr->rr_col[c].rc_abd =
474-
abd_alloc_linear(rr->rr_col[c].rc_size,
475-
B_TRUE);
476-
} else if (row == rows - 1 && bc != 0 && c >= bc) {
477-
/*
478-
* Past the end, this for parity generation.
479-
*/
480-
rr->rr_col[c].rc_size = 0;
481-
rr->rr_col[c].rc_abd = NULL;
482-
} else {
483-
/*
484-
* "data column" (col excluding parity)
485-
* Add an ASCII art diagram here
486-
*/
487-
uint64_t off;
488-
489-
if (c < bc || r == 0) {
490-
off = dc * rows + row;
491-
} else {
492-
off = r * rows +
493-
(dc - r) * (rows - 1) + row;
494-
}
495-
rr->rr_col[c].rc_size = 1ULL << ashift;
496-
rr->rr_col[c].rc_abd = abd_get_offset_struct(
497-
&rr->rr_col[c].rc_abdstruct,
498-
abd, off << ashift, 1 << ashift);
499-
}
500-
501-
asize += rr->rr_col[c].rc_size;
502-
}
503-
/*
504-
* If all data stored spans all columns, there's a danger that
505-
* parity will always be on the same device and, since parity
506-
* isn't read during normal operation, that that device's I/O
507-
* bandwidth won't be used effectively. We therefore switch
508-
* the parity every 1MB.
509-
*
510-
* ...at least that was, ostensibly, the theory. As a practical
511-
* matter unless we juggle the parity between all devices
512-
* evenly, we won't see any benefit. Further, occasional writes
513-
* that aren't a multiple of the LCM of the number of children
514-
* and the minimum stripe width are sufficient to avoid pessimal
515-
* behavior. Unfortunately, this decision created an implicit
516-
* on-disk format requirement that we need to support for all
517-
* eternity, but only for single-parity RAID-Z.
518-
*
519-
* If we intend to skip a sector in the zeroth column for
520-
* padding we must make sure to note this swap. We will never
521-
* intend to skip the first column since at least one data and
522-
* one parity column must appear in each row.
523-
*/
524-
if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
525-
(offset & (1ULL << 20))) {
526-
ASSERT(rr->rr_cols >= 2);
527-
ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
528-
devidx = rr->rr_col[0].rc_devidx;
529-
uint64_t o = rr->rr_col[0].rc_offset;
530-
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
531-
rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
532-
rr->rr_col[1].rc_devidx = devidx;
533-
rr->rr_col[1].rc_offset = o;
534-
}
535-
536-
}
537-
ASSERT3U(asize, ==, tot << ashift);
538-
539-
/* init RAIDZ parity ops */
540-
rm->rm_ops = vdev_raidz_math_get_ops();
541-
542-
return (rm);
543-
}
544-
545362
static raidz_map_t *
546363
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
547364
{
@@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
561378
init_zio_abd(*zio);
562379

563380
if (opts->rto_expand) {
564-
rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
565-
(*zio)->io_size, (*zio)->io_offset,
381+
rm = vdev_raidz_map_alloc_expanded(*zio,
566382
opts->rto_ashift, total_ncols+1, total_ncols,
567-
parity, opts->rto_expand_offset);
383+
parity, opts->rto_expand_offset, 0, B_FALSE);
568384
} else {
569385
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
570386
total_ncols, parity);

cmd/raidz_test/raidz_test.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio);
119119

120120
void run_raidz_benchmark(void);
121121

122-
struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
123-
uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
124-
125122
#endif /* RAIDZ_TEST_H */

cmd/zdb/zdb.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4133,6 +4133,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
41334133
}
41344134
(void) printf("\tcheckpoint_txg = %llu\n",
41354135
(u_longlong_t)ub->ub_checkpoint_txg);
4136+
4137+
(void) printf("\traidz_reflow state=%u off=%llu\n",
4138+
(int)RRSS_GET_STATE(ub),
4139+
(u_longlong_t)RRSS_GET_OFFSET(ub));
4140+
41364141
(void) printf("%s", footer ? footer : "");
41374142
}
41384143

0 commit comments

Comments
 (0)