Skip to content

Commit 3c1266e

Browse files
committed
RAIDZ: Use cache blocking during parity math
RAIDZ parity is calculated by adding data one column at a time. This works fine for small blocks, but for large blocks the results of the previous addition may already have been evicted from the CPU caches to main memory, so in addition to the extra memory write, an extra read is required to get them back. This patch splits large parity operations into 64KB chunks, which should in most cases fit into the L2 caches of CPUs from the last decade. I haven't touched the more complicated data reconstruction cases, to avoid overcomplicating the code. Those should be relatively rare. My tests on a Xeon Gold 6242R CPU with 1MB of L2 cache per core show up to 10/20% memory traffic reduction when writing to 4-wide RAIDZ/RAIDZ2 blocks of ~4MB and up. Older CPUs with 256KB of L2 cache should see the effect even on smaller blocks. Wider RAIDZ vdevs should be less affected. Signed-off-by: Alexander Motin <[email protected]> Sponsored by: iXsystems, Inc.
1 parent e007908 commit 3c1266e

File tree

3 files changed

+97
-71
lines changed

3 files changed

+97
-71
lines changed

include/sys/abd.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
133133
void abd_zero_off(abd_t *, size_t, size_t);
134134
void abd_verify(abd_t *);
135135

136-
void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
136+
void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t off,
137137
ssize_t csize, ssize_t dsize, const unsigned parity,
138138
void (*func_raidz_gen)(void **, const void *, size_t, size_t));
139139
void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,

module/zfs/abd.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ abd_cmp(abd_t *dabd, abd_t *sabd)
10171017
* is the same when taking linear and when taking scatter
10181018
*/
10191019
void
1020-
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
1020+
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t off,
10211021
ssize_t csize, ssize_t dsize, const unsigned parity,
10221022
void (*func_raidz_gen)(void **, const void *, size_t, size_t))
10231023
{
@@ -1033,16 +1033,16 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
10331033
ASSERT3U(parity, <=, 3);
10341034
for (i = 0; i < parity; i++) {
10351035
abd_verify(cabds[i]);
1036-
ASSERT3U(csize, <=, cabds[i]->abd_size);
1037-
c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
1036+
ASSERT3U(off + csize, <=, cabds[i]->abd_size);
1037+
c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off);
10381038
}
10391039

10401040
ASSERT3S(dsize, >=, 0);
10411041
if (dsize > 0) {
10421042
ASSERT(dabd);
10431043
abd_verify(dabd);
1044-
ASSERT3U(dsize, <=, dabd->abd_size);
1045-
c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
1044+
ASSERT3U(off + dsize, <=, dabd->abd_size);
1045+
c_dabd = abd_init_abd_iter(dabd, &daiter, off);
10461046
}
10471047

10481048
abd_enter_critical(flags);

module/zfs/vdev_raidz_math_impl.h

Lines changed: 91 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,10 @@ raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
214214
}
215215

216216

217-
#define raidz_copy(dabd, sabd, size) \
217+
#define raidz_copy(dabd, sabd, off, size) \
218218
{ \
219-
abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
219+
abd_iterate_func2(dabd, sabd, off, off, size, raidz_copy_abd_cb, \
220+
NULL); \
220221
}
221222

222223
/*
@@ -254,9 +255,10 @@ raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
254255
return (0);
255256
}
256257

257-
#define raidz_add(dabd, sabd, size) \
258+
#define raidz_add(dabd, sabd, off, size) \
258259
{ \
259-
abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
260+
abd_iterate_func2(dabd, sabd, off, off, size, raidz_add_abd_cb, \
261+
NULL); \
260262
}
261263

262264
/*
@@ -343,7 +345,10 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
343345
* the parity/syndrome if data column is shorter.
344346
*
345347
* P parity is calculated using raidz_add_abd().
348+
*
349+
* For CPU L2 cache blocking we process 64KB at a time.
346350
*/
351+
#define BLOCK 65536
347352

348353
/*
349354
* Generate P parity (RAIDZ1)
@@ -357,20 +362,26 @@ raidz_generate_p_impl(raidz_row_t * const rr)
357362
const size_t ncols = rr->rr_cols;
358363
const size_t psize = rr->rr_col[CODE_P].rc_size;
359364
abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
360-
size_t size;
361-
abd_t *dabd;
365+
size_t off, size;
362366

363367
raidz_math_begin();
364368

365-
/* start with first data column */
366-
raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
369+
for (off = 0; off < psize; off += BLOCK) {
367370

368-
for (c = 2; c < ncols; c++) {
369-
dabd = rr->rr_col[c].rc_abd;
370-
size = rr->rr_col[c].rc_size;
371+
/* start with first data column */
372+
size = MIN(BLOCK, psize - off);
373+
raidz_copy(pabd, rr->rr_col[1].rc_abd, off, size);
371374

372-
/* add data column */
373-
raidz_add(pabd, dabd, size);
375+
for (c = 2; c < ncols; c++) {
376+
size = rr->rr_col[c].rc_size;
377+
if (size <= off)
378+
continue;
379+
380+
/* add data column */
381+
size = MIN(BLOCK, size - off);
382+
abd_t *dabd = rr->rr_col[c].rc_abd;
383+
raidz_add(pabd, dabd, off, size);
384+
}
374385
}
375386

376387
raidz_math_end();
@@ -423,7 +434,7 @@ raidz_generate_pq_impl(raidz_row_t * const rr)
423434
size_t c;
424435
const size_t ncols = rr->rr_cols;
425436
const size_t csize = rr->rr_col[CODE_P].rc_size;
426-
size_t dsize;
437+
size_t off, size, dsize;
427438
abd_t *dabd;
428439
abd_t *cabds[] = {
429440
rr->rr_col[CODE_P].rc_abd,
@@ -432,15 +443,20 @@ raidz_generate_pq_impl(raidz_row_t * const rr)
432443

433444
raidz_math_begin();
434445

435-
raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
436-
raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
446+
for (off = 0; off < csize; off += BLOCK) {
447+
448+
size = MIN(BLOCK, csize - off);
449+
raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, off, size);
450+
raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, off, size);
437451

438-
for (c = 3; c < ncols; c++) {
439-
dabd = rr->rr_col[c].rc_abd;
440-
dsize = rr->rr_col[c].rc_size;
452+
for (c = 3; c < ncols; c++) {
453+
dabd = rr->rr_col[c].rc_abd;
454+
dsize = rr->rr_col[c].rc_size;
455+
dsize = (dsize > off) ? MIN(BLOCK, dsize - off) : 0;
441456

442-
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
443-
raidz_gen_pq_add);
457+
abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 2,
458+
raidz_gen_pq_add);
459+
}
444460
}
445461

446462
raidz_math_end();
@@ -496,7 +512,7 @@ raidz_generate_pqr_impl(raidz_row_t * const rr)
496512
size_t c;
497513
const size_t ncols = rr->rr_cols;
498514
const size_t csize = rr->rr_col[CODE_P].rc_size;
499-
size_t dsize;
515+
size_t off, size, dsize;
500516
abd_t *dabd;
501517
abd_t *cabds[] = {
502518
rr->rr_col[CODE_P].rc_abd,
@@ -506,16 +522,21 @@ raidz_generate_pqr_impl(raidz_row_t * const rr)
506522

507523
raidz_math_begin();
508524

509-
raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
510-
raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
511-
raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
525+
for (off = 0; off < csize; off += BLOCK) {
512526

513-
for (c = 4; c < ncols; c++) {
514-
dabd = rr->rr_col[c].rc_abd;
515-
dsize = rr->rr_col[c].rc_size;
527+
size = MIN(BLOCK, csize - off);
528+
raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, off, size);
529+
raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, off, size);
530+
raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, off, size);
531+
532+
for (c = 4; c < ncols; c++) {
533+
dabd = rr->rr_col[c].rc_abd;
534+
dsize = rr->rr_col[c].rc_size;
535+
dsize = (dsize > off) ? MIN(BLOCK, dsize - off) : 0;
516536

517-
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
518-
raidz_gen_pqr_add);
537+
abd_raidz_gen_iterate(cabds, dabd, off, size, dsize, 3,
538+
raidz_gen_pqr_add);
539+
}
519540
}
520541

521542
raidz_math_end();
@@ -592,26 +613,31 @@ raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
592613
const size_t x = tgtidx[TARGET_X];
593614
const size_t xsize = rr->rr_col[x].rc_size;
594615
abd_t *xabd = rr->rr_col[x].rc_abd;
595-
size_t size;
596-
abd_t *dabd;
616+
size_t off, size;
597617

598618
if (xabd == NULL)
599619
return (1 << CODE_P);
600620

601621
raidz_math_begin();
602622

603-
/* copy P into target */
604-
raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
623+
for (off = 0; off < xsize; off += BLOCK) {
605624

606-
/* generate p_syndrome */
607-
for (c = firstdc; c < ncols; c++) {
608-
if (c == x)
609-
continue;
625+
/* copy P into target */
626+
size = MIN(BLOCK, xsize - off);
627+
raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, off, size);
610628

611-
dabd = rr->rr_col[c].rc_abd;
612-
size = MIN(rr->rr_col[c].rc_size, xsize);
629+
/* generate p_syndrome */
630+
for (c = firstdc; c < ncols; c++) {
631+
if (c == x)
632+
continue;
633+
size = rr->rr_col[c].rc_size;
634+
if (size <= off)
635+
continue;
613636

614-
raidz_add(xabd, dabd, size);
637+
size = MIN(BLOCK, MIN(size, xsize) - off);
638+
abd_t *dabd = rr->rr_col[c].rc_abd;
639+
raidz_add(xabd, dabd, off, size);
640+
}
615641
}
616642

617643
raidz_math_end();
@@ -683,7 +709,7 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
683709

684710
/* Start with first data column if present */
685711
if (firstdc != x) {
686-
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
712+
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
687713
} else {
688714
raidz_zero(xabd, xsize);
689715
}
@@ -698,12 +724,12 @@ raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
698724
dsize = rr->rr_col[c].rc_size;
699725
}
700726

701-
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
727+
abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1,
702728
raidz_syn_q_abd);
703729
}
704730

705731
/* add Q to the syndrome */
706-
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
732+
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, 0, xsize);
707733

708734
/* transform the syndrome */
709735
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
@@ -777,7 +803,7 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
777803

778804
/* Start with first data column if present */
779805
if (firstdc != x) {
780-
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
806+
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
781807
} else {
782808
raidz_zero(xabd, xsize);
783809
}
@@ -793,12 +819,12 @@ raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
793819
dsize = rr->rr_col[c].rc_size;
794820
}
795821

796-
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
822+
abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 1,
797823
raidz_syn_r_abd);
798824
}
799825

800826
/* add R to the syndrome */
801-
raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
827+
raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, 0, xsize);
802828

803829
/* transform the syndrome */
804830
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
@@ -934,8 +960,8 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
934960

935961
/* Start with first data column if present */
936962
if (firstdc != x) {
937-
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
938-
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
963+
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
964+
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
939965
} else {
940966
raidz_zero(xabd, xsize);
941967
raidz_zero(yabd, xsize);
@@ -951,15 +977,15 @@ raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
951977
dsize = rr->rr_col[c].rc_size;
952978
}
953979

954-
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
980+
abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2,
955981
raidz_syn_pq_abd);
956982
}
957983

958984
abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff);
959985

960986
/* Copy shorter targets back to the original abd buffer */
961987
if (ysize < xsize)
962-
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
988+
raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
963989

964990
raidz_math_end();
965991

@@ -1094,8 +1120,8 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
10941120

10951121
/* Start with first data column if present */
10961122
if (firstdc != x) {
1097-
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
1098-
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
1123+
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
1124+
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
10991125
} else {
11001126
raidz_zero(xabd, xsize);
11011127
raidz_zero(yabd, xsize);
@@ -1111,7 +1137,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
11111137
dsize = rr->rr_col[c].rc_size;
11121138
}
11131139

1114-
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
1140+
abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2,
11151141
raidz_syn_pr_abd);
11161142
}
11171143

@@ -1121,7 +1147,7 @@ raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
11211147
* Copy shorter targets back to the original abd buffer
11221148
*/
11231149
if (ysize < xsize)
1124-
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
1150+
raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
11251151

11261152
raidz_math_end();
11271153

@@ -1261,8 +1287,8 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
12611287

12621288
/* Start with first data column if present */
12631289
if (firstdc != x) {
1264-
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
1265-
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
1290+
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
1291+
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
12661292
} else {
12671293
raidz_zero(xabd, xsize);
12681294
raidz_zero(yabd, xsize);
@@ -1278,7 +1304,7 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
12781304
dsize = rr->rr_col[c].rc_size;
12791305
}
12801306

1281-
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
1307+
abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 2,
12821308
raidz_syn_qr_abd);
12831309
}
12841310

@@ -1288,7 +1314,7 @@ raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
12881314
* Copy shorter targets back to the original abd buffer
12891315
*/
12901316
if (ysize < xsize)
1291-
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
1317+
raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
12921318

12931319
raidz_math_end();
12941320

@@ -1456,9 +1482,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
14561482

14571483
/* Start with first data column if present */
14581484
if (firstdc != x) {
1459-
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
1460-
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
1461-
raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
1485+
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
1486+
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
1487+
raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, 0, xsize);
14621488
} else {
14631489
raidz_zero(xabd, xsize);
14641490
raidz_zero(yabd, xsize);
@@ -1475,7 +1501,7 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
14751501
dsize = rr->rr_col[c].rc_size;
14761502
}
14771503

1478-
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
1504+
abd_raidz_gen_iterate(tabds, dabd, 0, xsize, dsize, 3,
14791505
raidz_syn_pqr_abd);
14801506
}
14811507

@@ -1485,9 +1511,9 @@ raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
14851511
* Copy shorter targets back to the original abd buffer
14861512
*/
14871513
if (ysize < xsize)
1488-
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
1514+
raidz_copy(rr->rr_col[y].rc_abd, yabd, 0, ysize);
14891515
if (zsize < xsize)
1490-
raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
1516+
raidz_copy(rr->rr_col[z].rc_abd, zabd, 0, zsize);
14911517

14921518
raidz_math_end();
14931519

0 commit comments

Comments
 (0)