@@ -3817,16 +3817,21 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 }
 
 /*
- * Struct for one copy zio.
+ * State of one copy batch.
  */
 typedef struct raidz_reflow_arg {
-	vdev_raidz_expand_t *rra_vre;
-	zfs_locked_range_t *rra_lr;
-	uint64_t rra_txg;
+	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
+	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
+	uint64_t rra_txg;		/* TXG of this batch. */
+	uint_t rra_ashift;		/* Ashift of the vdev. */
+	uint32_t rra_tbd;		/* Number of in-flight ZIOs. */
+	uint32_t rra_writes;		/* Number of write ZIOs. */
+	zio_t *rra_zio[];		/* Write ZIO pointers. */
 } raidz_reflow_arg_t;
 
 /*
- * The write of the new location is done.
+ * Write of the new location on one child is done.  Once all of them are done
+ * we can unlock and free everything.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
@@ -3850,24 +3855,30 @@ raidz_reflow_write_done(zio_t *zio)
 		    zio->io_size;
 	}
 	cv_signal(&vre->vre_cv);
+	boolean_t done = (--rra->rra_tbd == 0);
 	mutex_exit(&vre->vre_lock);
 
-	zfs_rangelock_exit(rra->rra_lr);
-
-	kmem_free(rra, sizeof (*rra));
+	if (!done)
+		return;
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+	zfs_rangelock_exit(rra->rra_lr);
+	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
 }
 
 /*
- * The read of the old location is done.  The parent zio is the write to
- * the new location.  Allow it to start.
+ * Read of the old location on one child is done.  Once all of them are done
+ * writes should have all the data and we can issue them.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
+	/* Reads of only one block use write ABDs.  Bigger reads free gangs. */
+	if (zio->io_size > (1 << rra->rra_ashift))
+		abd_free(zio->io_abd);
+
 	/*
 	 * If the read failed, or if it was done on a vdev that is not fully
 	 * healthy (e.g. a child that has a resilver in progress), we may not
@@ -3891,7 +3902,11 @@ raidz_reflow_read_done(zio_t *zio)
 		mutex_exit(&vre->vre_lock);
 	}
 
-	zio_nowait(zio_unique_parent(zio));
+	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
+		return;
+	rra->rra_tbd = rra->rra_writes;
+	for (uint64_t i = 0; i < rra->rra_writes; i++)
+		zio_nowait(rra->rra_zio[i]);
 }
 
 static void
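
The read-side completion handling above follows a last-one-out pattern: every read callback decrements rra_tbd, and only the callback that drops it to zero repurposes the counter for the writes and issues them. Below is a minimal standalone model of that pattern, using C11 atomics in place of the SPL atomic_dec_32_nv() and plain function calls in place of ZIOs; all names and counts are illustrative, not ZFS code.

/*
 * Toy model: NREADS "read completions" arrive; the last one to finish
 * flips the shared counter over to the write phase and issues NWRITES
 * "writes".  In ZFS the calls to read_done() come from ZIO callbacks,
 * possibly concurrently, which is why the counter is atomic.
 */
#include <stdatomic.h>
#include <stdio.h>

#define	NREADS	3
#define	NWRITES	4

static atomic_uint tbd = NREADS;	/* in-flight reads, then writes */

static void
issue_write(unsigned i)
{
	printf("write %u issued\n", i);
}

static void
read_done(unsigned i)
{
	printf("read %u done\n", i);
	/* Only the completion that brings the count to zero proceeds. */
	if (atomic_fetch_sub(&tbd, 1) - 1 > 0)
		return;
	atomic_store(&tbd, NWRITES);	/* counter now tracks the writes */
	for (unsigned w = 0; w < NWRITES; w++)
		issue_write(w);
}

int
main(void)
{
	for (unsigned i = 0; i < NREADS; i++)
		read_done(i);
	return (0);
}
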
@@ -3932,21 +3947,19 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
-	int ashift = vd->vdev_top->vdev_ashift;
-	uint64_t offset, size;
+	uint_t ashift = vd->vdev_top->vdev_ashift;
 
-	if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
-	    &offset, &size)) {
+	range_seg_t *rs = range_tree_first(rt);
+	if (rs == NULL)
 		return (B_FALSE);
-	}
+	uint64_t offset = rs_get_start(rs, rt);
 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
+	uint64_t size = rs_get_end(rs, rt) - offset;
 	ASSERT3U(size, >=, 1 << ashift);
-	uint64_t length = 1 << ashift;
-	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
 
 	uint64_t blkid = offset >> ashift;
-
-	int old_children = vd->vdev_children - 1;
+	uint_t old_children = vd->vdev_children - 1;
 
 	/*
 	 * We can only progress to the point that writes will not overlap
@@ -3965,26 +3978,34 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 	uint64_t next_overwrite_blkid = ubsync_blkid +
 	    ubsync_blkid / old_children - old_children;
 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
-
 	if (blkid >= next_overwrite_blkid) {
 		raidz_reflow_record_progress(vre,
 		    next_overwrite_blkid << ashift, tx);
 		return (B_TRUE);
 	}
 
-	range_tree_remove(rt, offset, length);
+	size = MIN(size, raidz_expand_max_copy_bytes);
+	size = MIN(size, (uint64_t)old_children *
+	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
+	size = MAX(size, 1 << ashift);
+	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
+	size = (uint64_t)blocks << ashift;
 
-	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
+	range_tree_remove(rt, offset, size);
+
+	uint_t reads = MIN(blocks, old_children);
+	uint_t writes = MIN(blocks, vd->vdev_children);
+	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
+	    sizeof (zio_t *) * writes, KM_SLEEP);
 	rra->rra_vre = vre;
 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
-	    offset, length, RL_WRITER);
+	    offset, size, RL_WRITER);
 	rra->rra_txg = dmu_tx_get_txg(tx);
+	rra->rra_ashift = ashift;
+	rra->rra_tbd = reads;
+	rra->rra_writes = writes;
 
-	raidz_reflow_record_progress(vre, offset + length, tx);
-
-	mutex_enter(&vre->vre_lock);
-	vre->vre_outstanding_bytes += length;
-	mutex_exit(&vre->vre_lock);
+	raidz_reflow_record_progress(vre, offset + size, tx);
 
 	/*
 	 * SCL_STATE will be released when the read and write are done,
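
The sizing logic introduced above turns one free segment into a bounded batch: cap it at the copy tunable, cap it at roughly one maximum-size record per old child, then trim it to a whole number of ashift-sized blocks that stays below the overwrite limit. A small self-contained sketch of that arithmetic follows, with made-up values standing in for the real tunables (raidz_expand_max_copy_bytes, zfs_max_recordsize, SPA_MAXBLOCKSIZE).

/*
 * Sketch of the batch sizing: with a 4 GiB free segment, a 160 MiB copy
 * cap, 16 MiB records, 4 old children and an 8192-block overwrite limit,
 * the batch ends up as 8192 blocks (32 MiB).
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t raidz_expand_max_copy_bytes = 160ULL * 1024 * 1024;
	uint64_t zfs_max_recordsize = 16ULL * 1024 * 1024;
	uint64_t spa_maxblocksize = 16ULL * 1024 * 1024;
	unsigned ashift = 12;			/* 4 KiB blocks */
	unsigned old_children = 4;

	uint64_t size = 1ULL << 32;		/* free segment found */
	uint64_t blkid = 1000;
	uint64_t next_overwrite_blkid = blkid + 8192;

	size = MIN(size, raidz_expand_max_copy_bytes);
	size = MIN(size, (uint64_t)old_children *
	    MIN(zfs_max_recordsize, spa_maxblocksize));
	size = MAX(size, (uint64_t)1 << ashift);
	uint64_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
	size = blocks << ashift;

	printf("copy %" PRIu64 " blocks (%" PRIu64 " bytes) this batch\n",
	    blocks, size);
	return (0);
}
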
@@ -4006,29 +4027,61 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 		mutex_exit(&vre->vre_lock);
 
 		/* drop everything we acquired */
-		zfs_rangelock_exit(rra->rra_lr);
-		kmem_free(rra, sizeof (*rra));
 		spa_config_exit(spa, SCL_STATE, spa);
+		zfs_rangelock_exit(rra->rra_lr);
+		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
 		return (B_TRUE);
 	}
 
+	mutex_enter(&vre->vre_lock);
+	vre->vre_outstanding_bytes += size;
+	mutex_exit(&vre->vre_lock);
+
+	/* Allocate ABD and ZIO for each child we write. */
+	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	zio_t *pio = spa->spa_txg_zio[txgoff];
-	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
-	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
-	    vd->vdev_child[blkid % vd->vdev_children],
-	    (blkid / vd->vdev_children) << ashift,
-	    abd, length,
-	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
-	    ZIO_FLAG_CANFAIL,
-	    raidz_reflow_write_done, rra);
-
-	zio_nowait(zio_vdev_child_io(write_zio, NULL,
-	    vd->vdev_child[blkid % old_children],
-	    (blkid / old_children) << ashift,
-	    abd, length,
-	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
-	    ZIO_FLAG_CANFAIL,
-	    raidz_reflow_read_done, rra));
+	uint_t b = blocks / vd->vdev_children;
+	uint_t bb = blocks % vd->vdev_children;
+	for (uint_t i = 0; i < writes; i++) {
+		uint_t n = b + (i < bb);
+		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
+		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
+		    vd->vdev_child[(blkid + i) % vd->vdev_children],
+		    ((blkid + i) / vd->vdev_children) << ashift,
+		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
+	}
+
+	/*
+	 * Allocate and issue ZIO for each child we read.  For reads of only
+	 * one block we can use respective writer ABDs, since they will also
+	 * have only one block.  For bigger reads create gang ABDs and fill
+	 * them with respective blocks from writer ABDs.
+	 */
+	b = blocks / old_children;
+	bb = blocks % old_children;
+	for (uint_t i = 0; i < reads; i++) {
+		uint_t n = b + (i < bb);
+		abd_t *abd;
+		if (n > 1) {
+			abd = abd_alloc_gang();
+			for (uint_t j = 0; j < n; j++) {
+				uint_t b = j * old_children + i;
+				abd_t *cabd = abd_get_offset_size(
+				    rra->rra_zio[b % vd->vdev_children]->io_abd,
+				    (b / vd->vdev_children) << ashift,
+				    1 << ashift);
+				abd_gang_add(abd, cabd, B_TRUE);
+			}
+		} else {
+			abd = rra->rra_zio[i]->io_abd;
+		}
+		zio_nowait(zio_vdev_child_io(pio, NULL,
+		    vd->vdev_child[(blkid + i) % old_children],
+		    ((blkid + i) / old_children) << ashift, abd,
+		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
+	}
 
 	return (B_FALSE);
 }
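
The two loops above spread a run of consecutive logical blocks round-robin over the children: write i goes to child (blkid + i) % vdev_children at device offset ((blkid + i) / vdev_children) << ashift and carries b + (i < bb) blocks. Here is a toy, self-contained version of just that layout arithmetic; the disk count, starting blkid and batch size are invented for illustration and are not taken from the patch.

/*
 * Print how one batch of consecutive blocks maps to per-child write ZIOs:
 * each of the first min(blocks, children) ZIOs targets one child and gets
 * either floor(blocks / children) or one extra block.
 */
#include <stdio.h>

int
main(void)
{
	unsigned children = 5;			/* expanded vdev width */
	unsigned long long blkid = 17;		/* first logical block */
	unsigned blocks = 12;			/* blocks in this batch */
	unsigned ashift = 12;			/* 4 KiB allocation unit */

	unsigned b = blocks / children;
	unsigned bb = blocks % children;
	for (unsigned i = 0; i < children && i < blocks; i++) {
		unsigned n = b + (i < bb);	/* blocks for this child */
		unsigned child = (blkid + i) % children;
		unsigned long long off = ((blkid + i) / children) << ashift;
		printf("zio %u: child %u, offset %llu, %u blocks (%u bytes)\n",
		    i, child, off, n, n << ashift);
	}
	return (0);
}
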
@@ -4122,7 +4175,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		zio_nowait(zio_vdev_child_io(pio, NULL,
 		    raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
+		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4142,7 +4195,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], read_size, ZIO_TYPE_READ,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4197,7 +4250,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4246,7 +4299,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4355,8 +4408,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-		    write_size, ZIO_TYPE_READ,
-		    ZIO_PRIORITY_ASYNC_READ, 0,
+		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
@@ -4368,7 +4420,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
-		    ZIO_PRIORITY_ASYNC_WRITE, 0,
+		    ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
@@ -4490,8 +4542,11 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 		 * space.  Note that there may be a little bit more free
 		 * space (e.g. in ms_defer), and it's fine to copy that too.
 		 */
-		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
-		    NULL, 0, 0);
+		uint64_t shift, start;
+		range_seg_type_t type = metaslab_calculate_range_tree_type(
+		    raidvd, msp, &start, &shift);
+		range_tree_t *rt = range_tree_create(NULL, type, NULL,
+		    start, shift);
 		range_tree_add(rt, msp->ms_start, msp->ms_size);
 		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
 		mutex_exit(&msp->ms_lock);
@@ -4516,7 +4571,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 		 * when importing a pool with a expansion in progress),
 		 * discard any state that we have already processed.
 		 */
-		range_tree_clear(rt, 0, vre->vre_offset);
+		if (vre->vre_offset > msp->ms_start) {
+			range_tree_clear(rt, msp->ms_start,
+			    vre->vre_offset - msp->ms_start);
+		}
 
 		while (!zthr_iscancelled(zthr) &&
 		    !range_tree_is_empty(rt) &&
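
The guard added above matters because the range tree is now keyed per metaslab: the already-processed region to clear starts at ms_start, and nothing should be cleared when the saved offset has not yet reached this metaslab. A trivial standalone sketch of that clamp follows; the values are invented, and the extra length cap is a safety added here for illustration rather than something taken from the patch.

/* Clear only the part of this metaslab that was already copied. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static void
clear_processed(uint64_t ms_start, uint64_t ms_size, uint64_t vre_offset)
{
	if (vre_offset <= ms_start) {
		printf("metaslab at %" PRIu64 ": nothing to clear\n",
		    ms_start);
		return;
	}
	uint64_t len = vre_offset - ms_start;
	if (len > ms_size)	/* resume point is past this metaslab */
		len = ms_size;
	printf("metaslab at %" PRIu64 ": clear [%" PRIu64 ", %" PRIu64 ")\n",
	    ms_start, ms_start, ms_start + len);
}

int
main(void)
{
	uint64_t ms_size = 1ULL << 29;			/* 512 MiB metaslab */
	uint64_t vre_offset = (1ULL << 30) + 4096;	/* saved resume point */

	clear_processed(1ULL << 30, ms_size, vre_offset);	/* partly done */
	clear_processed(3ULL << 30, ms_size, vre_offset);	/* not reached */
	return (0);
}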