@@ -357,7 +357,11 @@ uint_t raidz_expand_pause_point = 0;
 /*
  * Maximum amount of copy I/O outstanding at once.
  */
+#ifdef _ILP32
+static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
+#else
 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
+#endif
 
 /*
  * Apply raidz map abds aggregation if the number of rows in the map is equal
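The hunk above makes the outstanding-copy budget platform dependent: 32-bit (`_ILP32`) builds, where kernel address space for ABDs is presumably scarce, get a single maximum-size block, while everything else keeps ten. A standalone sketch of the resulting numbers, assuming SPA_MAXBLOCKSIZE is 1 << 24 (16 MiB) as in ZFS:

```c
/* Hypothetical illustration only, not ZFS code. */
#include <stdio.h>

#define	SPA_MAXBLOCKSIZE	(1ULL << 24)	/* 16 MiB */

int
main(void)
{
#ifdef _ILP32
	unsigned long long budget = SPA_MAXBLOCKSIZE;		/* 16 MiB */
#else
	unsigned long long budget = 10 * SPA_MAXBLOCKSIZE;	/* 160 MiB */
#endif
	printf("max outstanding copy bytes: %llu MiB\n", budget >> 20);
	return (0);
}
```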
@@ -3817,16 +3821,21 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 }
 
 /*
- * Struct for one copy zio.
+ * State of one copy batch.
  */
 typedef struct raidz_reflow_arg {
-	vdev_raidz_expand_t *rra_vre;
-	zfs_locked_range_t *rra_lr;
-	uint64_t rra_txg;
+	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
+	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
+	uint64_t rra_txg;		/* TXG of this batch. */
+	uint_t rra_ashift;		/* Ashift of the vdev. */
+	uint32_t rra_tbd;		/* Number of in-flight ZIOs. */
+	uint32_t rra_writes;		/* Number of write ZIOs. */
+	zio_t *rra_zio[];		/* Write ZIO pointers. */
 } raidz_reflow_arg_t;
 
 /*
- * The write of the new location is done.
+ * Write of the new location on one child is done.  Once all of them are
+ * done, we can unlock and free everything.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
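The struct now holds the state of a whole batch and ends with a C99 flexible array member sized by the number of write ZIOs, so the header and the pointer array come from a single allocation. A minimal userland sketch of that pattern (hypothetical names; malloc/free stand in for kmem_zalloc/kmem_free):

```c
#include <stdlib.h>

typedef struct zio zio_t;		/* opaque in this sketch */

typedef struct raidz_reflow_arg {
	/* ... fixed fields ... */
	unsigned int rra_writes;	/* Number of write ZIOs. */
	zio_t *rra_zio[];		/* C99 flexible array member. */
} raidz_reflow_arg_t;

static raidz_reflow_arg_t *
rra_alloc(unsigned int writes)
{
	/* One allocation covers the header and the pointer array. */
	raidz_reflow_arg_t *rra = calloc(1,
	    sizeof (*rra) + sizeof (zio_t *) * writes);

	if (rra != NULL)
		rra->rra_writes = writes;
	return (rra);
}

static void
rra_free(raidz_reflow_arg_t *rra)
{
	/* kmem_free() needs the allocation size; free() does not. */
	free(rra);
}
```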
@@ -3850,24 +3859,30 @@ raidz_reflow_write_done(zio_t *zio)
 		    zio->io_size;
 	}
 	cv_signal(&vre->vre_cv);
+	boolean_t done = (--rra->rra_tbd == 0);
 	mutex_exit(&vre->vre_lock);
 
-	zfs_rangelock_exit(rra->rra_lr);
-
-	kmem_free(rra, sizeof (*rra));
+	if (!done)
+		return;
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+	zfs_rangelock_exit(rra->rra_lr);
+	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
 }
 
 /*
- * The read of the old location is done.  The parent zio is the write to
- * the new location.  Allow it to start.
+ * Read of the old location on one child is done.  Once all of them are
+ * done, the writes have all the data and we can issue them.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
+	/* Single-block reads borrow a write ABD; free the bigger gang ABDs. */
+	if (zio->io_size > (1 << rra->rra_ashift))
+		abd_free(zio->io_abd);
+
 	/*
 	 * If the read failed, or if it was done on a vdev that is not fully
 	 * healthy (e.g. a child that has a resilver in progress), we may not
@@ -3891,7 +3906,11 @@ raidz_reflow_read_done(zio_t *zio)
 		mutex_exit(&vre->vre_lock);
 	}
 
-	zio_nowait(zio_unique_parent(zio));
+	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
+		return;
+	rra->rra_tbd = rra->rra_writes;
+	for (uint64_t i = 0; i < rra->rra_writes; i++)
+		zio_nowait(rra->rra_zio[i]);
 }
 
 static void
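Both completion callbacks now follow a last-one-out pattern: every child ZIO decrements rra_tbd, and only the caller that drops it to zero proceeds, issuing the queued writes on the read side or unlocking and freeing on the write side. A minimal sketch with C11 atomics (note atomic_fetch_sub() returns the previous value, while the kernel's atomic_dec_32_nv() used above returns the new one):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct batch {
	atomic_uint tbd;	/* completions still outstanding */
} batch_t;

/* Returns true for exactly one caller: the final completion. */
static bool
batch_complete_one(batch_t *b)
{
	return (atomic_fetch_sub(&b->tbd, 1) == 1);
}

int
main(void)
{
	batch_t b = { 4 };

	for (int i = 0; i < 4; i++) {
		if (batch_complete_one(&b))
			printf("completion %d was last\n", i);
	}
	return (0);
}
```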
@@ -3932,21 +3951,19 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
-	int ashift = vd->vdev_top->vdev_ashift;
-	uint64_t offset, size;
+	uint_t ashift = vd->vdev_top->vdev_ashift;
 
-	if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
-	    &offset, &size)) {
+	range_seg_t *rs = range_tree_first(rt);
+	if (rs == NULL)
 		return (B_FALSE);
-	}
+	uint64_t offset = rs_get_start(rs, rt);
 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
+	uint64_t size = rs_get_end(rs, rt) - offset;
 	ASSERT3U(size, >=, 1 << ashift);
-	uint64_t length = 1 << ashift;
-	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
 
 	uint64_t blkid = offset >> ashift;
-
-	int old_children = vd->vdev_children - 1;
+	uint_t old_children = vd->vdev_children - 1;
 
 	/*
 	 * We can only progress to the point that writes will not overlap
@@ -3965,26 +3982,34 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 	uint64_t next_overwrite_blkid = ubsync_blkid +
 	    ubsync_blkid / old_children - old_children;
 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
-
 	if (blkid >= next_overwrite_blkid) {
 		raidz_reflow_record_progress(vre,
 		    next_overwrite_blkid << ashift, tx);
 		return (B_TRUE);
 	}
 
-	range_tree_remove(rt, offset, length);
+	size = MIN(size, raidz_expand_max_copy_bytes);
+	size = MIN(size, (uint64_t)old_children *
+	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
+	size = MAX(size, 1 << ashift);
+	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
+	size = (uint64_t)blocks << ashift;
+
+	range_tree_remove(rt, offset, size);
 
-	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
+	uint_t reads = MIN(blocks, old_children);
+	uint_t writes = MIN(blocks, vd->vdev_children);
+	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
+	    sizeof (zio_t *) * writes, KM_SLEEP);
 	rra->rra_vre = vre;
 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
-	    offset, length, RL_WRITER);
+	    offset, size, RL_WRITER);
 	rra->rra_txg = dmu_tx_get_txg(tx);
+	rra->rra_ashift = ashift;
+	rra->rra_tbd = reads;
+	rra->rra_writes = writes;
 
-	raidz_reflow_record_progress(vre, offset + length, tx);
-
-	mutex_enter(&vre->vre_lock);
-	vre->vre_outstanding_bytes += length;
-	mutex_exit(&vre->vre_lock);
+	raidz_reflow_record_progress(vre, offset + size, tx);
 
 	/*
 	 * SCL_STATE will be released when the read and write are done,
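The clamping above bounds each batch by the global copy budget and by one maximum-size record per old child, then rounds it down to whole ashift-sized blocks, additionally capped at next_overwrite_blkid - blkid. A worked standalone example with hypothetical values (4 old children, ashift 12, a 1 GiB free segment):

```c
/* Hypothetical illustration only, not ZFS code. */
#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t size = 1ULL << 30;		/* 1 GiB free segment */
	uint64_t max_copy = 10ULL << 24;	/* 160 MiB budget */
	uint64_t max_record = 1ULL << 24;	/* 16 MiB records */
	unsigned int ashift = 12, old_children = 4;

	size = MIN(size, max_copy);				/* 160 MiB */
	size = MIN(size, (uint64_t)old_children * max_record);	/* 64 MiB */
	size = MAX(size, 1ULL << ashift);
	printf("%llu blocks of %u bytes\n",
	    (unsigned long long)(size >> ashift), 1U << ashift);
	return (0);
}
```

This prints "16384 blocks of 4096 bytes": the 1 GiB segment collapses to 64 MiB, one 16 MiB record per old child.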
@@ -4006,29 +4031,61 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
 		mutex_exit(&vre->vre_lock);
 
 		/* drop everything we acquired */
-		zfs_rangelock_exit(rra->rra_lr);
-		kmem_free(rra, sizeof (*rra));
 		spa_config_exit(spa, SCL_STATE, spa);
+		zfs_rangelock_exit(rra->rra_lr);
+		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
 		return (B_TRUE);
 	}
 
+	mutex_enter(&vre->vre_lock);
+	vre->vre_outstanding_bytes += size;
+	mutex_exit(&vre->vre_lock);
+
+	/* Allocate ABD and ZIO for each child we write. */
+	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	zio_t *pio = spa->spa_txg_zio[txgoff];
-	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
-	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
-	    vd->vdev_child[blkid % vd->vdev_children],
-	    (blkid / vd->vdev_children) << ashift,
-	    abd, length,
-	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
-	    ZIO_FLAG_CANFAIL,
-	    raidz_reflow_write_done, rra);
-
-	zio_nowait(zio_vdev_child_io(write_zio, NULL,
-	    vd->vdev_child[blkid % old_children],
-	    (blkid / old_children) << ashift,
-	    abd, length,
-	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
-	    ZIO_FLAG_CANFAIL,
-	    raidz_reflow_read_done, rra));
+	uint_t b = blocks / vd->vdev_children;
+	uint_t bb = blocks % vd->vdev_children;
+	for (uint_t i = 0; i < writes; i++) {
+		uint_t n = b + (i < bb);
+		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
+		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
+		    vd->vdev_child[(blkid + i) % vd->vdev_children],
+		    ((blkid + i) / vd->vdev_children) << ashift,
+		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
+	}
+
+	/*
+	 * Allocate and issue a ZIO for each child we read.  Reads of only
+	 * one block can use the respective writer ABDs, since those will
+	 * also have only one block.  For bigger reads create gang ABDs and
+	 * fill them with the respective blocks from the writer ABDs.
+	 */
+	b = blocks / old_children;
+	bb = blocks % old_children;
+	for (uint_t i = 0; i < reads; i++) {
+		uint_t n = b + (i < bb);
+		abd_t *abd;
+		if (n > 1) {
+			abd = abd_alloc_gang();
+			for (uint_t j = 0; j < n; j++) {
+				uint_t b = j * old_children + i;
+				abd_t *cabd = abd_get_offset_size(
+				    rra->rra_zio[b % vd->vdev_children]->io_abd,
+				    (b / vd->vdev_children) << ashift,
+				    1 << ashift);
+				abd_gang_add(abd, cabd, B_TRUE);
+			}
+		} else {
+			abd = rra->rra_zio[i]->io_abd;
+		}
+		zio_nowait(zio_vdev_child_io(pio, NULL,
+		    vd->vdev_child[(blkid + i) % old_children],
+		    ((blkid + i) / old_children) << ashift, abd,
+		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
+	}
 
 	return (B_FALSE);
 }
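The two loops above spread a batch round robin: logical block b lands on child (blkid + b) % children at row (blkid + b) / children. Reader i on the old layout therefore owns blocks i, i + old_children, i + 2 * old_children, and so on, which is why multi-block reads are stitched together as gang ABDs pointing into the writer buffers. A small sketch of the mapping (hypothetical helper, not ZFS code):

```c
#include <stdint.h>
#include <stdio.h>

static void
map_block(uint64_t blkid, uint64_t b, unsigned int children,
    unsigned int *child, uint64_t *row)
{
	*child = (unsigned int)((blkid + b) % children);
	*row = (blkid + b) / children;
}

int
main(void)
{
	unsigned int child;
	uint64_t row;

	/* A batch of 8 blocks starting at blkid 100 on 5 children. */
	for (uint64_t b = 0; b < 8; b++) {
		map_block(100, b, 5, &child, &row);
		printf("block %llu -> child %u, row %llu\n",
		    (unsigned long long)b, child, (unsigned long long)row);
	}
	return (0);
}
```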
@@ -4122,7 +4179,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		zio_nowait(zio_vdev_child_io(pio, NULL,
 		    raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
+		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4142,7 +4199,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], read_size, ZIO_TYPE_READ,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4197,7 +4254,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4246,7 +4303,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
@@ -4355,8 +4412,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
-		    write_size, ZIO_TYPE_READ,
-		    ZIO_PRIORITY_ASYNC_READ, 0,
+		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
@@ -4368,7 +4424,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
-		    ZIO_PRIORITY_ASYNC_WRITE, 0,
+		    ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
@@ -4490,8 +4546,11 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 		 * space.  Note that there may be a little bit more free
 		 * space (e.g. in ms_defer), and it's fine to copy that too.
 		 */
-		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
-		    NULL, 0, 0);
+		uint64_t shift, start;
+		range_seg_type_t type = metaslab_calculate_range_tree_type(
+		    raidvd, msp, &start, &shift);
+		range_tree_t *rt = range_tree_create(NULL, type, NULL,
+		    start, shift);
 		range_tree_add(rt, msp->ms_start, msp->ms_size);
 		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
 		mutex_exit(&msp->ms_lock);
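The tree built above is the complement of ms_allocatable: the whole metaslab is added, then every known-free range is subtracted, leaving the space that must be copied (plus perhaps a little deferred-free space, which is harmless to copy). A toy model of that subtraction using a bitmap instead of a range tree (hypothetical, not the ZFS API):

```c
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define	NBLKS	16

int
main(void)
{
	bool to_copy[NBLKS];

	/* range_tree_add(): start with the whole metaslab. */
	memset(to_copy, 1, sizeof (to_copy));

	/* range_tree_walk(ms_allocatable, range_tree_remove): drop free. */
	struct { int start, len; } frees[] = { { 2, 3 }, { 9, 1 } };
	for (size_t i = 0; i < sizeof (frees) / sizeof (frees[0]); i++)
		for (int b = 0; b < frees[i].len; b++)
			to_copy[frees[i].start + b] = false;

	/* Whatever remains is allocated and must be reflowed. */
	for (int b = 0; b < NBLKS; b++)
		if (to_copy[b])
			printf("copy block %d\n", b);
	return (0);
}
```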
@@ -4516,7 +4575,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 		 * when importing a pool with an expansion in progress),
 		 * discard any state that we have already processed.
 		 */
-		range_tree_clear(rt, 0, vre->vre_offset);
+		if (vre->vre_offset > msp->ms_start) {
+			range_tree_clear(rt, msp->ms_start,
+			    vre->vre_offset - msp->ms_start);
+		}
 
 		while (!zthr_iscancelled(zthr) &&
 		    !range_tree_is_empty(rt) &&