@@ -832,14 +832,11 @@ BIO_END_IO_PROTO(vbio_completion, bio, error)
832
832
* to the ADB, with changes if appropriate.
833
833
*/
834
834
if (vbio -> vbio_abd != NULL ) {
835
- void * buf = abd_to_buf (vbio -> vbio_abd );
835
+ if (zio -> io_type == ZIO_TYPE_READ )
836
+ abd_copy (zio -> io_abd , vbio -> vbio_abd , zio -> io_size );
837
+
836
838
abd_free (vbio -> vbio_abd );
837
839
vbio -> vbio_abd = NULL ;
838
-
839
- if (zio -> io_type == ZIO_TYPE_READ )
840
- abd_return_buf_copy (zio -> io_abd , buf , zio -> io_size );
841
- else
842
- abd_return_buf (zio -> io_abd , buf , zio -> io_size );
843
840
}
844
841
845
842
/* Final cleanup */
@@ -859,34 +856,59 @@ BIO_END_IO_PROTO(vbio_completion, bio, error)
859
856
* split the BIO, the two halves will still be properly aligned.
860
857
*/
861
858
typedef struct {
862
- uint_t bmask ;
863
- uint_t npages ;
864
- uint_t end ;
865
- } vdev_disk_check_pages_t ;
859
+ size_t blocksize ;
860
+ int seen_first ;
861
+ int seen_last ;
862
+ } vdev_disk_check_alignment_t ;
866
863
867
864
static int
868
- vdev_disk_check_pages_cb (struct page * page , size_t off , size_t len , void * priv )
865
+ vdev_disk_check_alignment_cb (struct page * page , size_t off , size_t len ,
866
+ void * priv )
869
867
{
870
- vdev_disk_check_pages_t * s = priv ;
868
+ (void ) page ;
869
+ vdev_disk_check_alignment_t * s = priv ;
871
870
872
871
/*
873
- * If we didn't finish on a block size boundary last time, then there
874
- * would be a gap if we tried to use this ABD as-is, so abort.
872
+ * The cardinal rule: a single on-disk block must never cross an
873
+ * physical (order-0) page boundary, as the kernel expects to be able
874
+ * to split at both LBS and page boundaries.
875
+ *
876
+ * This implies various alignment rules for the blocks in this
877
+ * (possibly compound) page, which we can check for.
875
878
*/
876
- if (s -> end != 0 )
877
- return (1 );
878
879
879
880
/*
880
- * Note if we're taking less than a full block, so we can check it
881
- * above on the next call .
881
+ * If the previous page did not end on a page boundary, then we
882
+ * can't proceed without creating a hole .
882
883
*/
883
- s -> end = (off + len ) & s -> bmask ;
884
+ if (s -> seen_last )
885
+ return (1 );
884
886
885
- /* All blocks after the first must start on a block size boundary . */
886
- if (s -> npages != 0 && ( off & s -> bmask ) != 0 )
887
+ /* This page must contain only whole LBS-sized blocks . */
888
+ if (! IS_P2ALIGNED ( len , s -> blocksize ) )
887
889
return (1 );
888
890
889
- s -> npages ++ ;
891
+ /*
892
+ * If this is not the first page in the ABD, then the data must start
893
+ * on a page-aligned boundary (so the kernel can split on page
894
+ * boundaries without having to deal with a hole). If it is, then
895
+ * it can start on LBS-alignment.
896
+ */
897
+ if (s -> seen_first ) {
898
+ if (!IS_P2ALIGNED (off , PAGESIZE ))
899
+ return (1 );
900
+ } else {
901
+ if (!IS_P2ALIGNED (off , s -> blocksize ))
902
+ return (1 );
903
+ s -> seen_first = 1 ;
904
+ }
905
+
906
+ /*
907
+ * If this data does not end on a page-aligned boundary, then this
908
+ * must be the last page in the ABD, for the same reason.
909
+ */
910
+ s -> seen_last = !IS_P2ALIGNED (off + len , PAGESIZE );
911
+
890
912
return (0 );
891
913
}
892
914
@@ -895,15 +917,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
895
917
* the number of pages, or 0 if it can't be submitted like this.
896
918
*/
897
919
static boolean_t
898
- vdev_disk_check_pages (abd_t * abd , uint64_t size , struct block_device * bdev )
920
+ vdev_disk_check_alignment (abd_t * abd , uint64_t size , struct block_device * bdev )
899
921
{
900
- vdev_disk_check_pages_t s = {
901
- .bmask = bdev_logical_block_size (bdev )- 1 ,
902
- .npages = 0 ,
903
- .end = 0 ,
922
+ vdev_disk_check_alignment_t s = {
923
+ .blocksize = bdev_logical_block_size (bdev ),
904
924
};
905
925
906
- if (abd_iterate_page_func (abd , 0 , size , vdev_disk_check_pages_cb , & s ))
926
+ if (abd_iterate_page_func (abd , 0 , size ,
927
+ vdev_disk_check_alignment_cb , & s ))
907
928
return (B_FALSE );
908
929
909
930
return (B_TRUE );
@@ -937,37 +958,32 @@ vdev_disk_io_rw(zio_t *zio)
937
958
938
959
/*
939
960
* Check alignment of the incoming ABD. If any part of it would require
940
- * submitting a page that is not aligned to the logical block size,
941
- * then we take a copy into a linear buffer and submit that instead.
942
- * This should be impossible on a 512b LBS, and fairly rare on 4K,
943
- * usually requiring abnormally-small data blocks (eg gang blocks)
944
- * mixed into the same ABD as larger ones (eg aggregated).
961
+ * submitting a page that is not aligned to both the logical block size
962
+ * and the page size, then we take a copy into a new memory region with
963
+ * correct alignment. This should be impossible on a 512b LBS. On
964
+ * larger blocks, this can happen at least when a small number of
965
+ * blocks (usually 1) are allocated from a shared slab, or when
966
+ * abnormally-small data regions (eg gang headers) are mixed into the
967
+ * same ABD as larger allocations (eg aggregations).
945
968
*/
946
969
abd_t * abd = zio -> io_abd ;
947
- if (!vdev_disk_check_pages (abd , zio -> io_size , bdev )) {
948
- void * buf ;
949
- if (zio -> io_type == ZIO_TYPE_READ )
950
- buf = abd_borrow_buf (zio -> io_abd , zio -> io_size );
951
- else
952
- buf = abd_borrow_buf_copy (zio -> io_abd , zio -> io_size );
970
+ if (!vdev_disk_check_alignment (abd , zio -> io_size , bdev )) {
971
+ /* Allocate a new memory region with guaranteed alignment */
972
+ abd = abd_alloc_for_io (zio -> io_size ,
973
+ zio -> io_abd -> abd_flags & ABD_FLAG_META );
953
974
954
- /*
955
- * Wrap the copy in an abd_t, so we can use the same iterators
956
- * to count and fill the vbio later.
957
- */
958
- abd = abd_get_from_buf (buf , zio -> io_size );
975
+ /* If we're writing copy our data into it */
976
+ if (zio -> io_type == ZIO_TYPE_WRITE )
977
+ abd_copy (abd , zio -> io_abd , zio -> io_size );
959
978
960
979
/*
961
- * False here would mean the borrowed copy has an invalid
962
- * alignment too, which would mean we've somehow been passed a
963
- * linear ABD with an interior page that has a non-zero offset
964
- * or a size not a multiple of PAGE_SIZE. This is not possible.
965
- * It would mean either zio_buf_alloc() or its underlying
966
- * allocators have done something extremely strange, or our
967
- * math in vdev_disk_check_pages() is wrong. In either case,
980
+ * False here would mean the new allocation has an invalid
981
+ * alignment too, which would mean that abd_alloc() is not
982
+ * guaranteeing this, or our logic in
983
+ * vdev_disk_check_alignment() is wrong. In either case,
968
984
* something in seriously wrong and its not safe to continue.
969
985
*/
970
- VERIFY (vdev_disk_check_pages (abd , zio -> io_size , bdev ));
986
+ VERIFY (vdev_disk_check_alignment (abd , zio -> io_size , bdev ));
971
987
}
972
988
973
989
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
0 commit comments