@@ -805,14 +805,11 @@ vbio_completion(struct bio *bio)
805
805
* to the ADB, with changes if appropriate.
806
806
*/
807
807
if (vbio -> vbio_abd != NULL ) {
808
- void * buf = abd_to_buf (vbio -> vbio_abd );
808
+ if (zio -> io_type == ZIO_TYPE_READ )
809
+ abd_copy (zio -> io_abd , vbio -> vbio_abd , zio -> io_size );
810
+
809
811
abd_free (vbio -> vbio_abd );
810
812
vbio -> vbio_abd = NULL ;
811
-
812
- if (zio -> io_type == ZIO_TYPE_READ )
813
- abd_return_buf_copy (zio -> io_abd , buf , zio -> io_size );
814
- else
815
- abd_return_buf (zio -> io_abd , buf , zio -> io_size );
816
813
}
817
814
818
815
/* Final cleanup */
@@ -834,38 +831,61 @@ vbio_completion(struct bio *bio)
834
831
* NOTE: if you change this function, change the copy in
835
832
* tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
836
833
* data there to validate the change you're making.
837
- *
838
834
*/
839
835
typedef struct {
840
- uint_t bmask ;
841
- uint_t npages ;
842
- uint_t end ;
843
- } vdev_disk_check_pages_t ;
836
+ size_t blocksize ;
837
+ int seen_first ;
838
+ int seen_last ;
839
+ } vdev_disk_check_alignment_t ;
844
840
845
841
static int
846
- vdev_disk_check_pages_cb (struct page * page , size_t off , size_t len , void * priv )
842
+ vdev_disk_check_alignment_cb (struct page * page , size_t off , size_t len ,
843
+ void * priv )
847
844
{
848
845
(void ) page ;
849
- vdev_disk_check_pages_t * s = priv ;
846
+ vdev_disk_check_alignment_t * s = priv ;
850
847
851
848
/*
852
- * If we didn't finish on a block size boundary last time, then there
853
- * would be a gap if we tried to use this ABD as-is, so abort.
849
+ * The cardinal rule: a single on-disk block must never cross an
850
+ * physical (order-0) page boundary, as the kernel expects to be able
851
+ * to split at both LBS and page boundaries.
852
+ *
853
+ * This implies various alignment rules for the blocks in this
854
+ * (possibly compound) page, which we can check for.
854
855
*/
855
- if (s -> end != 0 )
856
- return (1 );
857
856
858
857
/*
859
- * Note if we're taking less than a full block, so we can check it
860
- * above on the next call .
858
+ * If the previous page did not end on a page boundary, then we
859
+ * can't proceed without creating a hole .
861
860
*/
862
- s -> end = (off + len ) & s -> bmask ;
861
+ if (s -> seen_last )
862
+ return (1 );
863
863
864
- /* All blocks after the first must start on a block size boundary . */
865
- if (s -> npages != 0 && ( off & s -> bmask ) != 0 )
864
+ /* This page must contain only whole LBS-sized blocks . */
865
+ if (! IS_P2ALIGNED ( len , s -> blocksize ) )
866
866
return (1 );
867
867
868
- s -> npages ++ ;
868
+ /*
869
+ * If this is not the first page in the ABD, then the data must start
870
+ * on a page-aligned boundary (so the kernel can split on page
871
+ * boundaries without having to deal with a hole). If it is, then
872
+ * it can start on LBS-alignment.
873
+ */
874
+ if (s -> seen_first ) {
875
+ if (!IS_P2ALIGNED (off , PAGESIZE ))
876
+ return (1 );
877
+ } else {
878
+ if (!IS_P2ALIGNED (off , s -> blocksize ))
879
+ return (1 );
880
+ s -> seen_first = 1 ;
881
+ }
882
+
883
+ /*
884
+ * If this data does not end on a page-aligned boundary, then this
885
+ * must be the last page in the ABD, for the same reason.
886
+ */
887
+ s -> seen_last = !IS_P2ALIGNED (off + len , PAGESIZE );
888
+
869
889
return (0 );
870
890
}
871
891
@@ -874,15 +894,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
874
894
* the number of pages, or 0 if it can't be submitted like this.
875
895
*/
876
896
static boolean_t
877
- vdev_disk_check_pages (abd_t * abd , uint64_t size , struct block_device * bdev )
897
+ vdev_disk_check_alignment (abd_t * abd , uint64_t size , struct block_device * bdev )
878
898
{
879
- vdev_disk_check_pages_t s = {
880
- .bmask = bdev_logical_block_size (bdev )- 1 ,
881
- .npages = 0 ,
882
- .end = 0 ,
899
+ vdev_disk_check_alignment_t s = {
900
+ .blocksize = bdev_logical_block_size (bdev ),
883
901
};
884
902
885
- if (abd_iterate_page_func (abd , 0 , size , vdev_disk_check_pages_cb , & s ))
903
+ if (abd_iterate_page_func (abd , 0 , size ,
904
+ vdev_disk_check_alignment_cb , & s ))
886
905
return (B_FALSE );
887
906
888
907
return (B_TRUE );
@@ -916,37 +935,32 @@ vdev_disk_io_rw(zio_t *zio)
916
935
917
936
/*
918
937
* Check alignment of the incoming ABD. If any part of it would require
919
- * submitting a page that is not aligned to the logical block size,
920
- * then we take a copy into a linear buffer and submit that instead.
921
- * This should be impossible on a 512b LBS, and fairly rare on 4K,
922
- * usually requiring abnormally-small data blocks (eg gang blocks)
923
- * mixed into the same ABD as larger ones (eg aggregated).
938
+ * submitting a page that is not aligned to both the logical block size
939
+ * and the page size, then we take a copy into a new memory region with
940
+ * correct alignment. This should be impossible on a 512b LBS. On
941
+ * larger blocks, this can happen at least when a small number of
942
+ * blocks (usually 1) are allocated from a shared slab, or when
943
+ * abnormally-small data regions (eg gang headers) are mixed into the
944
+ * same ABD as larger allocations (eg aggregations).
924
945
*/
925
946
abd_t * abd = zio -> io_abd ;
926
- if (!vdev_disk_check_pages (abd , zio -> io_size , bdev )) {
927
- void * buf ;
928
- if (zio -> io_type == ZIO_TYPE_READ )
929
- buf = abd_borrow_buf (zio -> io_abd , zio -> io_size );
930
- else
931
- buf = abd_borrow_buf_copy (zio -> io_abd , zio -> io_size );
947
+ if (!vdev_disk_check_alignment (abd , zio -> io_size , bdev )) {
948
+ /* Allocate a new memory region with guaranteed alignment */
949
+ abd = abd_alloc_for_io (zio -> io_size ,
950
+ zio -> io_abd -> abd_flags & ABD_FLAG_META );
932
951
933
- /*
934
- * Wrap the copy in an abd_t, so we can use the same iterators
935
- * to count and fill the vbio later.
936
- */
937
- abd = abd_get_from_buf (buf , zio -> io_size );
952
+ /* If we're writing copy our data into it */
953
+ if (zio -> io_type == ZIO_TYPE_WRITE )
954
+ abd_copy (abd , zio -> io_abd , zio -> io_size );
938
955
939
956
/*
940
- * False here would mean the borrowed copy has an invalid
941
- * alignment too, which would mean we've somehow been passed a
942
- * linear ABD with an interior page that has a non-zero offset
943
- * or a size not a multiple of PAGE_SIZE. This is not possible.
944
- * It would mean either zio_buf_alloc() or its underlying
945
- * allocators have done something extremely strange, or our
946
- * math in vdev_disk_check_pages() is wrong. In either case,
957
+ * False here would mean the new allocation has an invalid
958
+ * alignment too, which would mean that abd_alloc() is not
959
+ * guaranteeing this, or our logic in
960
+ * vdev_disk_check_alignment() is wrong. In either case,
947
961
* something in seriously wrong and its not safe to continue.
948
962
*/
949
- VERIFY (vdev_disk_check_pages (abd , zio -> io_size , bdev ));
963
+ VERIFY (vdev_disk_check_alignment (abd , zio -> io_size , bdev ));
950
964
}
951
965
952
966
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
0 commit comments