Skip to content

Commit f237b8e

Browse files
robntonyhutter
authored andcommitted
vdev_disk: try harder to ensure IO alignment rules
It seems out our notion of "properly" aligned IO was incomplete. In particular, dm-crypt does its own splitting, and assumes that a logical block will never cross an order-0 page boundary (ie, the physical page size, not compound size). This effectively means that it needs to be possible to split a BIO at any page or block size boundary and have it work correctly. This updates the alignment check function to enforce these rules (to the extent possible). Our response to misaligned data is to make some new allocation that is properly aligned, and copy the data into it. It turns out that linearising (via abd_borrow_buf()) is not enough, because we allocate eg 4K blocks from a general purpose slab, and so may receive (or already have) a 4K block that crosses pages. So instead, we allocate a new ABD, which is guaranteed to be aligned properly to block sizes, and then copy everything into it, and back out on the way back. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes openzfs#16687 openzfs#16631 openzfs#15646 openzfs#15533 openzfs#14533 (cherry picked from commit 63bafe6)
1 parent 727506c commit f237b8e

File tree

1 file changed

+68
-52
lines changed

1 file changed

+68
-52
lines changed

module/os/linux/zfs/vdev_disk.c

Lines changed: 68 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -832,14 +832,11 @@ BIO_END_IO_PROTO(vbio_completion, bio, error)
832832
* to the ADB, with changes if appropriate.
833833
*/
834834
if (vbio->vbio_abd != NULL) {
835-
void *buf = abd_to_buf(vbio->vbio_abd);
835+
if (zio->io_type == ZIO_TYPE_READ)
836+
abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size);
837+
836838
abd_free(vbio->vbio_abd);
837839
vbio->vbio_abd = NULL;
838-
839-
if (zio->io_type == ZIO_TYPE_READ)
840-
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
841-
else
842-
abd_return_buf(zio->io_abd, buf, zio->io_size);
843840
}
844841

845842
/* Final cleanup */
@@ -859,34 +856,59 @@ BIO_END_IO_PROTO(vbio_completion, bio, error)
859856
* split the BIO, the two halves will still be properly aligned.
860857
*/
861858
typedef struct {
862-
uint_t bmask;
863-
uint_t npages;
864-
uint_t end;
865-
} vdev_disk_check_pages_t;
859+
size_t blocksize;
860+
int seen_first;
861+
int seen_last;
862+
} vdev_disk_check_alignment_t;
866863

867864
static int
868-
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
865+
vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
866+
void *priv)
869867
{
870-
vdev_disk_check_pages_t *s = priv;
868+
(void) page;
869+
vdev_disk_check_alignment_t *s = priv;
871870

872871
/*
873-
* If we didn't finish on a block size boundary last time, then there
874-
* would be a gap if we tried to use this ABD as-is, so abort.
872+
* The cardinal rule: a single on-disk block must never cross an
873+
* physical (order-0) page boundary, as the kernel expects to be able
874+
* to split at both LBS and page boundaries.
875+
*
876+
* This implies various alignment rules for the blocks in this
877+
* (possibly compound) page, which we can check for.
875878
*/
876-
if (s->end != 0)
877-
return (1);
878879

879880
/*
880-
* Note if we're taking less than a full block, so we can check it
881-
* above on the next call.
881+
* If the previous page did not end on a page boundary, then we
882+
* can't proceed without creating a hole.
882883
*/
883-
s->end = (off+len) & s->bmask;
884+
if (s->seen_last)
885+
return (1);
884886

885-
/* All blocks after the first must start on a block size boundary. */
886-
if (s->npages != 0 && (off & s->bmask) != 0)
887+
/* This page must contain only whole LBS-sized blocks. */
888+
if (!IS_P2ALIGNED(len, s->blocksize))
887889
return (1);
888890

889-
s->npages++;
891+
/*
892+
* If this is not the first page in the ABD, then the data must start
893+
* on a page-aligned boundary (so the kernel can split on page
894+
* boundaries without having to deal with a hole). If it is, then
895+
* it can start on LBS-alignment.
896+
*/
897+
if (s->seen_first) {
898+
if (!IS_P2ALIGNED(off, PAGESIZE))
899+
return (1);
900+
} else {
901+
if (!IS_P2ALIGNED(off, s->blocksize))
902+
return (1);
903+
s->seen_first = 1;
904+
}
905+
906+
/*
907+
* If this data does not end on a page-aligned boundary, then this
908+
* must be the last page in the ABD, for the same reason.
909+
*/
910+
s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);
911+
890912
return (0);
891913
}
892914

@@ -895,15 +917,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
895917
* the number of pages, or 0 if it can't be submitted like this.
896918
*/
897919
static boolean_t
898-
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
920+
vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
899921
{
900-
vdev_disk_check_pages_t s = {
901-
.bmask = bdev_logical_block_size(bdev)-1,
902-
.npages = 0,
903-
.end = 0,
922+
vdev_disk_check_alignment_t s = {
923+
.blocksize = bdev_logical_block_size(bdev),
904924
};
905925

906-
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
926+
if (abd_iterate_page_func(abd, 0, size,
927+
vdev_disk_check_alignment_cb, &s))
907928
return (B_FALSE);
908929

909930
return (B_TRUE);
@@ -937,37 +958,32 @@ vdev_disk_io_rw(zio_t *zio)
937958

938959
/*
939960
* Check alignment of the incoming ABD. If any part of it would require
940-
* submitting a page that is not aligned to the logical block size,
941-
* then we take a copy into a linear buffer and submit that instead.
942-
* This should be impossible on a 512b LBS, and fairly rare on 4K,
943-
* usually requiring abnormally-small data blocks (eg gang blocks)
944-
* mixed into the same ABD as larger ones (eg aggregated).
961+
* submitting a page that is not aligned to both the logical block size
962+
* and the page size, then we take a copy into a new memory region with
963+
* correct alignment. This should be impossible on a 512b LBS. On
964+
* larger blocks, this can happen at least when a small number of
965+
* blocks (usually 1) are allocated from a shared slab, or when
966+
* abnormally-small data regions (eg gang headers) are mixed into the
967+
* same ABD as larger allocations (eg aggregations).
945968
*/
946969
abd_t *abd = zio->io_abd;
947-
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
948-
void *buf;
949-
if (zio->io_type == ZIO_TYPE_READ)
950-
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
951-
else
952-
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
970+
if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
971+
/* Allocate a new memory region with guaranteed alignment */
972+
abd = abd_alloc_for_io(zio->io_size,
973+
zio->io_abd->abd_flags & ABD_FLAG_META);
953974

954-
/*
955-
* Wrap the copy in an abd_t, so we can use the same iterators
956-
* to count and fill the vbio later.
957-
*/
958-
abd = abd_get_from_buf(buf, zio->io_size);
975+
/* If we're writing copy our data into it */
976+
if (zio->io_type == ZIO_TYPE_WRITE)
977+
abd_copy(abd, zio->io_abd, zio->io_size);
959978

960979
/*
961-
* False here would mean the borrowed copy has an invalid
962-
* alignment too, which would mean we've somehow been passed a
963-
* linear ABD with an interior page that has a non-zero offset
964-
* or a size not a multiple of PAGE_SIZE. This is not possible.
965-
* It would mean either zio_buf_alloc() or its underlying
966-
* allocators have done something extremely strange, or our
967-
* math in vdev_disk_check_pages() is wrong. In either case,
980+
* False here would mean the new allocation has an invalid
981+
* alignment too, which would mean that abd_alloc() is not
982+
* guaranteeing this, or our logic in
983+
* vdev_disk_check_alignment() is wrong. In either case,
968984
* something in seriously wrong and its not safe to continue.
969985
*/
970-
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
986+
VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
971987
}
972988

973989
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */

0 commit comments

Comments
 (0)