Skip to content

Commit a00093b

Browse files
robnlundman
authored andcommitted
vdev_disk: try harder to ensure IO alignment rules
It seems out our notion of "properly" aligned IO was incomplete. In particular, dm-crypt does its own splitting, and assumes that a logical block will never cross an order-0 page boundary (ie, the physical page size, not compound size). This effectively means that it needs to be possible to split a BIO at any page or block size boundary and have it work correctly. This updates the alignment check function to enforce these rules (to the extent possible). Our response to misaligned data is to make some new allocation that is properly aligned, and copy the data into it. It turns out that linearising (via abd_borrow_buf()) is not enough, because we allocate eg 4K blocks from a general purpose slab, and so may receive (or already have) a 4K block that crosses pages. So instead, we allocate a new ABD, which is guaranteed to be aligned properly to block sizes, and then copy everything into it, and back out on the way back. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf <[email protected]> Reviewed-by: Alexander Motin <[email protected]> Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Rob Norris <[email protected]> Closes openzfs#16687 openzfs#16631 openzfs#15646 openzfs#15533 openzfs#14533
1 parent 8a2e7a3 commit a00093b

File tree

2 files changed

+230
-154
lines changed

2 files changed

+230
-154
lines changed

module/os/linux/zfs/vdev_disk.c

Lines changed: 67 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -805,14 +805,11 @@ vbio_completion(struct bio *bio)
805805
* to the ADB, with changes if appropriate.
806806
*/
807807
if (vbio->vbio_abd != NULL) {
808-
void *buf = abd_to_buf(vbio->vbio_abd);
808+
if (zio->io_type == ZIO_TYPE_READ)
809+
abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size);
810+
809811
abd_free(vbio->vbio_abd);
810812
vbio->vbio_abd = NULL;
811-
812-
if (zio->io_type == ZIO_TYPE_READ)
813-
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
814-
else
815-
abd_return_buf(zio->io_abd, buf, zio->io_size);
816813
}
817814

818815
/* Final cleanup */
@@ -834,38 +831,61 @@ vbio_completion(struct bio *bio)
834831
* NOTE: if you change this function, change the copy in
835832
* tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
836833
* data there to validate the change you're making.
837-
*
838834
*/
839835
typedef struct {
840-
uint_t bmask;
841-
uint_t npages;
842-
uint_t end;
843-
} vdev_disk_check_pages_t;
836+
size_t blocksize;
837+
int seen_first;
838+
int seen_last;
839+
} vdev_disk_check_alignment_t;
844840

845841
static int
846-
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
842+
vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
843+
void *priv)
847844
{
848845
(void) page;
849-
vdev_disk_check_pages_t *s = priv;
846+
vdev_disk_check_alignment_t *s = priv;
850847

851848
/*
852-
* If we didn't finish on a block size boundary last time, then there
853-
* would be a gap if we tried to use this ABD as-is, so abort.
849+
* The cardinal rule: a single on-disk block must never cross an
850+
* physical (order-0) page boundary, as the kernel expects to be able
851+
* to split at both LBS and page boundaries.
852+
*
853+
* This implies various alignment rules for the blocks in this
854+
* (possibly compound) page, which we can check for.
854855
*/
855-
if (s->end != 0)
856-
return (1);
857856

858857
/*
859-
* Note if we're taking less than a full block, so we can check it
860-
* above on the next call.
858+
* If the previous page did not end on a page boundary, then we
859+
* can't proceed without creating a hole.
861860
*/
862-
s->end = (off+len) & s->bmask;
861+
if (s->seen_last)
862+
return (1);
863863

864-
/* All blocks after the first must start on a block size boundary. */
865-
if (s->npages != 0 && (off & s->bmask) != 0)
864+
/* This page must contain only whole LBS-sized blocks. */
865+
if (!IS_P2ALIGNED(len, s->blocksize))
866866
return (1);
867867

868-
s->npages++;
868+
/*
869+
* If this is not the first page in the ABD, then the data must start
870+
* on a page-aligned boundary (so the kernel can split on page
871+
* boundaries without having to deal with a hole). If it is, then
872+
* it can start on LBS-alignment.
873+
*/
874+
if (s->seen_first) {
875+
if (!IS_P2ALIGNED(off, PAGESIZE))
876+
return (1);
877+
} else {
878+
if (!IS_P2ALIGNED(off, s->blocksize))
879+
return (1);
880+
s->seen_first = 1;
881+
}
882+
883+
/*
884+
* If this data does not end on a page-aligned boundary, then this
885+
* must be the last page in the ABD, for the same reason.
886+
*/
887+
s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);
888+
869889
return (0);
870890
}
871891

@@ -874,15 +894,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
874894
* the number of pages, or 0 if it can't be submitted like this.
875895
*/
876896
static boolean_t
877-
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
897+
vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
878898
{
879-
vdev_disk_check_pages_t s = {
880-
.bmask = bdev_logical_block_size(bdev)-1,
881-
.npages = 0,
882-
.end = 0,
899+
vdev_disk_check_alignment_t s = {
900+
.blocksize = bdev_logical_block_size(bdev),
883901
};
884902

885-
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
903+
if (abd_iterate_page_func(abd, 0, size,
904+
vdev_disk_check_alignment_cb, &s))
886905
return (B_FALSE);
887906

888907
return (B_TRUE);
@@ -916,37 +935,32 @@ vdev_disk_io_rw(zio_t *zio)
916935

917936
/*
918937
* Check alignment of the incoming ABD. If any part of it would require
919-
* submitting a page that is not aligned to the logical block size,
920-
* then we take a copy into a linear buffer and submit that instead.
921-
* This should be impossible on a 512b LBS, and fairly rare on 4K,
922-
* usually requiring abnormally-small data blocks (eg gang blocks)
923-
* mixed into the same ABD as larger ones (eg aggregated).
938+
* submitting a page that is not aligned to both the logical block size
939+
* and the page size, then we take a copy into a new memory region with
940+
* correct alignment. This should be impossible on a 512b LBS. On
941+
* larger blocks, this can happen at least when a small number of
942+
* blocks (usually 1) are allocated from a shared slab, or when
943+
* abnormally-small data regions (eg gang headers) are mixed into the
944+
* same ABD as larger allocations (eg aggregations).
924945
*/
925946
abd_t *abd = zio->io_abd;
926-
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
927-
void *buf;
928-
if (zio->io_type == ZIO_TYPE_READ)
929-
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
930-
else
931-
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
947+
if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
948+
/* Allocate a new memory region with guaranteed alignment */
949+
abd = abd_alloc_for_io(zio->io_size,
950+
zio->io_abd->abd_flags & ABD_FLAG_META);
932951

933-
/*
934-
* Wrap the copy in an abd_t, so we can use the same iterators
935-
* to count and fill the vbio later.
936-
*/
937-
abd = abd_get_from_buf(buf, zio->io_size);
952+
/* If we're writing copy our data into it */
953+
if (zio->io_type == ZIO_TYPE_WRITE)
954+
abd_copy(abd, zio->io_abd, zio->io_size);
938955

939956
/*
940-
* False here would mean the borrowed copy has an invalid
941-
* alignment too, which would mean we've somehow been passed a
942-
* linear ABD with an interior page that has a non-zero offset
943-
* or a size not a multiple of PAGE_SIZE. This is not possible.
944-
* It would mean either zio_buf_alloc() or its underlying
945-
* allocators have done something extremely strange, or our
946-
* math in vdev_disk_check_pages() is wrong. In either case,
957+
* False here would mean the new allocation has an invalid
958+
* alignment too, which would mean that abd_alloc() is not
959+
* guaranteeing this, or our logic in
960+
* vdev_disk_check_alignment() is wrong. In either case,
947961
* something in seriously wrong and its not safe to continue.
948962
*/
949-
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
963+
VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
950964
}
951965

952966
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */

0 commit comments

Comments
 (0)