Skip to content

Commit 21eddef

Browse files
bwatkinsonMark Maybee
authored andcommitted
WIP Direct IO ZoL
This is the current state of adding Direct IO support to ZFS on Linux, rebased on ZoL master. The work still remaining is: 1. Handle issues related to Direct IO requests for dbufs with multiple holds. 2. Create ZTS tests. 3. Further debugging. At the moment, tests have been run using FIO and XDD to resolve all failed VERIFY and ASSERT statements. Signed-off-by: Brian <[email protected]> Co-authored-by: Mark Maybee <[email protected]> Co-authored-by: Brian Atkinson <[email protected]>
1 parent db0ad39 commit 21eddef

File tree

23 files changed

+2529
-809
lines changed

23 files changed

+2529
-809
lines changed

config/kernel-get-user-pages.m4

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
dnl #
2+
dnl # get_user_pages_unlocked() function was not available till 4.0.
3+
dnl #
4+
dnl # long get_user_pages_unlocked(struct task_struct *tsk,
5+
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages,
6+
dnl # int write, int force, struct page **pages)
7+
dnl # 4.8 API Change
8+
dnl # long get_user_pages_unlocked(unsigned long start,
9+
dnl # unsigned long nr_pages, int write, int force, struct page **page)
10+
dnl # 4.9 API Change
11+
dnl # long get_user_pages_unlocked(unsigned long start, int nr_pages,
12+
dnl # struct page **pages, unsigned int gup_flags)
13+
dnl #
14+
dnl #
15+
dnl # In earlier kernels (< 4.0) get_user_pages() is available
16+
dnl #
17+
dnl # int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
18+
dnl # unsigned long start, int nr_pages, int write, int force,
19+
dnl # struct page **pages, struct vm_area_struct **vmas)
20+
dnl #
21+
dnl # 4.6 API Change
22+
dnl # long get_user_pages(unsigned long start, unsigned long nr_pages,
23+
dnl # unsigned int gup_flags, struct page **pages,
24+
dnl # struct vm_area_struct **vmas)
25+
dnl #
26+
27+
dnl#
28+
dnl# Check available get_user_pages/_unlocked interfaces.
29+
dnl#
30+
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [
31+
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [
32+
#include <linux/mm.h>
33+
], [
34+
unsigned long start = 0;
35+
unsigned long nr_pages = 1;
36+
unsigned int gup_flags = 0;
37+
struct page **pages = NULL;
38+
long ret __attribute__ ((unused));
39+
ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
40+
])
41+
42+
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [
43+
#include <linux/mm.h>
44+
], [
45+
unsigned long start = 0;
46+
unsigned long nr_pages = 1;
47+
int write = 0;
48+
int force = 0;
49+
long ret __attribute__ ((unused));
50+
struct page **pages = NULL;
51+
ret = get_user_pages_unlocked(start, nr_pages, write, force, pages);
52+
])
53+
54+
55+
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [
56+
#include <linux/mm.h>
57+
], [
58+
struct task_struct *tsk = NULL;
59+
struct mm_struct *mm = NULL;
60+
unsigned long start = 0;
61+
unsigned long nr_pages = 1;
62+
int write = 0;
63+
int force = 0;
64+
struct page **pages = NULL;
65+
long ret __attribute__ ((unused));
66+
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
67+
force, pages);
68+
])
69+
70+
dnl # Probe the gup_flags form of get_user_pages(), which takes no
dnl # task_struct/mm_struct arguments.
ZFS_LINUX_TEST_SRC([get_user_pages_gup_flags], [
71+
#include <linux/mm.h>
72+
], [
73+
struct vm_area_struct **vmas = NULL;
74+
unsigned long start = 0;
75+
unsigned long nr_pages = 1;
76+
unsigned int gup_flags = 0;
77+
struct page **pages = NULL;
78+
long ret __attribute__ ((unused));
79+
ret = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
80+
])
81+
82+
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [
83+
#include <linux/mm.h>
84+
], [
85+
struct task_struct *tsk = NULL;
86+
struct mm_struct *mm = NULL;
87+
struct vm_area_struct **vmas = NULL;
88+
unsigned long start = 0;
89+
unsigned long nr_pages = 1;
90+
int write = 0;
91+
int force = 0;
92+
struct page **pages = NULL;
93+
int ret __attribute__ ((unused));
94+
ret = get_user_pages(tsk, mm, start, nr_pages, write,
95+
force, pages, vmas);
96+
])
97+
])
98+
99+
dnl #
100+
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest.
101+
dnl # We first check for get_user_pages_unlocked as that is available in
102+
dnl # newer kernels.
103+
dnl #
104+
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [
105+
dnl #
106+
dnl # Current API of get_user_pages_unlocked
107+
dnl #
108+
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags])
109+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [
110+
AC_MSG_RESULT(yes)
111+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1,
112+
[get_user_pages_unlocked() takes gup flags])
113+
], [
114+
AC_MSG_RESULT(no)
115+
116+
dnl #
117+
dnl # 4.8 API change, get_user_pages_unlocked
118+
dnl #
119+
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag])
120+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [
121+
AC_MSG_RESULT(yes)
122+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1,
123+
[get_user_pages_unlocked() takes write flag])
124+
], [
125+
AC_MSG_RESULT(no)
126+
127+
dnl #
128+
dnl # 4.0 API, get_user_pages_unlocked
129+
dnl #
130+
AC_MSG_CHECKING(
131+
[whether get_user_pages_unlocked() takes struct task_struct])
132+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [
133+
AC_MSG_RESULT(yes)
134+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1,
135+
[get_user_pages_unlocked() takes struct task_struct])
136+
], [
137+
AC_MSG_RESULT(no)
138+
139+
dnl #
140+
dnl # 4.6 API change, get_user_pages
141+
dnl #
142+
AC_MSG_CHECKING([whether get_user_pages() takes gup flags])
143+
ZFS_LINUX_TEST_RESULT([get_user_pages_gup_flags], [
144+
AC_MSG_RESULT(yes)
145+
AC_DEFINE(HAVE_GET_USER_PAGES_GUP_FLAGS, 1,
146+
[get_user_pages() takes gup flags])
147+
], [
148+
AC_MSG_RESULT(no)
149+
150+
dnl #
151+
dnl # 2.6.31 API, get_user_pages
152+
dnl #
153+
AC_MSG_CHECKING(
154+
[whether get_user_pages() takes struct task_struct])
155+
ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [
156+
AC_MSG_RESULT(yes)
157+
AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1,
158+
[get_user_pages() takes struct task_struct])
159+
], [
160+
dnl #
161+
dnl # If we can not map the users pages in
162+
dnl # then we can not do Direct IO
163+
dnl #
164+
AC_MSG_ERROR(
165+
[no; Direct IO not supported for this kernel])
166+
])
167+
])
168+
])
169+
])
170+
])
171+
])

config/kernel.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
9898
ZFS_AC_KERNEL_SRC_VFS_GETATTR
9999
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
100100
ZFS_AC_KERNEL_SRC_VFS_ITERATE
101+
ZFS_AC_KERNEL_SRC_GET_USER_PAGES
101102
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
102103
ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
103104
ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
@@ -192,6 +193,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
192193
ZFS_AC_KERNEL_VFS_GETATTR
193194
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
194195
ZFS_AC_KERNEL_VFS_ITERATE
196+
ZFS_AC_KERNEL_GET_USER_PAGES
195197
ZFS_AC_KERNEL_VFS_DIRECT_IO
196198
ZFS_AC_KERNEL_VFS_RW_ITERATE
197199
ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS

include/os/linux/kernel/linux/kmap_compat.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,40 @@
4040
#define zfs_access_ok(type, addr, size) access_ok(addr, size)
4141
#endif
4242

43+
/*
44+
* A read request maps to FOLL_WRITE because the flag states that the
45+
* kernel will have write access to the user pages: when a Direct IO
46+
* read request is issued, the kernel must write to the user
47+
* pages.
48+
*
49+
* get_user_pages_unlocked was not available until 4.0, so we also check
50+
* for get_user_pages on older kernels.
* The HAVE_GET_USER_PAGES_* names tested below must match the AC_DEFINE()
* names emitted by config/kernel-get-user-pages.m4.
51+
*/
52+
/* 4.9 API change - force and write flags are passed as gup flags */
53+
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS)
54+
#define zfs_get_user_pages(addr, numpages, read, pages) \
55+
get_user_pages_unlocked(addr, numpages, pages, read ? FOLL_WRITE : 0)
56+
/* 4.8 API change - no longer takes struct task_struct as argument */
57+
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG)
58+
#define zfs_get_user_pages(addr, numpages, read, pages) \
59+
get_user_pages_unlocked(addr, numpages, read, 0, pages)
60+
/* 4.0 API */
61+
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT)
62+
#define zfs_get_user_pages(addr, numpages, read, pages) \
63+
get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \
64+
pages)
65+
/* 4.6 API change - no longer requires struct's task_struct or mm_struct */
66+
#elif defined(HAVE_GET_USER_PAGES_GUP_FLAGS)
67+
#define zfs_get_user_pages(addr, numpages, read, pages) \
68+
get_user_pages(addr, numpages, read ? FOLL_WRITE : 0, pages, NULL)
69+
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT)
70+
/* 2.6.31 API */
71+
#define zfs_get_user_pages(addr, numpages, read, pages) \
72+
get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \
73+
NULL)
74+
#else
75+
#define zfs_get_user_pages(addr, numpages, read, pages) \
76+
SET_ERROR(ENOTSUP)
77+
#endif
78+
4379
#endif /* _ZFS_KMAP_H */

include/os/linux/spl/sys/mutex.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ spl_mutex_clear_owner(kmutex_t *mp)
6565
#define MUTEX_HELD(mp) mutex_owned(mp)
6666
#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp))
6767

68+
6869
#ifdef CONFIG_LOCKDEP
6970
static inline void
7071
spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type)
@@ -151,6 +152,15 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \
151152

152153
#define mutex_enter(mp) mutex_enter_nested((mp), 0)
153154

155+
#define mutex_transfer_ownership(mp) \
156+
{ \
157+
if (mutex_owner((mp)) != current) { \
158+
ASSERT3P(mutex_owner((mp)), !=, NULL); \
159+
spl_mutex_set_owner((mp)); \
160+
} \
161+
}
162+
163+
154164
/*
155165
* The reason for the spinlock:
156166
*

include/os/linux/spl/sys/uio.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@
3535
#include <asm/uaccess.h>
3636
#include <sys/types.h>
3737

38+
/*
39+
* uio_extflg: extended flags
40+
*/
41+
#define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */
42+
#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
43+
#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
44+
#define UIO_XUIO 0x0004 /* struct is xuio_t */
45+
#define UIO_DIRECT 0x0008 /* request direct I/O */
46+
3847
typedef struct iovec iovec_t;
3948

4049
typedef enum uio_rw {

include/sys/abd.h

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,22 @@
3535
extern "C" {
3636
#endif
3737

38+
#ifndef _KERNEL
39+
struct page; /* forward declaration to be used in abd.c */
40+
#endif
41+
3842
typedef enum abd_flags {
39-
ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
40-
ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
41-
ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */
43+
ABD_FLAG_LINEAR = 1 << 0, /* is ABD linear/scattered? */
44+
ABD_FLAG_OWNER = 1 << 1, /* own its data buffers? */
45+
ABD_FLAG_META = 1 << 2, /* represents FS metadata? */
4246
ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */
4347
ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */
4448
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
49+
ABD_FLAG_FROM_PAGES = 1 << 6, /* does not own the pages */
50+
ABD_FLAG_MULTI_LIST = 1 << 7, /* mult ABDs chained together */
51+
ABD_FLAG_LINKED = 1 << 8, /* ABD is on a chained list */
52+
ABD_FLAG_GAP = 1 << 9, /* ABD is for read gap */
53+
ABD_FLAG_ZEROS = 1 << 10 /* ABD a zero-filled buffer */
4554
} abd_flags_t;
4655

4756
typedef struct abd {
@@ -64,6 +73,9 @@ typedef struct abd {
6473
void *abd_buf;
6574
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
6675
} abd_linear;
76+
struct abd_multi {
77+
list_t abd_chain;
78+
} abd_multi;
6779
} abd_u;
6880
} abd_t;
6981

@@ -75,14 +87,25 @@ extern int zfs_abd_scatter_enabled;
7587
static inline boolean_t
7688
abd_is_linear(abd_t *abd)
7789
{
78-
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
90+
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0);
7991
}
8092

8193
static inline boolean_t
8294
abd_is_linear_page(abd_t *abd)
8395
{
84-
return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
85-
B_TRUE : B_FALSE);
96+
return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0);
97+
}
98+
99+
static inline boolean_t
100+
abd_is_zero_buf(abd_t *abd)
101+
{
102+
return ((abd->abd_flags & ABD_FLAG_ZEROS) != 0);
103+
}
104+
105+
static inline boolean_t
106+
abd_is_multi(abd_t *abd)
107+
{
108+
return ((abd->abd_flags & ABD_FLAG_MULTI_LIST) != 0);
86109
}
87110

88111
/*
@@ -91,12 +114,18 @@ abd_is_linear_page(abd_t *abd)
91114

92115
abd_t *abd_alloc(size_t, boolean_t);
93116
abd_t *abd_alloc_linear(size_t, boolean_t);
117+
abd_t *abd_alloc_multi(void);
94118
abd_t *abd_alloc_for_io(size_t, boolean_t);
95119
abd_t *abd_alloc_sametype(abd_t *, size_t);
120+
void abd_add_child(abd_t *, abd_t *, boolean_t);
96121
void abd_free(abd_t *);
97122
abd_t *abd_get_offset(abd_t *, size_t);
98123
abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
124+
abd_t *abd_get_zeros(size_t);
99125
abd_t *abd_get_from_buf(void *, size_t);
126+
#ifdef _KERNEL
127+
abd_t *abd_get_from_pages(struct page **, uint_t);
128+
#endif
100129
void abd_put(abd_t *);
101130

102131
/*
@@ -126,8 +155,7 @@ int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
126155
void abd_zero_off(abd_t *, size_t, size_t);
127156

128157
#if defined(_KERNEL)
129-
unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
130-
size_t);
158+
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
131159
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
132160
#endif
133161

include/sys/dbuf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
342342
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
343343
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
344344
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
345+
blkptr_t *dmu_buf_get_bp(dmu_buf_impl_t *db);
345346
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
346347
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
347348
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);

0 commit comments

Comments
 (0)