Skip to content

Commit 04e3a35

Browse files
bwatkinson and Mark Maybee committed
WIP Direct IO ZoL
This is the current state of adding Direct IO support to ZFS on Linux, rebased on ZoL master. The work still remaining is: 1. Handle issues related to Direct IO requests for dbufs with multiple holds. 2. Create ZTS tests. 3. Further debugging. At the moment, tests have been run using FIO and XDD to resolve all failed VERIFY and ASSERT statements. Signed-off-by: Brian <[email protected]> Co-authored-by: Mark Maybee <[email protected]> Co-authored-by: Brian Atkinson <[email protected]>
1 parent 093902e commit 04e3a35

File tree

23 files changed

+1966
-206
lines changed

23 files changed

+1966
-206
lines changed

config/kernel-get-user-pages.m4

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
dnl #
2+
dnl # get_user_pages_unlocked() function was not available till 4.0.
3+
dnl #
4+
dnl # long get_user_pages_unlocked(struct task_struct *tsk,
5+
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages,
6+
dnl # int write, int force, struct page **pages)
7+
dnl # 4.8 API Change
8+
dnl # long get_user_pages_unlocked(unsigned long start,
9+
dnl # unsigned long nr_pages, int write, int force, struct page **page)
10+
dnl # 4.9 API Change
11+
dnl # long get_user_pages_unlocked(unsigned long start, int nr_pages,
12+
dnl # struct page **pages, unsigned int gup_flags)
13+
dnl #
14+
dnl #
15+
dnl # In earlier kernels (< 4.0) get_user_pages() is available
16+
dnl #
17+
18+
dnl#
19+
dnl# Check available get_user_pages/_unlocked interfaces.
20+
dnl#
21+
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [
22+
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [
23+
#include <linux/mm.h>
24+
], [
25+
unsigned long start = 0;
26+
unsigned long nr_pages = 1;
27+
unsigned int gup_flags = 0;
28+
struct page **pages = NULL;
29+
long ret __attribute__ ((unused));
30+
ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
31+
])
32+
33+
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [
34+
#include <linux/mm.h>
35+
], [
36+
unsigned long start = 0;
37+
unsigned long nr_pages = 1;
38+
int write = 0;
39+
int force = 0;
40+
long ret __attribute__ ((unused));
41+
struct page **pages = NULL;
42+
ret = get_user_pages_unlocked(start, nr_pages, write, force, pages);
43+
])
44+
45+
46+
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [
47+
#include <linux/mm.h>
48+
], [
49+
struct task_struct *tsk = NULL;
50+
struct mm_struct *mm = NULL;
51+
unsigned long start = 0;
52+
unsigned long nr_pages = 1;
53+
int write = 0;
54+
int force = 0;
55+
struct page **pages = NULL;
56+
long ret __attribute__ ((unused));
57+
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
58+
force, pages);
59+
])
60+
61+
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [
62+
#include <linux/mm.h>
63+
], [
64+
struct task_struct *tsk = NULL;
65+
struct mm_struct *mm = NULL;
66+
struct vm_area_struct **vmas = NULL;
67+
unsigned long start = 0;
68+
unsigned long nr_pages = 1;
69+
int write = 0;
70+
int force = 0;
71+
struct page **pages = NULL;
72+
int ret __attribute__ ((unused));
73+
ret = get_user_pages(tsk, mm, start, nr_pages, write,
74+
force, pages, vmas);
75+
])
76+
])
77+
78+
dnl #
79+
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest.
80+
dnl # We first check for get_user_pages_unlocked as that is available in
81+
dnl # newer kernels.
82+
dnl #
83+
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [
84+
dnl #
85+
dnl # Current API of get_user_pages_unlocked
86+
dnl #
87+
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags])
88+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [
89+
AC_MSG_RESULT(yes)
90+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1,
91+
[get_user_pages_unlocked() takes gup flags])
92+
], [
93+
AC_MSG_RESULT(no)
94+
95+
dnl #
96+
dnl # 4.8 API change, get_user_pages_unlocked
97+
dnl #
98+
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag])
99+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [
100+
AC_MSG_RESULT(yes)
101+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1,
102+
[get_user_pages_unlocked() takes write flag])
103+
], [
104+
AC_MSG_RESULT(no)
105+
106+
dnl #
107+
dnl # 4.0 API, get_user_pages_unlocked
108+
dnl #
109+
AC_MSG_CHECKING(
110+
[whether get_user_pages_unlocked() takes struct task_struct])
111+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [
112+
AC_MSG_RESULT(yes)
113+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1,
114+
[get_user_pages_unlocked() takes struct task_struct])
115+
], [
116+
AC_MSG_RESULT(no)
117+
118+
dnl # get_user_pages
119+
AC_MSG_CHECKING(
120+
[whether get_user_pages() takes struct task_struct])
121+
ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [
122+
AC_MSG_RESULT(yes)
123+
AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1,
124+
[get_user_pages() takes struct task_struct])
125+
], [
126+
dnl #
127+
dnl # If we cannot map the user's pages in,
128+
dnl # then we cannot do Direct IO
129+
dnl #
130+
ZFS_LINUX_TEST_ERROR([Direct IO])
131+
])
132+
])
133+
])
134+
])
135+
])

config/kernel.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
9898
ZFS_AC_KERNEL_SRC_VFS_GETATTR
9999
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
100100
ZFS_AC_KERNEL_SRC_VFS_ITERATE
101+
ZFS_AC_KERNEL_SRC_GET_USER_PAGES
101102
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
102103
ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
103104
ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
@@ -192,6 +193,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
192193
ZFS_AC_KERNEL_VFS_GETATTR
193194
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
194195
ZFS_AC_KERNEL_VFS_ITERATE
196+
ZFS_AC_KERNEL_GET_USER_PAGES
195197
ZFS_AC_KERNEL_VFS_DIRECT_IO
196198
ZFS_AC_KERNEL_VFS_RW_ITERATE
197199
ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS

include/os/linux/kernel/linux/kmap_compat.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,43 @@
4040
#define zfs_access_ok(type, addr, size) access_ok(addr, size)
4141
#endif
4242

43+
/*
44+
* Passing FOLL_WRITE for a read is due to the fact that we are stating
45+
* that the kernel will have write access to the user pages. So, when
46+
* a Direct IO read request is issued, the kernel must write to the user
47+
* pages.
48+
*
49+
* get_user_pages_unlocked() was not available prior to 4.0, so we also check
50+
* for get_user_pages on older kernels.
51+
*/
52+
/* 4.9 API change - the read flag is passed via the gup flags */
53+
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS)
54+
#define zfs_get_user_pages(addr, numpages, read, pages) \
55+
get_user_pages_unlocked(addr, numpages, pages, read ? FOLL_WRITE : 0)
56+
57+
/* 4.8 API change - no longer takes struct task_struct as argument */
58+
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG)
59+
#define zfs_get_user_pages(addr, numpages, read, pages) \
60+
get_user_pages_unlocked(addr, numpages, read, 0, pages)
61+
62+
/* 4.0 API */
63+
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT)
64+
#define zfs_get_user_pages(addr, numpages, read, pages) \
65+
get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \
66+
pages)
67+
68+
/* Using get_user_pages if kernel is < 4.0 */
69+
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT)
70+
#define zfs_get_user_pages(addr, numpages, read, pages) \
71+
get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \
72+
NULL)
73+
#else
74+
/*
75+
* This case is unreachable. We must be able to use either
76+
* get_user_pages_unlocked() or get_user_pages() to map user pages into
77+
* the kernel.
78+
*/
79+
#error "Unknown Direct IO interface"
80+
#endif
81+
4382
#endif /* _ZFS_KMAP_H */

include/os/linux/spl/sys/mutex.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,15 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \
151151

152152
#define mutex_enter(mp) mutex_enter_nested((mp), 0)
153153

154+
#define mutex_transfer_ownership(mp) \
155+
{ \
156+
if (mutex_owner((mp)) != current) { \
157+
ASSERT3P(mutex_owner((mp)), !=, NULL); \
158+
spl_mutex_set_owner((mp)); \
159+
} \
160+
}
161+
162+
154163
/*
155164
* The reason for the spinlock:
156165
*

include/os/linux/spl/sys/uio.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@
3535
#include <asm/uaccess.h>
3636
#include <sys/types.h>
3737

38+
/*
39+
* uio_extflg: extended flags
40+
*/
41+
#define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */
42+
#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
43+
#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
44+
#define UIO_XUIO 0x0004 /* struct is xuio_t */
45+
#define UIO_DIRECT 0x0008 /* request direct I/O */
46+
3847
typedef struct iovec iovec_t;
3948

4049
typedef enum uio_rw {

include/sys/abd.h

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,22 @@
3535
extern "C" {
3636
#endif
3737

38+
#ifndef _KERNEL
39+
struct page; /* forward declaration to be used in abd.c */
40+
#endif
41+
3842
typedef enum abd_flags {
39-
ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
40-
ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
41-
ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */
42-
ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */
43-
ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */
44-
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
43+
ABD_FLAG_LINEAR = 1 << 0, /* is ABD linear/scattered? */
44+
ABD_FLAG_OWNER = 1 << 1, /* own its data buffers? */
45+
ABD_FLAG_META = 1 << 2, /* represents FS metadata? */
46+
ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */
47+
ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */
48+
ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */
49+
ABD_FLAG_FROM_PAGES = 1 << 6, /* does not own the pages */
50+
ABD_FLAG_MULTI_LIST = 1 << 7, /* multiple ABDs chained together */
51+
ABD_FLAG_LINKED = 1 << 8, /* ABD is on a chained list */
52+
ABD_FLAG_GAP = 1 << 9, /* ABD is for read gap */
53+
ABD_FLAG_ZEROS = 1 << 10 /* ABD a zero-filled buffer */
4554
} abd_flags_t;
4655

4756
typedef struct abd {
@@ -64,6 +73,9 @@ typedef struct abd {
6473
void *abd_buf;
6574
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
6675
} abd_linear;
76+
struct abd_multi {
77+
list_t abd_chain;
78+
} abd_multi;
6779
} abd_u;
6880
} abd_t;
6981

@@ -75,14 +87,19 @@ extern int zfs_abd_scatter_enabled;
7587
static inline boolean_t
7688
abd_is_linear(abd_t *abd)
7789
{
78-
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
90+
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0);
7991
}
8092

8193
static inline boolean_t
8294
abd_is_linear_page(abd_t *abd)
8395
{
84-
return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
85-
B_TRUE : B_FALSE);
96+
return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0);
97+
}
98+
99+
static inline boolean_t
100+
abd_is_zero_buf(abd_t *abd)
101+
{
102+
return ((abd->abd_flags & ABD_FLAG_ZEROS) != 0);
86103
}
87104

88105
/*
@@ -91,12 +108,18 @@ abd_is_linear_page(abd_t *abd)
91108

92109
abd_t *abd_alloc(size_t, boolean_t);
93110
abd_t *abd_alloc_linear(size_t, boolean_t);
111+
abd_t *abd_alloc_multi(void);
94112
abd_t *abd_alloc_for_io(size_t, boolean_t);
95113
abd_t *abd_alloc_sametype(abd_t *, size_t);
114+
void abd_add_child(abd_t *, abd_t *, boolean_t);
96115
void abd_free(abd_t *);
97116
abd_t *abd_get_offset(abd_t *, size_t);
98117
abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
118+
abd_t *abd_get_zeros(size_t);
99119
abd_t *abd_get_from_buf(void *, size_t);
120+
#ifdef _KERNEL
121+
abd_t *abd_get_from_pages(struct page **, uint_t);
122+
#endif
100123
void abd_put(abd_t *);
101124

102125
/*
@@ -126,8 +149,7 @@ int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
126149
void abd_zero_off(abd_t *, size_t, size_t);
127150

128151
#if defined(_KERNEL)
129-
unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
130-
size_t);
152+
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
131153
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
132154
#endif
133155

include/sys/dbuf.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,12 @@ typedef struct dmu_buf_impl {
298298
uint8_t db_pending_evict;
299299

300300
uint8_t db_dirtycnt;
301+
302+
/*
303+
* Used to signal that the dbuf intends to transfer
304+
* ownership of its db_mtx to another thread.
305+
*/
306+
uint8_t db_transferring_ownership;
301307
} dmu_buf_impl_t;
302308

303309
/* Note: the dbuf hash table is exposed only for the mdb module */

include/sys/dmu.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -565,9 +565,7 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
565565
void *tag, dmu_buf_t **, int flags);
566566
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
567567
void *tag, dmu_buf_t **dbp, int flags);
568-
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
569-
uint64_t length, boolean_t read, void *tag, int *numbufsp,
570-
dmu_buf_t ***dbpp, uint32_t flags);
568+
571569
/*
572570
* Add a reference to a dmu buffer that has already been held via
573571
* dmu_buf_hold() in the current context.
@@ -826,7 +824,8 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
826824
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
827825
uint64_t size);
828826
int dmu_free_long_object(objset_t *os, uint64_t object);
829-
827+
int dmu_check_directio_valid(dnode_t *dn, uint64_t offset, uint64_t size,
828+
boolean_t read);
830829
/*
831830
* Convenience functions.
832831
*
@@ -836,12 +835,15 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
836835
#define DMU_READ_PREFETCH 0 /* prefetch */
837836
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
838837
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
838+
#define DMU_DIRECTIO 4 /* use direct IO */
839839
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
840840
void *buf, uint32_t flags);
841841
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
842842
uint32_t flags);
843843
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
844844
const void *buf, dmu_tx_t *tx);
845+
void dmu_write_direct_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
846+
const void *buf, dmu_tx_t *tx);
845847
void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
846848
const void *buf, dmu_tx_t *tx);
847849
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,

0 commit comments

Comments
 (0)