Skip to content

Commit 7240ecf

Browse files
bwatkinsonMark Maybee
andcommitted
WIP Direct IO
Adding O_DIRECT support to ZFS to bypass the ARC for writes/reads. O_DIRECT support in ZFS will always ensure there is coherency between buffered and O_DIRECT IO requests. This ensures that all IO requests, whether buffered or direct, will see the same file contents at all times. Just as in other filesystems, O_DIRECT does not imply O_SYNC. While data is written directly to VDEV disks, metadata will not be synced until the associated TXG is synced. For both O_DIRECT read and write requests, the offset and request sizes, at a minimum, must be PAGE_SIZE aligned. In the event they are not, then EINVAL is returned. For O_DIRECT writes: The request also must be block aligned (recordsize) or the write request will take the normal (buffered) write path. In the event that the request is block aligned and a cached copy of the buffer is in the ARC, then it will be discarded from the ARC, forcing all further reads to retrieve the data from disk. For O_DIRECT reads: The only alignment restriction is PAGE_SIZE alignment. In the event that the requested data is buffered (in the ARC) it will just be copied from the ARC into the user buffer. To ensure data integrity for all data written using O_DIRECT, all user pages are made stable in the event one of the following is required: checksum, compression, encryption, or parity. By making the user pages stable, we make sure the contents of the user provided buffer can not be changed after any of the above operations have taken place. A new dataset property `direct` has been added with the following 3 allowable values: disabled - Accepts the O_DIRECT flag, but silently ignores it and treats the request as a buffered IO request. default - Follows the alignment restrictions outlined above for write/read IO requests when the O_DIRECT flag is used. always - Treats every write/read IO request as though it passed O_DIRECT and follows the alignment restrictions outlined above.
Signed-off-by: Brian Atkinson <[email protected]> Co-authored-by: Mark Maybee <[email protected]> Co-authored-by: Brian Atkinson <[email protected]>
1 parent eb8e535 commit 7240ecf

35 files changed

+2006
-298
lines changed

config/kernel-get-user-pages.m4

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
dnl #
2+
dnl # get_user_pages_unlocked() function was not available till 4.0.
3+
dnl #
4+
dnl # long get_user_pages_unlocked(struct task_struct *tsk,
5+
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages,
6+
dnl # int write, int force, struct page **pages)
7+
dnl # 4.8 API Change
8+
dnl # long get_user_pages_unlocked(unsigned long start,
9+
dnl # unsigned long nr_pages, int write, int force, struct page **page)
10+
dnl # 4.9 API Change
11+
dnl # long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
12+
dnl # struct page **pages, unsigned int gup_flags)
13+
dnl #
14+
dnl #
15+
dnl # In earlier kernels (< 4.0) get_user_pages() is available
16+
dnl #
17+
18+
dnl#
19+
dnl# Check available get_user_pages/_unlocked interfaces.
20+
dnl#
21+
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [
22+
dnl # 4.9 API: get_user_pages_unlocked(start, nr_pages, pages, gup_flags)
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [
23+
#include <linux/mm.h>
24+
], [
25+
unsigned long start = 0;
26+
unsigned long nr_pages = 1;
27+
unsigned int gup_flags = 0;
28+
struct page **pages = NULL;
29+
long ret __attribute__ ((unused));
30+
ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
31+
])
32+
33+
dnl # 4.8 API: get_user_pages_unlocked(start, nr_pages, write, force, pages)
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [
34+
#include <linux/mm.h>
35+
], [
36+
unsigned long start = 0;
37+
unsigned long nr_pages = 1;
38+
int write = 0;
39+
int force = 0;
40+
long ret __attribute__ ((unused));
41+
struct page **pages = NULL;
42+
ret = get_user_pages_unlocked(start, nr_pages, write, force, pages);
43+
])
44+
45+
46+
dnl # 4.0 API: get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
dnl # force, pages)
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [
47+
#include <linux/mm.h>
48+
], [
49+
struct task_struct *tsk = NULL;
50+
struct mm_struct *mm = NULL;
51+
unsigned long start = 0;
52+
unsigned long nr_pages = 1;
53+
int write = 0;
54+
int force = 0;
55+
struct page **pages = NULL;
56+
long ret __attribute__ ((unused));
57+
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
58+
force, pages);
59+
])
60+
61+
dnl # Kernels older than 4.0: get_user_pages(tsk, mm, start, nr_pages, write,
dnl # force, pages, vmas)
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [
62+
#include <linux/mm.h>
63+
], [
64+
struct task_struct *tsk = NULL;
65+
struct mm_struct *mm = NULL;
66+
struct vm_area_struct **vmas = NULL;
67+
unsigned long start = 0;
68+
unsigned long nr_pages = 1;
69+
int write = 0;
70+
int force = 0;
71+
struct page **pages = NULL;
72+
int ret __attribute__ ((unused));
73+
ret = get_user_pages(tsk, mm, start, nr_pages, write,
74+
force, pages, vmas);
75+
])
76+
])
77+
78+
dnl #
79+
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest.
80+
dnl # We first check for get_user_pages_unlocked as that is available in
81+
dnl # newer kernels.
82+
dnl #
83+
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [
84+
dnl #
85+
dnl # Current API of get_user_pages_unlocked (4.9 API change, gup flags)
86+
dnl #
87+
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags])
88+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [
89+
AC_MSG_RESULT(yes)
90+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1,
91+
[get_user_pages_unlocked() takes gup flags])
92+
], [
93+
AC_MSG_RESULT(no)
94+
95+
dnl #
96+
dnl # 4.8 API change, get_user_pages_unlocked
97+
dnl #
98+
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag])
99+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [
100+
AC_MSG_RESULT(yes)
101+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1,
102+
[get_user_pages_unlocked() takes write flag])
103+
], [
104+
AC_MSG_RESULT(no)
105+
106+
dnl #
107+
dnl # 4.0 API, get_user_pages_unlocked
108+
dnl #
109+
AC_MSG_CHECKING(
110+
[whether get_user_pages_unlocked() takes struct task_struct])
111+
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [
112+
AC_MSG_RESULT(yes)
113+
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1,
114+
[get_user_pages_unlocked() takes struct task_struct])
115+
], [
116+
AC_MSG_RESULT(no)
117+
118+
dnl # Last resort for kernels older than 4.0: get_user_pages
119+
AC_MSG_CHECKING(
120+
[whether get_user_pages() takes struct task_struct])
121+
ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [
122+
AC_MSG_RESULT(yes)
123+
AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1,
124+
[get_user_pages() takes struct task_struct])
125+
], [
126+
dnl #
127+
dnl # If we can not map the user's pages in
128+
dnl # then we can not do Direct IO
129+
dnl #
130+
ZFS_LINUX_TEST_ERROR([Direct IO])
131+
])
132+
])
133+
])
134+
])
135+
])

config/kernel.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
103103
ZFS_AC_KERNEL_SRC_VFS_GETATTR
104104
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
105105
ZFS_AC_KERNEL_SRC_VFS_ITERATE
106+
ZFS_AC_KERNEL_SRC_GET_USER_PAGES
106107
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
107108
ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
108109
ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
@@ -200,6 +201,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
200201
ZFS_AC_KERNEL_VFS_GETATTR
201202
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
202203
ZFS_AC_KERNEL_VFS_ITERATE
204+
ZFS_AC_KERNEL_GET_USER_PAGES
203205
ZFS_AC_KERNEL_VFS_DIRECT_IO
204206
ZFS_AC_KERNEL_VFS_RW_ITERATE
205207
ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS

include/os/freebsd/spl/sys/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ KERNEL_H = \
1717
extdirent.h \
1818
file.h \
1919
freebsd_rwlock.h \
20+
page.h \
2021
inttypes.h \
2122
isa_defs.h \
2223
kmem_cache.h \

include/os/freebsd/spl/sys/mutex.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,5 @@ typedef enum {
6969
#define mutex_exit(lock) sx_xunlock(lock)
7070
#define mutex_owned(lock) sx_xlocked(lock)
7171
#define mutex_owner(lock) sx_xholder(lock)
72+
7273
#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */

include/os/freebsd/spl/sys/page.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Redistribution and use in source and binary forms, with or without
3+
* modification, are permitted provided that the following conditions
4+
* are met:
5+
* 1. Redistributions of source code must retain the above copyright
6+
* notice, this list of conditions and the following disclaimer.
7+
* 2. Redistributions in binary form must reproduce the above copyright
8+
* notice, this list of conditions and the following disclaimer in the
9+
* documentation and/or other materials provided with the distribution.
10+
*
11+
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
12+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
13+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
14+
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
15+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
16+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
17+
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
18+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
19+
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
20+
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
21+
* SUCH DAMAGE.
22+
*
23+
* $FreeBSD$
24+
*/
25+
26+
#ifndef _SPL_PAGE_H_
27+
#define _SPL_PAGE_H_
28+
29+
#include <sys/param.h>
30+
#include <sys/uio.h>
31+
32+
#ifdef __cplusplus
33+
extern "C" {
34+
#endif
35+
36+
/*
 * Page handle type used by the declarations below; on FreeBSD it is a
 * vm_page_t.
 *
 * NOTE(review): `read` is declared as int in zfs_hold_pages() and
 * zfs_get_user_pages() but as boolean_t in zfs_put_user_pages() - confirm
 * the mismatch is intended. The definitions are not visible from this
 * header, so the exact semantics of each function should be checked
 * against the implementation.
 */
typedef vm_page_t zfs_page_p;
37+
38+
long zfs_hold_pages(unsigned long start, unsigned long nr_pages, int read,
39+
zfs_page_p *pages);
40+
long zfs_get_user_pages(unsigned long start, unsigned long nr_pages, int read,
41+
zfs_page_p *pages);
42+
void zfs_put_user_pages(zfs_page_p *pages, unsigned long nr_pages,
43+
boolean_t read);
44+
void zfs_set_page_to_stable(zfs_page_p page);
45+
void zfs_release_stable_page(zfs_page_p page);
46+
int zfs_uio_get_user_pages(uio_t *uio, zfs_page_p *pages, unsigned maxpages,
47+
enum uio_rw rw);
48+
49+
#ifdef __cplusplus
50+
}
51+
#endif
52+
53+
#endif /* _SPL_PAGE_H_ */

include/os/freebsd/spl/sys/uio.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,6 @@
3535
#include <sys/_uio.h>
3636
#include <sys/debug.h>
3737

38-
39-
4038
#define uio_loffset uio_offset
4139

4240
typedef struct uio uio_t;

include/os/freebsd/spl/sys/vm.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,15 @@ void zfs_vmobject_wunlock(vm_object_t object);
5757
#define vm_page_grab_valid_unlocked(m, obj, idx, flags) \
5858
vm_page_grab_valid((m), (obj), (idx), (flags))
5959
#endif
60+
61+
/*
 * vm_page_wire_lock()/vm_page_wire_unlock() expand to nothing when
 * __FreeBSD_version >= 1300047; on older versions they map to
 * vm_page_lock()/vm_page_unlock().
 */
#if __FreeBSD_version >= 1300047
62+
#define vm_page_wire_lock(pp)
63+
#define vm_page_wire_unlock(pp)
64+
#else
65+
#define vm_page_wire_lock(pp) vm_page_lock(pp)
66+
#define vm_page_wire_unlock(pp) vm_page_unlock(pp)
67+
#endif
68+
6069
static inline caddr_t
6170
zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
6271
{
@@ -70,4 +79,16 @@ zfs_unmap_page(struct sf_buf *sf)
7079
sf_buf_free(sf);
7180
}
7281

82+
/*
 * Release page pp: calls vm_page_unwire(pp, PQ_ACTIVE) when
 * __FreeBSD_version >= 1300035 and vm_page_unhold(pp) otherwise. The call
 * is bracketed by vm_page_wire_lock()/vm_page_wire_unlock().
 */
static inline void
83+
page_unhold(vm_page_t pp)
84+
{
85+
vm_page_wire_lock(pp);
86+
#if __FreeBSD_version >= 1300035
87+
vm_page_unwire(pp, PQ_ACTIVE);
88+
#else
89+
vm_page_unhold(pp);
90+
#endif
91+
vm_page_wire_unlock(pp);
92+
}
93+
7394
#endif /* _OPENSOLARIS_SYS_VM_H_ */

include/os/linux/spl/sys/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ KERNEL_H = \
1515
errno.h \
1616
fcntl.h \
1717
file.h \
18+
page.h \
1819
inttypes.h \
1920
isa_defs.h \
2021
kmem_cache.h \

include/os/linux/spl/sys/page.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
3+
* Copyright (C) 2007 The Regents of the University of California.
4+
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
5+
* Written by Brian Behlendorf <[email protected]>.
6+
* UCRL-CODE-235197
7+
*
8+
* This file is part of the SPL, Solaris Porting Layer.
9+
* For details, see <http://zfsonlinux.org/>.
10+
*
11+
* The SPL is free software; you can redistribute it and/or modify it
12+
* under the terms of the GNU General Public License as published by the
13+
* Free Software Foundation; either version 2 of the License, or (at your
14+
* option) any later version.
15+
*
16+
* The SPL is distributed in the hope that it will be useful, but WITHOUT
17+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
18+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
19+
* for more details.
20+
*
21+
* You should have received a copy of the GNU General Public License along
22+
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
23+
*/
24+
25+
#ifndef _SPL_PAGE_H
26+
#define _SPL_PAGE_H
27+
28+
#include <linux/page-flags.h>
29+
#include <linux/pagemap.h>
30+
#include <linux/mm.h>
31+
#include <sys/types.h>
32+
#include <sys/uio.h>
33+
34+
/*
35+
* read returning FOLL_WRITE is due to the fact that we are stating
36+
* that the kernel will have write access to the user pages. So, when
37+
* a Direct IO read request is issued, the kernel must write to the user
38+
* pages.
39+
*
40+
* get_user_pages_unlocked was not available until 4.0, so we also check
41+
* for get_user_pages on older kernels.
42+
*/
43+
/* 4.9 API change - force and write flags are passed as gup flags */
44+
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS)
45+
/*
 * zfs_get_user_pages(addr, numpages, read, pages) expands to whichever
 * get_user_pages variant the HAVE_GET_USER_PAGES_* defines select. A
 * non-zero `read` is forwarded as the write-access request (FOLL_WRITE
 * here, the `write` argument in the older variants). `read` is
 * parenthesized so that an expression argument (e.g. a conditional) is
 * evaluated as a whole before the ?: test.
 */
#define zfs_get_user_pages(addr, numpages, read, pages) \
46+
get_user_pages_unlocked(addr, numpages, pages, (read) ? FOLL_WRITE : 0)
47+
48+
/* 4.8 API change - no longer takes struct task_struct as argument */
49+
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG)
50+
#define zfs_get_user_pages(addr, numpages, read, pages) \
51+
get_user_pages_unlocked(addr, numpages, read, 0, pages)
52+
53+
/* 4.0 API */
54+
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT)
55+
#define zfs_get_user_pages(addr, numpages, read, pages) \
56+
get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \
57+
pages)
58+
59+
/*
 * Using get_user_pages if kernel is < 4.0.
 * NOTE(review): get_user_pages() is documented as requiring mmap_sem to be
 * held by the caller; this macro does not take it - confirm callers do.
 */
60+
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT)
61+
#define zfs_get_user_pages(addr, numpages, read, pages) \
62+
get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \
63+
NULL)
64+
#else
65+
/*
66+
* This case is unreachable. We must be able to use either
67+
* get_user_pages_unlocked() or get_user_pages() to map user pages into
68+
* the kernel.
69+
*/
70+
#error "Unknown Direct IO interface"
71+
#endif
72+
73+
/*
 * Page handle type used by the declarations below; on Linux it is a
 * struct page pointer.
 *
 * NOTE(review): the definitions of these functions are not visible from
 * this header; presumably zfs_put_user_pages() releases pages obtained
 * through zfs_get_user_pages() - confirm against the implementation.
 */
typedef struct page *zfs_page_p;
74+
75+
void zfs_put_user_pages(zfs_page_p *pages, unsigned long nr_pages,
76+
boolean_t read);
77+
void zfs_set_page_to_stable(zfs_page_p page);
78+
void zfs_release_stable_page(zfs_page_p page);
79+
int zfs_uio_get_user_pages(uio_t *uio, zfs_page_p *pages, unsigned maxpages,
80+
enum uio_rw rw);
81+
82+
#endif /* _SPL_PAGE_H */

0 commit comments

Comments
 (0)