Merge 72f674a22b
into 1713aa7b4d
This commit is contained in:
commit
419db3d2b5
|
@ -522,7 +522,7 @@ get_usage(zpool_help_t idx)
|
|||
return (gettext("\tstatus [--power] [-j [--json-int, "
|
||||
"--json-flat-vdevs, ...\n"
|
||||
"\t --json-pool-key-guid]] [-c [script1,script2,...]] "
|
||||
"[-DegiLpPstvx] ...\n"
|
||||
"[-dDegiLpPstvx] ...\n"
|
||||
"\t [-T d|u] [pool] [interval [count]]\n"));
|
||||
case HELP_UPGRADE:
|
||||
return (gettext("\tupgrade\n"
|
||||
|
@ -2602,6 +2602,7 @@ typedef struct status_cbdata {
|
|||
boolean_t cb_print_unhealthy;
|
||||
boolean_t cb_print_status;
|
||||
boolean_t cb_print_slow_ios;
|
||||
boolean_t cb_print_dio_verify;
|
||||
boolean_t cb_print_vdev_init;
|
||||
boolean_t cb_print_vdev_trim;
|
||||
vdev_cmd_data_list_t *vcdl;
|
||||
|
@ -2879,7 +2880,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
uint_t c, i, vsc, children;
|
||||
pool_scan_stat_t *ps = NULL;
|
||||
vdev_stat_t *vs;
|
||||
char rbuf[6], wbuf[6], cbuf[6];
|
||||
char rbuf[6], wbuf[6], cbuf[6], dbuf[6];
|
||||
char *vname;
|
||||
uint64_t notpresent;
|
||||
spare_cbdata_t spare_cb;
|
||||
|
@ -2997,6 +2998,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
printf(" %5s", "-");
|
||||
}
|
||||
}
|
||||
if (VDEV_STAT_VALID(vs_dio_verify_errors, vsc) &&
|
||||
cb->cb_print_dio_verify) {
|
||||
zfs_nicenum(vs->vs_dio_verify_errors, dbuf,
|
||||
sizeof (dbuf));
|
||||
|
||||
if (cb->cb_literal)
|
||||
printf(" %5llu",
|
||||
(u_longlong_t)vs->vs_dio_verify_errors);
|
||||
else
|
||||
printf(" %5s", dbuf);
|
||||
}
|
||||
}
|
||||
|
||||
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
|
||||
|
@ -10873,6 +10885,10 @@ status_callback(zpool_handle_t *zhp, void *data)
|
|||
printf_color(ANSI_BOLD, " %5s", gettext("POWER"));
|
||||
}
|
||||
|
||||
if (cbp->cb_print_dio_verify) {
|
||||
printf_color(ANSI_BOLD, " %5s", gettext("DIO"));
|
||||
}
|
||||
|
||||
if (cbp->vcdl != NULL)
|
||||
print_cmd_columns(cbp->vcdl, 0);
|
||||
|
||||
|
@ -10921,10 +10937,11 @@ status_callback(zpool_handle_t *zhp, void *data)
|
|||
}
|
||||
|
||||
/*
|
||||
* zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ...
|
||||
* [pool] [interval [count]]
|
||||
* zpool status [-c [script1,script2,...]] [-dDegiLpPstvx] [--power] ...
|
||||
* [-T d|u] [pool] [interval [count]]
|
||||
*
|
||||
* -c CMD For each vdev, run command CMD
|
||||
* -d Display Direct I/O write verify errors
|
||||
* -D Display dedup status (undocumented)
|
||||
* -e Display only unhealthy vdevs
|
||||
* -g Display guid for individual vdev name.
|
||||
|
@ -10967,7 +10984,7 @@ zpool_do_status(int argc, char **argv)
|
|||
};
|
||||
|
||||
/* check options */
|
||||
while ((c = getopt_long(argc, argv, "c:jDegiLpPstT:vx", long_options,
|
||||
while ((c = getopt_long(argc, argv, "c:jdDegiLpPstT:vx", long_options,
|
||||
NULL)) != -1) {
|
||||
switch (c) {
|
||||
case 'c':
|
||||
|
@ -10994,6 +11011,9 @@ zpool_do_status(int argc, char **argv)
|
|||
}
|
||||
cmd = optarg;
|
||||
break;
|
||||
case 'd':
|
||||
cb.cb_print_dio_verify = B_TRUE;
|
||||
break;
|
||||
case 'D':
|
||||
if (++cb.cb_dedup_stats > 2)
|
||||
cb.cb_dedup_stats = 2;
|
||||
|
|
46
cmd/ztest.c
46
cmd/ztest.c
|
@ -2262,6 +2262,13 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
|
|||
if (ztest_random(4) != 0) {
|
||||
int prefetch = ztest_random(2) ?
|
||||
DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
|
||||
|
||||
/*
|
||||
* We will randomly set when to do O_DIRECT on a read.
|
||||
*/
|
||||
if (ztest_random(4) == 0)
|
||||
prefetch |= DMU_DIRECTIO;
|
||||
|
||||
ztest_block_tag_t rbt;
|
||||
|
||||
VERIFY(dmu_read(os, lr->lr_foid, offset,
|
||||
|
@ -2813,6 +2820,13 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
|
|||
enum ztest_io_type io_type;
|
||||
uint64_t blocksize;
|
||||
void *data;
|
||||
uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH;
|
||||
|
||||
/*
|
||||
* We will randomly set when to do O_DIRECT on a read.
|
||||
*/
|
||||
if (ztest_random(4) == 0)
|
||||
dmu_read_flags |= DMU_DIRECTIO;
|
||||
|
||||
VERIFY0(dmu_object_info(zd->zd_os, object, &doi));
|
||||
blocksize = doi.doi_data_block_size;
|
||||
|
@ -2878,7 +2892,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
|
|||
(void) pthread_rwlock_unlock(&ztest_name_lock);
|
||||
|
||||
VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
|
||||
DMU_READ_NO_PREFETCH));
|
||||
dmu_read_flags));
|
||||
|
||||
(void) ztest_write(zd, object, offset, blocksize, data);
|
||||
break;
|
||||
|
@ -5045,6 +5059,13 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
|
|||
uint64_t stride = 123456789ULL;
|
||||
uint64_t width = 40;
|
||||
int free_percent = 5;
|
||||
uint32_t dmu_read_flags = DMU_READ_PREFETCH;
|
||||
|
||||
/*
|
||||
* We will randomly set when to do O_DIRECT on a read.
|
||||
*/
|
||||
if (ztest_random(4) == 0)
|
||||
dmu_read_flags |= DMU_DIRECTIO;
|
||||
|
||||
/*
|
||||
* This test uses two objects, packobj and bigobj, that are always
|
||||
|
@ -5123,10 +5144,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
|
|||
* Read the current contents of our objects.
|
||||
*/
|
||||
error = dmu_read(os, packobj, packoff, packsize, packbuf,
|
||||
DMU_READ_PREFETCH);
|
||||
dmu_read_flags);
|
||||
ASSERT0(error);
|
||||
error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
|
||||
DMU_READ_PREFETCH);
|
||||
dmu_read_flags);
|
||||
ASSERT0(error);
|
||||
|
||||
/*
|
||||
|
@ -5244,9 +5265,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
|
|||
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
|
||||
|
||||
VERIFY0(dmu_read(os, packobj, packoff,
|
||||
packsize, packcheck, DMU_READ_PREFETCH));
|
||||
packsize, packcheck, dmu_read_flags));
|
||||
VERIFY0(dmu_read(os, bigobj, bigoff,
|
||||
bigsize, bigcheck, DMU_READ_PREFETCH));
|
||||
bigsize, bigcheck, dmu_read_flags));
|
||||
|
||||
ASSERT0(memcmp(packbuf, packcheck, packsize));
|
||||
ASSERT0(memcmp(bigbuf, bigcheck, bigsize));
|
||||
|
@ -5336,6 +5357,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
|
|||
dmu_buf_t *bonus_db;
|
||||
arc_buf_t **bigbuf_arcbufs;
|
||||
dmu_object_info_t doi;
|
||||
uint32_t dmu_read_flags = DMU_READ_PREFETCH;
|
||||
|
||||
/*
|
||||
* We will randomly set when to do O_DIRECT on a read.
|
||||
*/
|
||||
if (ztest_random(4) == 0)
|
||||
dmu_read_flags |= DMU_DIRECTIO;
|
||||
|
||||
size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
|
||||
od = umem_alloc(size, UMEM_NOFAIL);
|
||||
|
@ -5466,10 +5494,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
|
|||
*/
|
||||
if (i != 0 || ztest_random(2) != 0) {
|
||||
error = dmu_read(os, packobj, packoff,
|
||||
packsize, packbuf, DMU_READ_PREFETCH);
|
||||
packsize, packbuf, dmu_read_flags);
|
||||
ASSERT0(error);
|
||||
error = dmu_read(os, bigobj, bigoff, bigsize,
|
||||
bigbuf, DMU_READ_PREFETCH);
|
||||
bigbuf, dmu_read_flags);
|
||||
ASSERT0(error);
|
||||
}
|
||||
compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
|
||||
|
@ -5529,9 +5557,9 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
|
|||
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
|
||||
|
||||
VERIFY0(dmu_read(os, packobj, packoff,
|
||||
packsize, packcheck, DMU_READ_PREFETCH));
|
||||
packsize, packcheck, dmu_read_flags));
|
||||
VERIFY0(dmu_read(os, bigobj, bigoff,
|
||||
bigsize, bigcheck, DMU_READ_PREFETCH));
|
||||
bigsize, bigcheck, dmu_read_flags));
|
||||
|
||||
ASSERT0(memcmp(packbuf, packcheck, packsize));
|
||||
ASSERT0(memcmp(bigbuf, bigcheck, bigsize));
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
dnl #
|
||||
dnl # get_user_pages_unlocked() function was not available till 4.0.
|
||||
dnl # In earlier kernels (< 4.0) get_user_pages() is available.
|
||||
dnl #
|
||||
dnl # 4.0 API change,
|
||||
dnl # long get_user_pages_unlocked(struct task_struct *tsk,
|
||||
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages,
|
||||
dnl # int write, int force, struct page **pages)
|
||||
dnl #
|
||||
dnl # 4.8 API change,
|
||||
dnl # long get_user_pages_unlocked(unsigned long start,
|
||||
dnl # unsigned long nr_pages, int write, int force, struct page **page)
|
||||
dnl #
|
||||
dnl # 4.9 API change,
|
||||
dnl # long get_user_pages_unlocked(unsigned long start, int nr_pages,
|
||||
dnl # struct page **pages, unsigned int gup_flags)
|
||||
dnl #
|
||||
|
||||
dnl#
|
||||
dnl# Check available get_user_pages/_unlocked interfaces.
|
||||
dnl#
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [
|
||||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [
|
||||
#include <linux/mm.h>
|
||||
], [
|
||||
unsigned long start = 0;
|
||||
unsigned long nr_pages = 1;
|
||||
unsigned int gup_flags = 0;
|
||||
struct page **pages = NULL;
|
||||
long ret __attribute__ ((unused));
|
||||
|
||||
ret = get_user_pages_unlocked(start, nr_pages, pages,
|
||||
gup_flags);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [
|
||||
#include <linux/mm.h>
|
||||
], [
|
||||
unsigned long start = 0;
|
||||
unsigned long nr_pages = 1;
|
||||
int write = 0;
|
||||
int force = 0;
|
||||
long ret __attribute__ ((unused));
|
||||
struct page **pages = NULL;
|
||||
|
||||
ret = get_user_pages_unlocked(start, nr_pages, write, force,
|
||||
pages);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [
|
||||
#include <linux/mm.h>
|
||||
], [
|
||||
struct task_struct *tsk = NULL;
|
||||
struct mm_struct *mm = NULL;
|
||||
unsigned long start = 0;
|
||||
unsigned long nr_pages = 1;
|
||||
int write = 0;
|
||||
int force = 0;
|
||||
struct page **pages = NULL;
|
||||
long ret __attribute__ ((unused));
|
||||
|
||||
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
|
||||
force, pages);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct_gup_flags], [
|
||||
#include <linux/mm.h>
|
||||
], [
|
||||
struct task_struct *tsk = NULL;
|
||||
struct mm_struct *mm = NULL;
|
||||
unsigned long start = 0;
|
||||
unsigned long nr_pages = 1;
|
||||
struct page **pages = NULL;
|
||||
unsigned int gup_flags = 0;
|
||||
long ret __attribute__ ((unused));
|
||||
|
||||
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages,
|
||||
pages, gup_flags);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [
|
||||
#include <linux/mm.h>
|
||||
], [
|
||||
struct task_struct *tsk = NULL;
|
||||
struct mm_struct *mm = NULL;
|
||||
struct vm_area_struct **vmas = NULL;
|
||||
unsigned long start = 0;
|
||||
unsigned long nr_pages = 1;
|
||||
int write = 0;
|
||||
int force = 0;
|
||||
struct page **pages = NULL;
|
||||
int ret __attribute__ ((unused));
|
||||
|
||||
ret = get_user_pages(tsk, mm, start, nr_pages, write,
|
||||
force, pages, vmas);
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest.
|
||||
dnl # We first check for get_user_pages_unlocked as that is available in
|
||||
dnl # newer kernels.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [
|
||||
dnl #
|
||||
dnl # Current API (as of 4.9) of get_user_pages_unlocked
|
||||
dnl #
|
||||
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags])
|
||||
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1,
|
||||
[get_user_pages_unlocked() takes gup flags])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
|
||||
dnl #
|
||||
dnl # 4.8 API change, get_user_pages_unlocked
|
||||
dnl #
|
||||
AC_MSG_CHECKING(
|
||||
[whether get_user_pages_unlocked() takes write flag])
|
||||
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1,
|
||||
[get_user_pages_unlocked() takes write flag])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
|
||||
dnl #
|
||||
dnl # 4.0-4.3, 4.5-4.7 API, get_user_pages_unlocked
|
||||
dnl #
|
||||
AC_MSG_CHECKING(
|
||||
[whether get_user_pages_unlocked() takes task_struct])
|
||||
ZFS_LINUX_TEST_RESULT(
|
||||
[get_user_pages_unlocked_task_struct], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(
|
||||
HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1,
|
||||
[get_user_pages_unlocked() takes task_struct])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
|
||||
dnl #
|
||||
dnl # 4.4 API, get_user_pages_unlocked
|
||||
dnl #
|
||||
AC_MSG_CHECKING(
|
||||
[whether get_user_pages_unlocked() takes task_struct, gup_flags])
|
||||
ZFS_LINUX_TEST_RESULT(
|
||||
[get_user_pages_unlocked_task_struct_gup_flags], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(
|
||||
HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS, 1,
|
||||
[get_user_pages_unlocked() takes task_struct, gup_flags])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
|
||||
dnl #
|
||||
dnl # get_user_pages
|
||||
dnl #
|
||||
AC_MSG_CHECKING(
|
||||
[whether get_user_pages() takes struct task_struct])
|
||||
ZFS_LINUX_TEST_RESULT(
|
||||
[get_user_pages_task_struct], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(
|
||||
HAVE_GET_USER_PAGES_TASK_STRUCT, 1,
|
||||
[get_user_pages() takes task_struct])
|
||||
], [
|
||||
dnl #
|
||||
dnl # If we cannot map the user's
|
||||
dnl # pages in then we cannot do
|
||||
dnl # Direct I/O
|
||||
dnl #
|
||||
ZFS_LINUX_TEST_ERROR([Direct I/O])
|
||||
])
|
||||
])
|
||||
])
|
||||
])
|
||||
])
|
||||
])
|
|
@ -1,5 +1,5 @@
|
|||
dnl #
|
||||
dnl # Check for direct IO interfaces.
|
||||
dnl # Check for Direct I/O interfaces.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [
|
||||
ZFS_LINUX_TEST_SRC([direct_io_iter], [
|
||||
|
@ -100,7 +100,7 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [
|
|||
AC_DEFINE(HAVE_VFS_DIRECT_IO_IOVEC, 1,
|
||||
[aops->direct_IO() uses iovec])
|
||||
],[
|
||||
ZFS_LINUX_TEST_ERROR([direct IO])
|
||||
ZFS_LINUX_TEST_ERROR([Direct I/O])
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
|
|
|
@ -85,6 +85,34 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
|
|||
bytes = copy_from_iter((void *)&buf, size, &iter);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [
|
||||
#include <linux/uio.h>
|
||||
], [
|
||||
struct iov_iter iter = { 0 };
|
||||
struct page **pages = NULL;
|
||||
size_t maxsize = 4096;
|
||||
unsigned maxpages = 1;
|
||||
size_t start;
|
||||
size_t ret __attribute__ ((unused));
|
||||
|
||||
ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages,
|
||||
&start);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([iov_iter_get_pages], [
|
||||
#include <linux/uio.h>
|
||||
], [
|
||||
struct iov_iter iter = { 0 };
|
||||
struct page **pages = NULL;
|
||||
size_t maxsize = 4096;
|
||||
unsigned maxpages = 1;
|
||||
size_t start;
|
||||
size_t ret __attribute__ ((unused));
|
||||
|
||||
ret = iov_iter_get_pages(&iter, pages, maxsize, maxpages,
|
||||
&start);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([iov_iter_type], [
|
||||
#include <linux/fs.h>
|
||||
#include <linux/uio.h>
|
||||
|
@ -184,6 +212,27 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
|
|||
enable_vfs_iov_iter="no"
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2().
|
||||
dnl #
|
||||
AC_MSG_CHECKING([whether iov_iter_get_pages2() is available])
|
||||
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1,
|
||||
[iov_iter_get_pages2() is available])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
AC_MSG_CHECKING([whether iov_iter_get_pages() is available])
|
||||
ZFS_LINUX_TEST_RESULT([iov_iter_get_pages], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_IOV_ITER_GET_PAGES, 1,
|
||||
[iov_iter_get_pages() is available])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
enable_vfs_iov_iter="no"
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # This checks for iov_iter_type() in linux/uio.h. It is not
|
||||
dnl # required, however, and the module will be compiled without it
|
||||
|
|
|
@ -79,6 +79,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|||
ZFS_AC_KERNEL_SRC_SHOW_OPTIONS
|
||||
ZFS_AC_KERNEL_SRC_FILE_INODE
|
||||
ZFS_AC_KERNEL_SRC_FILE_DENTRY
|
||||
ZFS_AC_KERNEL_SRC_FILEMAP
|
||||
ZFS_AC_KERNEL_SRC_FSYNC
|
||||
ZFS_AC_KERNEL_SRC_AIO_FSYNC
|
||||
ZFS_AC_KERNEL_SRC_EVICT_INODE
|
||||
|
@ -111,6 +112,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|||
ZFS_AC_KERNEL_SRC_VFS_GETATTR
|
||||
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
|
||||
ZFS_AC_KERNEL_SRC_VFS_ITERATE
|
||||
ZFS_AC_KERNEL_SRC_GET_USER_PAGES
|
||||
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
|
||||
ZFS_AC_KERNEL_SRC_VFS_READPAGES
|
||||
ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS
|
||||
|
@ -234,6 +236,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|||
ZFS_AC_KERNEL_SHOW_OPTIONS
|
||||
ZFS_AC_KERNEL_FILE_INODE
|
||||
ZFS_AC_KERNEL_FILE_DENTRY
|
||||
ZFS_AC_KERNEL_FILEMAP
|
||||
ZFS_AC_KERNEL_FSYNC
|
||||
ZFS_AC_KERNEL_AIO_FSYNC
|
||||
ZFS_AC_KERNEL_EVICT_INODE
|
||||
|
@ -266,6 +269,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|||
ZFS_AC_KERNEL_VFS_GETATTR
|
||||
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
|
||||
ZFS_AC_KERNEL_VFS_ITERATE
|
||||
ZFS_AC_KERNEL_GET_USER_PAGES
|
||||
ZFS_AC_KERNEL_VFS_DIRECT_IO
|
||||
ZFS_AC_KERNEL_VFS_READPAGES
|
||||
ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS
|
||||
|
|
|
@ -70,4 +70,5 @@ typedef enum {
|
|||
#define mutex_exit(lock) sx_xunlock(lock)
|
||||
#define mutex_owned(lock) sx_xlocked(lock)
|
||||
#define mutex_owner(lock) sx_xholder(lock)
|
||||
|
||||
#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#include <sys/types.h>
|
||||
#include_next <sys/param.h>
|
||||
#define PAGESIZE PAGE_SIZE
|
||||
#define PAGESHIFT PAGE_SHIFT
|
||||
#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT)
|
||||
#ifdef _KERNEL
|
||||
#include <sys/systm.h>
|
||||
|
|
|
@ -34,13 +34,30 @@
|
|||
#include_next <sys/uio.h>
|
||||
#include <sys/_uio.h>
|
||||
#include <sys/debug.h>
|
||||
#include <sys/sysmacros.h>
|
||||
|
||||
/*
|
||||
* uio_extflg: extended flags
|
||||
*/
|
||||
#define UIO_DIRECT 0x0001 /* Direct I/O request */
|
||||
|
||||
typedef struct iovec iovec_t;
|
||||
typedef enum uio_seg zfs_uio_seg_t;
|
||||
typedef enum uio_rw zfs_uio_rw_t;
|
||||
|
||||
/*
|
||||
* This structure is used when doing Direct I/O.
|
||||
*/
|
||||
typedef struct {
|
||||
vm_page_t *pages;
|
||||
int npages;
|
||||
} zfs_uio_dio_t;
|
||||
|
||||
typedef struct zfs_uio {
|
||||
struct uio *uio;
|
||||
offset_t uio_soffset;
|
||||
uint16_t uio_extflg;
|
||||
zfs_uio_dio_t uio_dio;
|
||||
} zfs_uio_t;
|
||||
|
||||
#define GET_UIO_STRUCT(u) (u)->uio
|
||||
|
@ -52,6 +69,7 @@ typedef struct zfs_uio {
|
|||
#define zfs_uio_iovbase(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_base
|
||||
#define zfs_uio_td(u) GET_UIO_STRUCT(u)->uio_td
|
||||
#define zfs_uio_rw(u) GET_UIO_STRUCT(u)->uio_rw
|
||||
#define zfs_uio_soffset(u) (u)->uio_soffset
|
||||
#define zfs_uio_fault_disable(u, set)
|
||||
#define zfs_uio_prefaultpages(size, u) (0)
|
||||
|
||||
|
@ -61,6 +79,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
|
|||
zfs_uio_offset(uio) = off;
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off)
|
||||
{
|
||||
ASSERT3U(zfs_uio_offset(uio), ==, off);
|
||||
zfs_uio_soffset(uio) = off;
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
|
||||
{
|
||||
|
@ -71,7 +96,11 @@ zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
|
|||
static __inline void
|
||||
zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s)
|
||||
{
|
||||
memset(uio, 0, sizeof (zfs_uio_t));
|
||||
if (uio_s != NULL) {
|
||||
GET_UIO_STRUCT(uio) = uio_s;
|
||||
zfs_uio_soffset(uio) = uio_s->uio_offset;
|
||||
}
|
||||
}
|
||||
|
||||
int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio);
|
||||
|
|
|
@ -26,10 +26,15 @@
|
|||
#ifndef _ABD_OS_H
|
||||
#define _ABD_OS_H
|
||||
|
||||
#include <sys/vm.h>
|
||||
#include <vm/vm_page.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct abd;
|
||||
|
||||
struct abd_scatter {
|
||||
uint_t abd_offset;
|
||||
void *abd_chunks[1]; /* actually variable-length */
|
||||
|
@ -37,8 +42,14 @@ struct abd_scatter {
|
|||
|
||||
struct abd_linear {
|
||||
void *abd_buf;
|
||||
#if defined(_KERNEL)
|
||||
struct sf_buf *sf; /* for LINEAR_PAGE FreeBSD */
|
||||
#endif
|
||||
};
|
||||
|
||||
__attribute__((malloc))
|
||||
struct abd *abd_alloc_from_pages(vm_page_t *, unsigned long, uint64_t);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -38,6 +38,8 @@
|
|||
#define zfs_kmap_local(page) kmap_atomic(page)
|
||||
#define zfs_kunmap_local(addr) kunmap_atomic(addr)
|
||||
#endif
|
||||
#define zfs_kmap(page) kmap(page)
|
||||
#define zfs_kunmap(page) kunmap(page)
|
||||
|
||||
/* 5.0 API change - no more 'type' argument for access_ok() */
|
||||
#ifdef HAVE_ACCESS_OK_TYPE
|
||||
|
@ -46,4 +48,49 @@
|
|||
#define zfs_access_ok(type, addr, size) access_ok(addr, size)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* read returning FOLL_WRITE is due to the fact that we are stating
|
||||
* that the kernel will have write access to the user pages. So, when
|
||||
* a Direct I/O read request is issued, the kernel must write to the user
|
||||
* pages.
|
||||
*
|
||||
* get_user_pages_unlocked was not available until 4.0, so we also check
|
||||
* for get_user_pages on older kernels.
|
||||
*/
|
||||
/* 4.9 API change - force and write flags are passed as gup flags */
|
||||
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS)
|
||||
#define zfs_get_user_pages(addr, numpages, read, pages) \
|
||||
get_user_pages_unlocked(addr, numpages, pages, read ? FOLL_WRITE : 0)
|
||||
|
||||
/* 4.8 API change - no longer takes struct task_struct as argument */
|
||||
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG)
|
||||
#define zfs_get_user_pages(addr, numpages, read, pages) \
|
||||
get_user_pages_unlocked(addr, numpages, read, 0, pages)
|
||||
|
||||
/* 4.0-4.3, 4.5-4.7 API */
|
||||
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT)
|
||||
#define zfs_get_user_pages(addr, numpages, read, pages) \
|
||||
get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \
|
||||
pages)
|
||||
|
||||
/* 4.4 API */
|
||||
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS)
|
||||
#define zfs_get_user_pages(addr, numpages, read, pages) \
|
||||
get_user_pages_unlocked(current, current->mm, addr, numpages, pages, \
|
||||
read ? FOLL_WRITE : 0)
|
||||
|
||||
/* Using get_user_pages if kernel is < 4.0 */
|
||||
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT)
|
||||
#define zfs_get_user_pages(addr, numpages, read, pages) \
|
||||
get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \
|
||||
NULL)
|
||||
#else
|
||||
/*
|
||||
* This case is unreachable. We must be able to use either
|
||||
* get_user_pages_unlocked() or get_user_pages() to map user pages into
|
||||
* the kernel.
|
||||
*/
|
||||
#error "Unknown Direct I/O interface"
|
||||
#endif
|
||||
|
||||
#endif /* _ZFS_KMAP_H */
|
||||
|
|
|
@ -33,6 +33,12 @@
|
|||
#include <linux/bio.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/string.h>
|
||||
|
||||
/*
|
||||
* uio_extflg: extended flags
|
||||
*/
|
||||
#define UIO_DIRECT 0x0001 /* Direct I/O request */
|
||||
|
||||
#if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE)
|
||||
#define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b)
|
||||
|
@ -54,6 +60,14 @@ typedef enum zfs_uio_seg {
|
|||
#endif
|
||||
} zfs_uio_seg_t;
|
||||
|
||||
/*
|
||||
* This structure is used when doing Direct I/O.
|
||||
*/
|
||||
typedef struct {
|
||||
struct page **pages; /* Mapped pages */
|
||||
long npages; /* Number of mapped pages */
|
||||
} zfs_uio_dio_t;
|
||||
|
||||
typedef struct zfs_uio {
|
||||
union {
|
||||
const struct iovec *uio_iov;
|
||||
|
@ -62,15 +76,16 @@ typedef struct zfs_uio {
|
|||
struct iov_iter *uio_iter;
|
||||
#endif
|
||||
};
|
||||
int uio_iovcnt;
|
||||
offset_t uio_loffset;
|
||||
zfs_uio_seg_t uio_segflg;
|
||||
int uio_iovcnt; /* Number of iovecs */
|
||||
offset_t uio_soffset; /* Starting logical offset */
|
||||
offset_t uio_loffset; /* Current logical offset */
|
||||
zfs_uio_seg_t uio_segflg; /* Segment type */
|
||||
boolean_t uio_fault_disable;
|
||||
uint16_t uio_fmode;
|
||||
uint16_t uio_extflg;
|
||||
ssize_t uio_resid;
|
||||
|
||||
size_t uio_skip;
|
||||
uint16_t uio_fmode; /* Access mode (unused) */
|
||||
uint16_t uio_extflg; /* Extra flags (UIO_DIRECT) */
|
||||
ssize_t uio_resid; /* Residual unprocessed bytes */
|
||||
size_t uio_skip; /* Skipped bytes in current iovec */
|
||||
zfs_uio_dio_t uio_dio; /* Direct I/O user pages */
|
||||
|
||||
struct request *rq;
|
||||
} zfs_uio_t;
|
||||
|
@ -83,6 +98,7 @@ typedef struct zfs_uio {
|
|||
#define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len
|
||||
#define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base
|
||||
#define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set
|
||||
#define zfs_uio_soffset(u) (u)->uio_soffset
|
||||
#define zfs_uio_rlimit_fsize(z, u) (0)
|
||||
#define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u))
|
||||
|
||||
|
@ -94,6 +110,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
|
|||
uio->uio_loffset = off;
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off)
|
||||
{
|
||||
ASSERT3U(zfs_uio_offset(uio), ==, off);
|
||||
zfs_uio_soffset(uio) = off;
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
|
||||
{
|
||||
|
@ -117,6 +140,8 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov,
|
|||
uio->uio_extflg = 0;
|
||||
uio->uio_resid = resid;
|
||||
uio->uio_skip = skip;
|
||||
uio->uio_soffset = uio->uio_loffset;
|
||||
memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t));
|
||||
}
|
||||
|
||||
static inline void
|
||||
|
@ -146,6 +171,8 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq)
|
|||
}
|
||||
|
||||
uio->rq = rq;
|
||||
uio->uio_soffset = uio->uio_loffset;
|
||||
memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t));
|
||||
}
|
||||
|
||||
#if defined(HAVE_VFS_IOV_ITER)
|
||||
|
@ -162,8 +189,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset,
|
|||
uio->uio_extflg = 0;
|
||||
uio->uio_resid = resid;
|
||||
uio->uio_skip = skip;
|
||||
uio->uio_soffset = uio->uio_loffset;
|
||||
memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t));
|
||||
}
|
||||
#endif
|
||||
#endif /* HAVE_VFS_IOV_ITER */
|
||||
|
||||
#if defined(HAVE_ITER_IOV)
|
||||
#define zfs_uio_iter_iov(iter) iter_iov((iter))
|
||||
|
|
|
@ -55,6 +55,9 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
|
|||
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
|
||||
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
|
||||
|
||||
__attribute__((malloc))
|
||||
abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -184,12 +184,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
|
|||
extern void zfs_inode_destroy(struct inode *);
|
||||
extern void zfs_mark_inode_dirty(struct inode *);
|
||||
extern boolean_t zfs_relatime_need_update(const struct inode *);
|
||||
|
||||
#if defined(HAVE_UIO_RW)
|
||||
extern caddr_t zfs_map_page(page_t *, enum seg_rw);
|
||||
extern void zfs_unmap_page(page_t *, caddr_t);
|
||||
#endif /* HAVE_UIO_RW */
|
||||
|
||||
extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -32,7 +32,6 @@
|
|||
#include <linux/exportfs.h>
|
||||
#include <linux/falloc.h>
|
||||
#include <linux/parser.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/vfs_compat.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/xattr_compat.h>
|
||||
|
|
|
@ -46,6 +46,7 @@ typedef enum abd_flags {
|
|||
ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */
|
||||
ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */
|
||||
ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */
|
||||
ABD_FLAG_FROM_PAGES = 1 << 9, /* does not own pages */
|
||||
} abd_flags_t;
|
||||
|
||||
typedef struct abd {
|
||||
|
@ -200,6 +201,12 @@ abd_get_size(abd_t *abd)
|
|||
return (abd->abd_size);
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
abd_is_from_pages(abd_t *abd)
|
||||
{
|
||||
return ((abd->abd_flags & ABD_FLAG_FROM_PAGES) ? B_TRUE : B_FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Module lifecycle
|
||||
* Defined in each specific OS's abd_os.c
|
||||
|
|
|
@ -43,6 +43,9 @@ typedef enum abd_stats_op {
|
|||
/* forward declarations */
|
||||
struct scatterlist;
|
||||
struct page;
|
||||
#if defined(__FreeBSD__) && defined(_KERNEL)
|
||||
struct sf_buf;
|
||||
#endif
|
||||
|
||||
struct abd_iter {
|
||||
/* public interface */
|
||||
|
@ -70,7 +73,11 @@ struct abd_iter {
|
|||
size_t iter_pos;
|
||||
size_t iter_offset; /* offset in current sg/abd_buf, */
|
||||
/* abd_offset included */
|
||||
#if defined(__FreeBSD__) && defined(_KERNEL)
|
||||
struct sf_buf *sf; /* used to map in vm_page_t FreeBSD */
|
||||
#else
|
||||
struct scatterlist *iter_sg; /* current sg */
|
||||
#endif
|
||||
};
|
||||
|
||||
extern abd_t *abd_zero_scatter;
|
||||
|
@ -78,6 +85,7 @@ extern abd_t *abd_zero_scatter;
|
|||
abd_t *abd_gang_get_offset(abd_t *, size_t *);
|
||||
abd_t *abd_alloc_struct(size_t);
|
||||
void abd_free_struct(abd_t *);
|
||||
void abd_init_struct(abd_t *);
|
||||
|
||||
/*
|
||||
* OS specific functions
|
||||
|
@ -108,9 +116,9 @@ void abd_iter_page(struct abd_iter *);
|
|||
#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
|
||||
#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
|
||||
|
||||
#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
|
||||
#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
|
||||
#define ABD_GANG(abd) (abd->abd_u.abd_gang)
|
||||
#define ABD_SCATTER(abd) ((abd)->abd_u.abd_scatter)
|
||||
#define ABD_LINEAR_BUF(abd) ((abd)->abd_u.abd_linear.abd_buf)
|
||||
#define ABD_GANG(abd) ((abd)->abd_u.abd_gang)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -120,7 +120,7 @@ typedef enum arc_flags
|
|||
|
||||
/*
|
||||
* Private ARC flags. These flags are private ARC only flags that
|
||||
* will show up in b_flags in the arc_hdr_buf_t. These flags should
|
||||
* will show up in b_flags in the arc_buf_hdr_t. These flags should
|
||||
* only be set by ARC code.
|
||||
*/
|
||||
ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
|
||||
|
@ -179,7 +179,6 @@ typedef enum arc_flags
|
|||
ARC_FLAG_COMPRESS_4 = 1 << 28,
|
||||
ARC_FLAG_COMPRESS_5 = 1 << 29,
|
||||
ARC_FLAG_COMPRESS_6 = 1 << 30
|
||||
|
||||
} arc_flags_t;
|
||||
|
||||
typedef enum arc_buf_flags {
|
||||
|
|
|
@ -61,17 +61,17 @@ extern "C" {
|
|||
/*
|
||||
* The simplified state transition diagram for dbufs looks like:
|
||||
*
|
||||
* +--> READ --+
|
||||
* +-------> READ ------+
|
||||
* | |
|
||||
* | V
|
||||
* (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
|
||||
* ^ | ^ ^
|
||||
* | | | |
|
||||
* | +--> FILL --+ |
|
||||
* | | |
|
||||
* | | |
|
||||
* | +------> NOFILL -----+
|
||||
* | |
|
||||
* | +-------> FILL ------+ |
|
||||
* | | | |
|
||||
* | | | |
|
||||
* | +------> NOFILL -----+-----> UNCACHED
|
||||
* | | (Direct I/O)
|
||||
* +---------------+
|
||||
*
|
||||
* DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
|
||||
|
@ -176,6 +176,7 @@ typedef struct dbuf_dirty_record {
|
|||
uint8_t dr_copies;
|
||||
boolean_t dr_nopwrite;
|
||||
boolean_t dr_brtwrite;
|
||||
boolean_t dr_diowrite;
|
||||
boolean_t dr_has_raw_params;
|
||||
|
||||
/*
|
||||
|
@ -384,7 +385,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
|
|||
uint64_t blkid, uint64_t *hash_out);
|
||||
|
||||
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
|
||||
void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx);
|
||||
void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx);
|
||||
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
|
||||
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
|
||||
boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
|
||||
|
@ -393,6 +394,8 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
|||
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
|
||||
dmu_tx_t *tx);
|
||||
boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
|
||||
int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp);
|
||||
int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa);
|
||||
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
|
||||
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
|
||||
bp_embedded_type_t etype, enum zio_compress comp,
|
||||
|
@ -473,7 +476,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
|
|||
(dbuf_is_metadata(_db) && \
|
||||
((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
|
||||
|
||||
boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db);
|
||||
boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp);
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
|
||||
|
|
|
@ -525,6 +525,7 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
|||
#define WP_NOFILL 0x1
|
||||
#define WP_DMU_SYNC 0x2
|
||||
#define WP_SPILL 0x4
|
||||
#define WP_DIRECT_WR 0x8
|
||||
|
||||
void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
|
||||
struct zio_prop *zp);
|
||||
|
@ -589,6 +590,7 @@ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
|
|||
dmu_buf_t ***dbpp, uint32_t flags);
|
||||
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
|
||||
dmu_buf_t **dbp);
|
||||
|
||||
/*
|
||||
* Add a reference to a dmu buffer that has already been held via
|
||||
* dmu_buf_hold() in the current context.
|
||||
|
@ -873,14 +875,18 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
|
|||
#define DMU_READ_PREFETCH 0 /* prefetch */
|
||||
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
|
||||
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
|
||||
#define DMU_DIRECTIO 4 /* use Direct I/O */
|
||||
|
||||
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
void *buf, uint32_t flags);
|
||||
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
|
||||
uint32_t flags);
|
||||
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx);
|
||||
void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx);
|
||||
int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx, uint32_t flags);
|
||||
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
||||
dmu_tx_t *tx);
|
||||
#ifdef _KERNEL
|
||||
|
|
|
@ -35,6 +35,10 @@
|
|||
#include <sys/dnode.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/zfs_ioctl.h>
|
||||
#include <sys/uio.h>
|
||||
#include <sys/abd.h>
|
||||
#include <sys/arc.h>
|
||||
#include <sys/dbuf.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -134,7 +138,7 @@ extern "C" {
|
|||
* db_data_pending
|
||||
* db_dirtied
|
||||
* db_link
|
||||
* db_dirty_node (??)
|
||||
* db_dirty_records
|
||||
* db_dirtycnt
|
||||
* db_d.*
|
||||
* db.*
|
||||
|
@ -150,8 +154,10 @@ extern "C" {
|
|||
* dbuf_find: none (db_holds)
|
||||
* dbuf_hash_insert: none (db_holds)
|
||||
* dmu_buf_read_array_impl: none (db_state, db_changed)
|
||||
* dmu_sync: none (db_dirty_node, db_d)
|
||||
* dmu_sync: none (db_dirty_records, db_d)
|
||||
* dnode_reallocate: none (db)
|
||||
* dmu_write_direct: none (db_dirty_records, db_d)
|
||||
* dmu_write_direct_done: none (db_dirty_records, db_d)
|
||||
*
|
||||
* dn_mtx (leaf)
|
||||
* protects:
|
||||
|
@ -234,8 +240,9 @@ extern "C" {
|
|||
* dnode_new_blkid
|
||||
*/
|
||||
|
||||
struct objset;
|
||||
struct dmu_pool;
|
||||
struct dmu_buf;
|
||||
struct zgd;
|
||||
|
||||
typedef struct dmu_sendstatus {
|
||||
list_node_t dss_link;
|
||||
|
@ -245,9 +252,30 @@ typedef struct dmu_sendstatus {
|
|||
uint64_t dss_blocks; /* blocks visited during the sending process */
|
||||
} dmu_sendstatus_t;
|
||||
|
||||
/*
|
||||
* dmu_sync_{ready/done} args
|
||||
*/
|
||||
typedef struct {
|
||||
dbuf_dirty_record_t *dsa_dr;
|
||||
void (*dsa_done)(struct zgd *, int);
|
||||
struct zgd *dsa_zgd;
|
||||
dmu_tx_t *dsa_tx;
|
||||
} dmu_sync_arg_t;
|
||||
|
||||
void dmu_sync_done(zio_t *, arc_buf_t *buf, void *varg);
|
||||
void dmu_sync_ready(zio_t *, arc_buf_t *buf, void *varg);
|
||||
|
||||
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
|
||||
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
|
||||
|
||||
int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *);
|
||||
int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags);
|
||||
int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *);
|
||||
#if defined(_KERNEL)
|
||||
int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t);
|
||||
int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -134,6 +134,7 @@ struct objset {
|
|||
zfs_cache_type_t os_secondary_cache;
|
||||
zfs_prefetch_type_t os_prefetch;
|
||||
zfs_sync_type_t os_sync;
|
||||
zfs_direct_t os_direct;
|
||||
zfs_redundant_metadata_type_t os_redundant_metadata;
|
||||
uint64_t os_recordsize;
|
||||
/*
|
||||
|
|
|
@ -42,6 +42,7 @@ extern "C" {
|
|||
#define FM_EREPORT_ZFS_DATA "data"
|
||||
#define FM_EREPORT_ZFS_DELAY "delay"
|
||||
#define FM_EREPORT_ZFS_DEADMAN "deadman"
|
||||
#define FM_EREPORT_ZFS_DIO_VERIFY "dio_verify"
|
||||
#define FM_EREPORT_ZFS_POOL "zpool"
|
||||
#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
|
||||
#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
|
||||
|
@ -84,6 +85,7 @@ extern "C" {
|
|||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS "dio_verify_errors"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
|
||||
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
|
||||
|
|
|
@ -193,6 +193,7 @@ typedef enum {
|
|||
ZFS_PROP_SNAPSHOTS_CHANGED,
|
||||
ZFS_PROP_PREFETCH,
|
||||
ZFS_PROP_VOLTHREADING,
|
||||
ZFS_PROP_DIRECT,
|
||||
ZFS_NUM_PROPS
|
||||
} zfs_prop_t;
|
||||
|
||||
|
@ -533,6 +534,12 @@ typedef enum {
|
|||
ZFS_VOLMODE_NONE = 3
|
||||
} zfs_volmode_t;
|
||||
|
||||
typedef enum {
|
||||
ZFS_DIRECT_DISABLED = 0,
|
||||
ZFS_DIRECT_STANDARD,
|
||||
ZFS_DIRECT_ALWAYS
|
||||
} zfs_direct_t;
|
||||
|
||||
typedef enum zfs_keystatus {
|
||||
ZFS_KEYSTATUS_NONE = 0,
|
||||
ZFS_KEYSTATUS_UNAVAILABLE,
|
||||
|
@ -790,6 +797,9 @@ typedef struct zpool_load_policy {
|
|||
/* Number of slow IOs */
|
||||
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
|
||||
|
||||
/* Number of Direct I/O write verify errors */
|
||||
#define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors"
|
||||
|
||||
/* vdev enclosure sysfs path */
|
||||
#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path"
|
||||
|
||||
|
@ -1262,6 +1272,7 @@ typedef struct vdev_stat {
|
|||
uint64_t vs_physical_ashift; /* vdev_physical_ashift */
|
||||
uint64_t vs_noalloc; /* allocations halted? */
|
||||
uint64_t vs_pspace; /* physical capacity */
|
||||
uint64_t vs_dio_verify_errors; /* DIO write verify errors */
|
||||
} vdev_stat_t;
|
||||
|
||||
#define VDEV_STAT_VALID(field, uint64_t_field_count) \
|
||||
|
|
|
@ -949,6 +949,14 @@ typedef struct spa_iostats {
|
|||
kstat_named_t simple_trim_bytes_skipped;
|
||||
kstat_named_t simple_trim_extents_failed;
|
||||
kstat_named_t simple_trim_bytes_failed;
|
||||
kstat_named_t arc_read_count;
|
||||
kstat_named_t arc_read_bytes;
|
||||
kstat_named_t arc_write_count;
|
||||
kstat_named_t arc_write_bytes;
|
||||
kstat_named_t direct_read_count;
|
||||
kstat_named_t direct_read_bytes;
|
||||
kstat_named_t direct_write_count;
|
||||
kstat_named_t direct_write_bytes;
|
||||
} spa_iostats_t;
|
||||
|
||||
extern void spa_stats_init(spa_t *spa);
|
||||
|
@ -972,6 +980,10 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
|
|||
uint64_t extents_written, uint64_t bytes_written,
|
||||
uint64_t extents_skipped, uint64_t bytes_skipped,
|
||||
uint64_t extents_failed, uint64_t bytes_failed);
|
||||
extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops,
|
||||
uint32_t flags);
|
||||
extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops,
|
||||
uint32_t flags);
|
||||
extern void spa_import_progress_add(spa_t *spa);
|
||||
extern void spa_import_progress_remove(uint64_t spa_guid);
|
||||
extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
|
||||
|
|
|
@ -40,10 +40,49 @@
|
|||
#define _SYS_UIO_IMPL_H
|
||||
|
||||
#include <sys/uio.h>
|
||||
#include <sys/sysmacros.h>
|
||||
|
||||
extern int zfs_uiomove(void *, size_t, zfs_uio_rw_t, zfs_uio_t *);
|
||||
extern int zfs_uiocopy(void *, size_t, zfs_uio_rw_t, zfs_uio_t *, size_t *);
|
||||
extern void zfs_uioskip(zfs_uio_t *, size_t);
|
||||
extern void zfs_uio_free_dio_pages(zfs_uio_t *, zfs_uio_rw_t);
|
||||
extern int zfs_uio_get_dio_pages_alloc(zfs_uio_t *, zfs_uio_rw_t);
|
||||
extern boolean_t zfs_uio_page_aligned(zfs_uio_t *);
|
||||
|
||||
#ifdef _KERNEL
|
||||
static inline boolean_t
|
||||
zfs_dio_page_aligned(void *buf)
|
||||
{
|
||||
return ((((uintptr_t)(buf) & (PAGESIZE - 1)) == 0) ?
|
||||
B_TRUE : B_FALSE);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz)
|
||||
{
|
||||
return (IS_P2ALIGNED(offset, blksz));
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_size_aligned(uint64_t size, uint64_t blksz)
|
||||
{
|
||||
return ((size % blksz) == 0);
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz)
|
||||
{
|
||||
return (zfs_dio_offset_aligned(offset, blksz) &&
|
||||
zfs_dio_size_aligned(size, blksz));
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
zfs_uio_aligned(zfs_uio_t *uio, uint64_t blksz)
|
||||
{
|
||||
return (zfs_dio_aligned(zfs_uio_offset(uio), zfs_uio_resid(uio),
|
||||
blksz));
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
|
||||
|
|
|
@ -448,9 +448,14 @@ struct vdev {
|
|||
/*
|
||||
* We rate limit ZIO delay, deadman, and checksum events, since they
|
||||
* can flood ZED with tons of events when a drive is acting up.
|
||||
*
|
||||
* We also rate limit Direct I/O write verify errors, since a user might
|
||||
* be continually manipulating a buffer that can flood ZED with tons of
|
||||
* events.
|
||||
*/
|
||||
zfs_ratelimit_t vdev_delay_rl;
|
||||
zfs_ratelimit_t vdev_deadman_rl;
|
||||
zfs_ratelimit_t vdev_dio_verify_rl;
|
||||
zfs_ratelimit_t vdev_checksum_rl;
|
||||
|
||||
/*
|
||||
|
@ -649,6 +654,11 @@ extern uint_t zfs_vdev_max_auto_ashift;
|
|||
int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS);
|
||||
int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS);
|
||||
|
||||
/*
|
||||
* VDEV checksum verification for Direct I/O writes
|
||||
*/
|
||||
extern uint_t zfs_vdev_direct_write_verify;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -26,12 +26,13 @@
|
|||
#ifndef _SYS_ZFS_RACCT_H
|
||||
#define _SYS_ZFS_RACCT_H
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/spa.h>
|
||||
|
||||
/*
|
||||
* Platform-dependent resource accounting hooks
|
||||
*/
|
||||
void zfs_racct_read(uint64_t size, uint64_t iops);
|
||||
void zfs_racct_write(uint64_t size, uint64_t iops);
|
||||
void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
|
||||
void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags);
|
||||
|
||||
#endif /* _SYS_ZFS_RACCT_H */
|
||||
|
|
|
@ -308,7 +308,7 @@ extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx,
|
|||
const char *dname, znode_t *szp, znode_t *wzp);
|
||||
extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
znode_t *zp, offset_t off, ssize_t len, boolean_t commit,
|
||||
zil_callback_t callback, void *callback_data);
|
||||
boolean_t o_direct, zil_callback_t callback, void *callback_data);
|
||||
extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
znode_t *zp, uint64_t off, uint64_t len);
|
||||
extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
|
|
|
@ -225,6 +225,7 @@ typedef uint64_t zio_flag_t;
|
|||
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
|
||||
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
|
||||
#define ZIO_FLAG_DELEGATED (1ULL << 30)
|
||||
#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 31)
|
||||
|
||||
#define ZIO_ALLOCATOR_NONE (-1)
|
||||
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
|
||||
|
@ -355,6 +356,7 @@ typedef struct zio_prop {
|
|||
boolean_t zp_brtwrite;
|
||||
boolean_t zp_encrypt;
|
||||
boolean_t zp_byteorder;
|
||||
boolean_t zp_direct_write;
|
||||
uint8_t zp_salt[ZIO_DATA_SALT_LEN];
|
||||
uint8_t zp_iv[ZIO_DATA_IV_LEN];
|
||||
uint8_t zp_mac[ZIO_DATA_MAC_LEN];
|
||||
|
|
|
@ -160,8 +160,9 @@ enum zio_stage {
|
|||
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */
|
||||
|
||||
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */
|
||||
ZIO_STAGE_DIO_CHECKSUM_VERIFY = 1 << 25, /* -W---- */
|
||||
|
||||
ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */
|
||||
ZIO_STAGE_DONE = 1 << 26 /* RWFCXT */
|
||||
};
|
||||
|
||||
#define ZIO_ROOT_PIPELINE \
|
||||
|
@ -227,6 +228,10 @@ enum zio_stage {
|
|||
ZIO_STAGE_DVA_THROTTLE | \
|
||||
ZIO_STAGE_DVA_ALLOCATE)
|
||||
|
||||
/*
 * Pipeline used for Direct I/O writes: the regular write pipeline with the
 * ISSUE_ASYNC stage masked out. The whole expansion is parenthesized so the
 * macro behaves as a single value when combined with other operators.
 */
#define	ZIO_DIRECT_WRITE_PIPELINE			\
	(ZIO_WRITE_PIPELINE &				\
	(~ZIO_STAGE_ISSUE_ASYNC))
|
||||
|
||||
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
|
||||
(ZIO_INTERLOCK_STAGES | \
|
||||
ZIO_VDEV_IO_STAGES | \
|
||||
|
|
|
@ -82,6 +82,32 @@ typedef struct zfs_uio {
|
|||
#define zfs_uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len
|
||||
#define zfs_uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_page_aligned(void *buf)
|
||||
{
|
||||
return ((((unsigned long)(buf) & (PAGESIZE - 1)) == 0) ?
|
||||
B_TRUE : B_FALSE);
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz)
|
||||
{
|
||||
return (IS_P2ALIGNED(offset, blksz));
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_size_aligned(uint64_t size, uint64_t blksz)
|
||||
{
|
||||
return ((size % blksz) == 0);
|
||||
}
|
||||
|
||||
static inline boolean_t
|
||||
zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz)
|
||||
{
|
||||
return (zfs_dio_offset_aligned(offset, blksz) &&
|
||||
zfs_dio_size_aligned(size, blksz));
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
|
||||
{
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -87,6 +87,7 @@ nodist_libzpool_la_SOURCES = \
|
|||
module/zfs/ddt_zap.c \
|
||||
module/zfs/dmu.c \
|
||||
module/zfs/dmu_diff.c \
|
||||
module/zfs/dmu_direct.c \
|
||||
module/zfs/dmu_object.c \
|
||||
module/zfs/dmu_objset.c \
|
||||
module/zfs/dmu_recv.c \
|
||||
|
|
|
@ -363,3 +363,67 @@ void
|
|||
abd_cache_reap_now(void)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* Borrow a raw buffer from an ABD without copying the contents of the ABD
|
||||
* into the buffer. If the ABD is scattered, this will alloate a raw buffer
|
||||
* whose contents are undefined. To copy over the existing data in the ABD, use
|
||||
* abd_borrow_buf_copy() instead.
|
||||
*/
|
||||
void *
|
||||
abd_borrow_buf(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf;
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, 0);
|
||||
if (abd_is_linear(abd)) {
|
||||
buf = abd_to_buf(abd);
|
||||
} else {
|
||||
buf = zio_buf_alloc(n);
|
||||
}
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
return (buf);
|
||||
}
|
||||
|
||||
void *
|
||||
abd_borrow_buf_copy(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf = abd_borrow_buf(abd, n);
|
||||
if (!abd_is_linear(abd)) {
|
||||
abd_copy_to_buf(buf, abd, n);
|
||||
}
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
|
||||
* not change the contents of the ABD and will ASSERT that you didn't modify
|
||||
* the buffer since it was borrowed. If you want any changes you made to buf to
|
||||
* be copied back to abd, use abd_return_buf_copy() instead.
|
||||
*/
|
||||
void
|
||||
abd_return_buf(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, n);
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
if (abd_is_linear(abd)) {
|
||||
ASSERT3P(buf, ==, abd_to_buf(abd));
|
||||
} else {
|
||||
ASSERT0(abd_cmp_buf(abd, buf, n));
|
||||
zio_buf_free(buf, n);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
if (!abd_is_linear(abd)) {
|
||||
abd_copy_from_buf(abd, buf, n);
|
||||
}
|
||||
abd_return_buf(abd, buf, n);
|
||||
}
|
||||
|
|
|
@ -291,6 +291,14 @@ Default dnode block size as a power of 2.
|
|||
.It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int
|
||||
Default dnode indirect block size as a power of 2.
|
||||
.
|
||||
.It Sy zfs_dio_enabled Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||
Enable Direct I/O.
|
||||
If this setting is 0, then all I/O requests will be directed through the ARC
|
||||
acting as though the dataset property
|
||||
.Sy direct
|
||||
was set to
|
||||
.Sy disabled .
|
||||
.
|
||||
.It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
|
||||
When attempting to log an output nvlist of an ioctl in the on-disk history,
|
||||
the output will not be stored if it is larger than this size (in bytes).
|
||||
|
@ -416,6 +424,26 @@ May be increased up to
|
|||
.Sy ASHIFT_MAX Po 16 Pc ,
|
||||
but this may negatively impact pool space efficiency.
|
||||
.
|
||||
.It Sy zfs_vdev_direct_write_verify Ns = Ns Sy Linux 1 | FreeBSD 0 Pq uint
|
||||
If non-zero, then a Direct I/O write's checksum will be verified every
|
||||
time the write is issued and before it is committed to the block pointer.
|
||||
In the event the checksum is not valid then the I/O operation will return EIO.
|
||||
This module parameter can be used to detect if the
|
||||
contents of the user's buffer have changed in the process of doing a Direct I/O
|
||||
write.
|
||||
It can also help to identify if reported checksum errors are tied to Direct I/O
|
||||
writes.
|
||||
Each verify error causes a
|
||||
.Sy dio_verify
|
||||
zevent.
|
||||
Direct Write I/O checksum verify errors can be seen with
|
||||
.Nm zpool Cm status Fl d .
|
||||
The default value for this is 1 on Linux, but is 0 for
|
||||
.Fx
|
||||
because user pages can be placed under write protection in
|
||||
.Fx
|
||||
before the Direct I/O write is issued.
|
||||
.
|
||||
.It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint
|
||||
Minimum ashift used when creating new top-level vdevs.
|
||||
.
|
||||
|
@ -1093,6 +1121,9 @@ This will smoothly handle between ten times and a tenth of this number.
|
|||
.Pp
|
||||
.Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 .
|
||||
.
|
||||
.It Sy zfs_dio_write_verify_events_per_second Ns = Ns Sy 20 Ns /s Pq uint
|
||||
Rate limit Direct I/O write verify events to this many per second.
|
||||
.
|
||||
.It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||
Disables requirement for IVset GUIDs to be present and match when doing a raw
|
||||
receive of encrypted datasets.
|
||||
|
|
|
@ -1039,6 +1039,44 @@ See the
|
|||
section of
|
||||
.Xr zfsconcepts 7 .
|
||||
.It Xo
|
||||
.Sy direct Ns = Ns Sy disabled Ns | Ns Sy standard Ns | Ns Sy always
|
||||
.Xc
|
||||
Controls the behavior of Direct I/O requests
|
||||
.Pq e.g. Dv O_DIRECT .
|
||||
The
|
||||
.Sy standard
|
||||
behavior for Direct I/O requests is to bypass the ARC when possible.
|
||||
These requests will not be cached and performance will be limited by the
|
||||
raw speed of the underlying disks
|
||||
.Pq Dv this is the default .
|
||||
.Sy always
|
||||
causes every properly aligned read or write to be treated as a direct request.
|
||||
.Sy disabled
|
||||
causes the O_DIRECT flag to be silently ignored and all direct requests will
|
||||
be handled by the ARC.
|
||||
This is the default behavior for OpenZFS 2.2 and prior releases.
|
||||
.Pp
|
||||
Bypassing the ARC requires that a direct request be correctly aligned.
|
||||
For write requests the starting offset and size of the request must be
|
||||
.Sy recordsize Ns
|
||||
-aligned, if not then the unaligned portion of the request will be silently
|
||||
redirected through the ARC.
|
||||
For read requests there is no
|
||||
.Sy recordsize
|
||||
alignment restriction on either the starting offset or size.
|
||||
All direct requests must use a page-aligned memory buffer and the request
|
||||
size must be a multiple of the page size or an error is returned.
|
||||
.Pp
|
||||
Concurrently mixing buffered and direct requests to overlapping regions of
|
||||
a file can decrease performance.
|
||||
However, the resulting file will always be coherent.
|
||||
For example, a direct read after a buffered write will return the data
|
||||
from the buffered write.
|
||||
Furthermore, if an application uses
|
||||
.Xr mmap 2
|
||||
based file access then in order to maintain coherency all direct requests
|
||||
are converted to buffered requests while the file is mapped.
|
||||
.It Xo
|
||||
.Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns
|
||||
.Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k
|
||||
.Xc
|
||||
|
|
|
@ -98,6 +98,17 @@ This can be an indicator of problems with the underlying storage device.
|
|||
The number of delay events is ratelimited by the
|
||||
.Sy zfs_slow_io_events_per_second
|
||||
module parameter.
|
||||
.It Sy dio_verify
|
||||
Issued when there was a checksum verify error after a Direct I/O write has been
|
||||
issued.
|
||||
This event can only take place if the module parameter
|
||||
.Sy zfs_vdev_direct_write_verify
|
||||
is not set to zero.
|
||||
See
|
||||
.Xr zfs 4
|
||||
for more details on the
|
||||
.Sy zfs_vdev_direct_write_verify
|
||||
module parameter.
|
||||
.It Sy config
|
||||
Issued every time a vdev change has been made to the pool.
|
||||
.It Sy zpool
|
||||
|
@ -408,8 +419,9 @@ ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT
|
|||
ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT
|
||||
|
||||
ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----
|
||||
ZIO_STAGE_DIO_CHECKSUM_VERIFY:0x02000000:-W----
|
||||
|
||||
ZIO_STAGE_DONE:0x02000000:RWFCXT
|
||||
ZIO_STAGE_DONE:0x04000000:RWFCXT
|
||||
.TE
|
||||
.
|
||||
.Sh I/O FLAGS
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
.Sh SYNOPSIS
|
||||
.Nm zpool
|
||||
.Cm status
|
||||
.Op Fl DegiLpPstvx
|
||||
.Op Fl dDegiLpPstvx
|
||||
.Op Fl T Sy u Ns | Ns Sy d
|
||||
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
|
||||
.Oo Ar pool Oc Ns …
|
||||
|
@ -81,6 +81,15 @@ to display vdevs in flat hierarchy instead of nested vdev objects.
|
|||
Specify
|
||||
.Sy --json-pool-key-guid
|
||||
to set pool GUID as key for pool objects instead of pool names.
|
||||
.It Fl d
|
||||
Display the number of Direct I/O write checksum verify errors that have occurred
|
||||
on a top-level VDEV.
|
||||
See
|
||||
.Sx zfs_vdev_direct_write_verify
|
||||
in
|
||||
.Xr zfs 4
|
||||
for details about the conditions that can cause Direct I/O write checksum
|
||||
verify failures to occur.
|
||||
.It Fl D
|
||||
Display a histogram of deduplication statistics, showing the allocated
|
||||
.Pq physically present on disk
|
||||
|
|
|
@ -327,6 +327,7 @@ ZFS_OBJS := \
|
|||
ddt_stats.o \
|
||||
ddt_zap.o \
|
||||
dmu.o \
|
||||
dmu_direct.o \
|
||||
dmu_diff.o \
|
||||
dmu_object.o \
|
||||
dmu_objset.o \
|
||||
|
|
|
@ -257,6 +257,7 @@ SRCS+= abd.c \
|
|||
ddt_stats.c \
|
||||
ddt_zap.c \
|
||||
dmu.c \
|
||||
dmu_direct.c \
|
||||
dmu_diff.c \
|
||||
dmu_object.c \
|
||||
dmu_objset.c \
|
||||
|
|
|
@ -44,6 +44,10 @@
|
|||
#include <sys/uio_impl.h>
|
||||
#include <sys/vnode.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/byteorder.h>
|
||||
#include <sys/lock.h>
|
||||
#include <sys/vm.h>
|
||||
#include <vm/vm_map.h>
|
||||
|
||||
static void
|
||||
zfs_freeuio(struct uio *uio)
|
||||
|
@ -115,3 +119,198 @@ zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio)
|
|||
ASSERT3U(zfs_uio_rw(uio), ==, dir);
|
||||
return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if the uio is page-aligned in memory.
|
||||
*/
|
||||
boolean_t
|
||||
zfs_uio_page_aligned(zfs_uio_t *uio)
|
||||
{
|
||||
const struct iovec *iov = GET_UIO_STRUCT(uio)->uio_iov;
|
||||
|
||||
for (int i = zfs_uio_iovcnt(uio); i > 0; iov++, i--) {
|
||||
uintptr_t addr = (uintptr_t)iov->iov_base;
|
||||
size_t size = iov->iov_len;
|
||||
if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
|
||||
return (B_FALSE);
|
||||
}
|
||||
}
|
||||
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_uio_set_pages_to_stable(zfs_uio_t *uio)
|
||||
{
|
||||
ASSERT3P(uio->uio_dio.pages, !=, NULL);
|
||||
ASSERT3S(uio->uio_dio.npages, >, 0);
|
||||
|
||||
for (int i = 0; i < uio->uio_dio.npages; i++) {
|
||||
vm_page_t page = uio->uio_dio.pages[i];
|
||||
ASSERT3P(page, !=, NULL);
|
||||
|
||||
MPASS(page == PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(page)));
|
||||
vm_page_busy_acquire(page, VM_ALLOC_SBUSY);
|
||||
pmap_remove_write(page);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
zfs_uio_release_stable_pages(zfs_uio_t *uio)
|
||||
{
|
||||
ASSERT3P(uio->uio_dio.pages, !=, NULL);
|
||||
for (int i = 0; i < uio->uio_dio.npages; i++) {
|
||||
vm_page_t page = uio->uio_dio.pages[i];
|
||||
|
||||
ASSERT3P(page, !=, NULL);
|
||||
vm_page_sunbusy(page);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the operation is marked as read, then we are stating the pages will be
|
||||
* written to and must be given write access.
|
||||
*/
|
||||
static int
|
||||
zfs_uio_hold_pages(unsigned long start, size_t len, int nr_pages,
|
||||
zfs_uio_rw_t rw, vm_page_t *pages)
|
||||
{
|
||||
vm_map_t map;
|
||||
vm_prot_t prot;
|
||||
int count;
|
||||
|
||||
map = &curthread->td_proc->p_vmspace->vm_map;
|
||||
ASSERT3S(len, >, 0);
|
||||
|
||||
prot = rw == UIO_READ ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ;
|
||||
count = vm_fault_quick_hold_pages(map, start, len, prot, pages,
|
||||
nr_pages);
|
||||
|
||||
return (count);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
|
||||
{
|
||||
ASSERT(uio->uio_extflg & UIO_DIRECT);
|
||||
ASSERT3P(uio->uio_dio.pages, !=, NULL);
|
||||
ASSERT(zfs_uio_rw(uio) == rw);
|
||||
|
||||
if (rw == UIO_WRITE)
|
||||
zfs_uio_release_stable_pages(uio);
|
||||
|
||||
vm_page_unhold_pages(&uio->uio_dio.pages[0],
|
||||
uio->uio_dio.npages);
|
||||
|
||||
kmem_free(uio->uio_dio.pages,
|
||||
uio->uio_dio.npages * sizeof (vm_page_t));
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_uio_get_user_pages(unsigned long start, int nr_pages,
|
||||
size_t len, zfs_uio_rw_t rw, vm_page_t *pages)
|
||||
{
|
||||
int count;
|
||||
|
||||
count = zfs_uio_hold_pages(start, len, nr_pages, rw, pages);
|
||||
|
||||
if (count != nr_pages) {
|
||||
if (count > 0)
|
||||
vm_page_unhold_pages(pages, count);
|
||||
return (count);
|
||||
}
|
||||
|
||||
ASSERT3S(count, ==, nr_pages);
|
||||
|
||||
return (count);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages)
|
||||
{
|
||||
unsigned long addr = (unsigned long)(v.iov_base);
|
||||
size_t len = v.iov_len;
|
||||
int n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
|
||||
int res = zfs_uio_get_user_pages(
|
||||
P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, len,
|
||||
zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]);
|
||||
|
||||
if (res != n)
|
||||
return (SET_ERROR(EFAULT));
|
||||
|
||||
ASSERT3U(len, ==, res * PAGE_SIZE);
|
||||
*numpages = res;
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
|
||||
{
|
||||
const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov;
|
||||
size_t len = zfs_uio_resid(uio);
|
||||
|
||||
for (int i = 0; i < zfs_uio_iovcnt(uio); i++) {
|
||||
struct iovec iov;
|
||||
int numpages = 0;
|
||||
|
||||
if (iovp->iov_len == 0) {
|
||||
iovp++;
|
||||
continue;
|
||||
}
|
||||
iov.iov_len = MIN(len, iovp->iov_len);
|
||||
iov.iov_base = iovp->iov_base;
|
||||
int error = zfs_uio_iov_step(iov, uio, &numpages);
|
||||
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
uio->uio_dio.npages += numpages;
|
||||
len -= iov.iov_len;
|
||||
iovp++;
|
||||
}
|
||||
|
||||
ASSERT0(len);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function holds user pages into the kernel. In the event that the user
|
||||
* pages are not successfully held an error value is returned.
|
||||
*
|
||||
* On success, 0 is returned.
|
||||
*/
|
||||
int
|
||||
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
|
||||
{
|
||||
int error = 0;
|
||||
int npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE);
|
||||
size_t size = npages * sizeof (vm_page_t);
|
||||
|
||||
ASSERT(zfs_uio_rw(uio) == rw);
|
||||
|
||||
uio->uio_dio.pages = kmem_alloc(size, KM_SLEEP);
|
||||
|
||||
error = zfs_uio_get_dio_pages_impl(uio);
|
||||
|
||||
if (error) {
|
||||
kmem_free(uio->uio_dio.pages, size);
|
||||
return (error);
|
||||
}
|
||||
|
||||
ASSERT3S(uio->uio_dio.npages, >, 0);
|
||||
|
||||
/*
|
||||
* Since we will be writing the user pages we must make sure that
|
||||
* they are stable. That way the contents of the pages can not change
|
||||
* while we are doing: compression, checksumming, encryption, parity
|
||||
* calculations or deduplication.
|
||||
*/
|
||||
if (zfs_uio_rw(uio) == UIO_WRITE)
|
||||
zfs_uio_set_pages_to_stable(uio);
|
||||
|
||||
uio->uio_extflg |= UIO_DIRECT;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#include <sys/zio.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/vm.h>
|
||||
|
||||
typedef struct abd_stats {
|
||||
kstat_named_t abdstat_struct_size;
|
||||
|
@ -135,7 +136,9 @@ abd_size_alloc_linear(size_t size)
|
|||
void
|
||||
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
|
||||
{
|
||||
uint_t n = abd_scatter_chunkcnt(abd);
|
||||
uint_t n;
|
||||
|
||||
n = abd_scatter_chunkcnt(abd);
|
||||
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
|
||||
int waste = (n << PAGE_SHIFT) - abd->abd_size;
|
||||
if (op == ABDSTAT_INCR) {
|
||||
|
@ -198,12 +201,18 @@ abd_free_chunks(abd_t *abd)
|
|||
{
|
||||
uint_t i, n;
|
||||
|
||||
/*
|
||||
* Scatter ABDs may be constructed by abd_alloc_from_pages() from
|
||||
* an array of pages. In which case they should not be freed.
|
||||
*/
|
||||
if (!abd_is_from_pages(abd)) {
|
||||
n = abd_scatter_chunkcnt(abd);
|
||||
for (i = 0; i < n; i++) {
|
||||
kmem_cache_free(abd_chunk_cache,
|
||||
ABD_SCATTER(abd).abd_chunks[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
abd_t *
|
||||
abd_alloc_struct_impl(size_t size)
|
||||
|
@ -342,11 +351,8 @@ abd_fini(void)
|
|||
void
|
||||
abd_free_linear_page(abd_t *abd)
|
||||
{
|
||||
/*
|
||||
* FreeBSD does not have scatter linear pages
|
||||
* so there is an error.
|
||||
*/
|
||||
VERIFY(0);
|
||||
ASSERT3P(abd->abd_u.abd_linear.sf, !=, NULL);
|
||||
zfs_unmap_page(abd->abd_u.abd_linear.sf);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -365,6 +371,26 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata)
|
|||
return (abd_alloc_linear(size, is_metadata));
|
||||
}
|
||||
|
||||
static abd_t *
|
||||
abd_get_offset_from_pages(abd_t *abd, abd_t *sabd, size_t chunkcnt,
|
||||
size_t new_offset)
|
||||
{
|
||||
ASSERT(abd_is_from_pages(sabd));
|
||||
|
||||
/*
|
||||
* Set the child child chunks to point at the parent chunks as
|
||||
* the chunks are just pages and we don't want to copy them.
|
||||
*/
|
||||
size_t parent_offset = new_offset / PAGE_SIZE;
|
||||
ASSERT3U(parent_offset, <, abd_scatter_chunkcnt(sabd));
|
||||
for (int i = 0; i < chunkcnt; i++)
|
||||
ABD_SCATTER(abd).abd_chunks[i] =
|
||||
ABD_SCATTER(sabd).abd_chunks[parent_offset + i];
|
||||
|
||||
abd->abd_flags |= ABD_FLAG_FROM_PAGES;
|
||||
return (abd);
|
||||
}
|
||||
|
||||
abd_t *
|
||||
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
|
||||
size_t size)
|
||||
|
@ -399,6 +425,11 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
|
|||
|
||||
ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK;
|
||||
|
||||
if (abd_is_from_pages(sabd)) {
|
||||
return (abd_get_offset_from_pages(abd, sabd, chunkcnt,
|
||||
new_offset));
|
||||
}
|
||||
|
||||
/* Copy the scatterlist starting at the correct offset */
|
||||
(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
|
||||
&ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT],
|
||||
|
@ -407,6 +438,44 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
|
|||
return (abd);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a scatter ABD structure from user pages.
|
||||
*/
|
||||
abd_t *
|
||||
abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size)
|
||||
{
|
||||
VERIFY3U(size, <=, DMU_MAX_ACCESS);
|
||||
ASSERT3U(offset, <, PAGE_SIZE);
|
||||
ASSERT3P(pages, !=, NULL);
|
||||
|
||||
abd_t *abd = abd_alloc_struct(size);
|
||||
abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES;
|
||||
abd->abd_size = size;
|
||||
|
||||
if ((offset + size) <= PAGE_SIZE) {
|
||||
/*
|
||||
* There is only a single page worth of data, so we will just
|
||||
* use a linear ABD. We have to make sure to take into account
|
||||
* the offset though. In all other cases our offset will be 0
|
||||
* as we are always PAGE_SIZE aligned.
|
||||
*/
|
||||
abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
|
||||
ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0],
|
||||
&abd->abd_u.abd_linear.sf) + offset;
|
||||
} else {
|
||||
ABD_SCATTER(abd).abd_offset = offset;
|
||||
ASSERT0(ABD_SCATTER(abd).abd_offset);
|
||||
|
||||
/*
|
||||
* Setting the ABD's abd_chunks to point to the user pages.
|
||||
*/
|
||||
for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++)
|
||||
ABD_SCATTER(abd).abd_chunks[i] = pages[i];
|
||||
}
|
||||
|
||||
return (abd);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the abd_iter.
|
||||
*/
|
||||
|
@ -468,6 +537,16 @@ abd_iter_map(struct abd_iter *aiter)
|
|||
if (abd_is_linear(abd)) {
|
||||
aiter->iter_mapsize = abd->abd_size - offset;
|
||||
paddr = ABD_LINEAR_BUF(abd);
|
||||
} else if (abd_is_from_pages(abd)) {
|
||||
aiter->sf = NULL;
|
||||
offset += ABD_SCATTER(abd).abd_offset;
|
||||
size_t index = offset / PAGE_SIZE;
|
||||
offset &= PAGE_MASK;
|
||||
aiter->iter_mapsize = MIN(PAGE_SIZE - offset,
|
||||
abd->abd_size - aiter->iter_pos);
|
||||
paddr = zfs_map_page(
|
||||
ABD_SCATTER(aiter->iter_abd).abd_chunks[index],
|
||||
&aiter->sf);
|
||||
} else {
|
||||
offset += ABD_SCATTER(abd).abd_offset;
|
||||
paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT];
|
||||
|
@ -490,6 +569,12 @@ abd_iter_unmap(struct abd_iter *aiter)
|
|||
ASSERT3U(aiter->iter_mapsize, >, 0);
|
||||
}
|
||||
|
||||
if (abd_is_from_pages(aiter->iter_abd) &&
|
||||
!abd_is_linear_page(aiter->iter_abd)) {
|
||||
ASSERT3P(aiter->sf, !=, NULL);
|
||||
zfs_unmap_page(aiter->sf);
|
||||
}
|
||||
|
||||
aiter->iter_mapaddr = NULL;
|
||||
aiter->iter_mapsize = 0;
|
||||
}
|
||||
|
@ -499,3 +584,67 @@ abd_cache_reap_now(void)
|
|||
{
|
||||
kmem_cache_reap_soon(abd_chunk_cache);
|
||||
}
|
||||
|
||||
/*
|
||||
* Borrow a raw buffer from an ABD without copying the contents of the ABD
|
||||
* into the buffer. If the ABD is scattered, this will alloate a raw buffer
|
||||
* whose contents are undefined. To copy over the existing data in the ABD, use
|
||||
* abd_borrow_buf_copy() instead.
|
||||
*/
|
||||
void *
|
||||
abd_borrow_buf(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf;
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, 0);
|
||||
if (abd_is_linear(abd)) {
|
||||
buf = abd_to_buf(abd);
|
||||
} else {
|
||||
buf = zio_buf_alloc(n);
|
||||
}
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
return (buf);
|
||||
}
|
||||
|
||||
void *
|
||||
abd_borrow_buf_copy(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf = abd_borrow_buf(abd, n);
|
||||
if (!abd_is_linear(abd)) {
|
||||
abd_copy_to_buf(buf, abd, n);
|
||||
}
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
|
||||
* no change the contents of the ABD and will ASSERT that you didn't modify
|
||||
* the buffer since it was borrowed. If you want any changes you made to buf to
|
||||
* be copied back to abd, use abd_return_buf_copy() instead.
|
||||
*/
|
||||
void
|
||||
abd_return_buf(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, n);
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
if (abd_is_linear(abd)) {
|
||||
ASSERT3P(buf, ==, abd_to_buf(abd));
|
||||
} else {
|
||||
ASSERT0(abd_cmp_buf(abd, buf, n));
|
||||
zio_buf_free(buf, n);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
if (!abd_is_linear(abd)) {
|
||||
abd_copy_from_buf(abd, buf, n);
|
||||
}
|
||||
abd_return_buf(abd, buf, n);
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
#include <sys/racct.h>
|
||||
|
||||
void
|
||||
zfs_racct_read(uint64_t size, uint64_t iops)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
curthread->td_ru.ru_inblock += iops;
|
||||
#ifdef RACCT
|
||||
|
@ -40,10 +40,12 @@ zfs_racct_read(uint64_t size, uint64_t iops)
|
|||
#else
|
||||
(void) size;
|
||||
#endif /* RACCT */
|
||||
|
||||
spa_iostats_read_add(spa, size, iops, flags);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(uint64_t size, uint64_t iops)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
curthread->td_ru.ru_oublock += iops;
|
||||
#ifdef RACCT
|
||||
|
@ -56,4 +58,6 @@ zfs_racct_write(uint64_t size, uint64_t iops)
|
|||
#else
|
||||
(void) size;
|
||||
#endif /* RACCT */
|
||||
|
||||
spa_iostats_write_add(spa, size, iops, flags);
|
||||
}
|
||||
|
|
|
@ -4131,7 +4131,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
|
|||
* but that would make the locking messier
|
||||
*/
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
|
||||
len, commit, NULL, NULL);
|
||||
len, commit, B_FALSE, NULL, NULL);
|
||||
|
||||
zfs_vmobject_wlock(object);
|
||||
for (i = 0; i < ncount; i++) {
|
||||
|
@ -4266,6 +4266,8 @@ ioflags(int ioflags)
|
|||
flags |= O_APPEND;
|
||||
if (ioflags & IO_NDELAY)
|
||||
flags |= O_NONBLOCK;
|
||||
if (ioflags & IO_DIRECT)
|
||||
flags |= O_DIRECT;
|
||||
if (ioflags & IO_SYNC)
|
||||
flags |= O_SYNC;
|
||||
|
||||
|
@ -4285,9 +4287,36 @@ static int
|
|||
zfs_freebsd_read(struct vop_read_args *ap)
|
||||
{
|
||||
zfs_uio_t uio;
|
||||
int error = 0;
|
||||
zfs_uio_init(&uio, ap->a_uio);
|
||||
return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
|
||||
ap->a_cred));
|
||||
error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
|
||||
ap->a_cred);
|
||||
/*
|
||||
* XXX We occasionally get an EFAULT for Direct I/O reads on
|
||||
* FreeBSD 13. This still needs to be resolved. The EFAULT comes
|
||||
* from:
|
||||
* zfs_uio_get_dio_pages_alloc() ->
|
||||
* zfs_uio_get_dio_pages_impl() ->
|
||||
* zfs_uio_iov_step() ->
|
||||
* zfs_uio_get_user_pages().
|
||||
* We return EFAULT from zfs_uio_iov_step(). When a Direct I/O
|
||||
* read fails to map in the user pages (returning EFAULT) the
|
||||
* Direct I/O request is broken up into two separate IO requests
|
||||
* and issued separately using Direct I/O.
|
||||
*/
|
||||
#ifdef ZFS_DEBUG
|
||||
if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) {
|
||||
#if 0
|
||||
printf("%s(%d): Direct I/O read returning EFAULT "
|
||||
"uio = %p, zfs_uio_offset(uio) = %lu "
|
||||
"zfs_uio_resid(uio) = %lu\n",
|
||||
__FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio),
|
||||
zfs_uio_resid(&uio));
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
return (error);
|
||||
}
|
||||
|
||||
#ifndef _SYS_SYSPROTO_H_
|
||||
|
|
|
@ -922,6 +922,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
|
|||
if (commit)
|
||||
zil_commit(zv->zv_zilog, ZVOL_OBJ);
|
||||
rw_exit(&zv->zv_suspend_lock);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
|
|
|
@ -186,6 +186,7 @@ static int zfs_abd_scatter_min_size = 512 * 3;
|
|||
abd_t *abd_zero_scatter = NULL;
|
||||
|
||||
struct page;
|
||||
|
||||
/*
|
||||
* abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
|
||||
* point to ZERO_PAGE if it is available or it will be an allocated zero'd
|
||||
|
@ -453,6 +454,11 @@ abd_free_chunks(abd_t *abd)
|
|||
if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
|
||||
ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
|
||||
|
||||
/*
|
||||
* Scatter ABDs may be constructed by abd_alloc_from_pages() from
|
||||
* an array of pages. In which case they should not be freed.
|
||||
*/
|
||||
if (!abd_is_from_pages(abd)) {
|
||||
abd_for_each_sg(abd, sg, nr_pages, i) {
|
||||
page = sg_page(sg);
|
||||
abd_unmark_zfs_page(page);
|
||||
|
@ -461,6 +467,8 @@ abd_free_chunks(abd_t *abd)
|
|||
ASSERT3U(sg->length, <=, PAGE_SIZE << order);
|
||||
ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
|
||||
}
|
||||
}
|
||||
|
||||
abd_free_sg_table(abd);
|
||||
}
|
||||
|
||||
|
@ -551,17 +559,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
|
|||
void
|
||||
abd_verify_scatter(abd_t *abd)
|
||||
{
|
||||
size_t n;
|
||||
int i = 0;
|
||||
struct scatterlist *sg = NULL;
|
||||
|
||||
ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
|
||||
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
|
||||
ABD_SCATTER(abd).abd_sgl->length);
|
||||
n = ABD_SCATTER(abd).abd_nents;
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
struct scatterlist *sg = NULL;
|
||||
size_t n = ABD_SCATTER(abd).abd_nents;
|
||||
int i = 0;
|
||||
|
||||
abd_for_each_sg(abd, sg, n, i) {
|
||||
ASSERT3P(sg_page(sg), !=, NULL);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -687,14 +697,77 @@ abd_free_linear_page(abd_t *abd)
|
|||
{
|
||||
/* Transform it back into a scatter ABD for freeing */
|
||||
struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
|
||||
|
||||
/* When backed by user page unmap it */
|
||||
if (abd_is_from_pages(abd))
|
||||
zfs_kunmap(sg_page(sg));
|
||||
|
||||
abd->abd_flags &= ~ABD_FLAG_LINEAR;
|
||||
abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
|
||||
ABD_SCATTER(abd).abd_nents = 1;
|
||||
ABD_SCATTER(abd).abd_offset = 0;
|
||||
ABD_SCATTER(abd).abd_sgl = sg;
|
||||
abd_free_chunks(abd);
|
||||
}
|
||||
|
||||
abd_update_scatter_stats(abd, ABDSTAT_DECR);
|
||||
/*
|
||||
* Allocate a scatter ABD structure from user pages. The pages must be
|
||||
* pinned with get_user_pages, or similiar, but need not be mapped via
|
||||
* the kmap interfaces.
|
||||
*/
|
||||
abd_t *
|
||||
abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
|
||||
{
|
||||
uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
|
||||
struct sg_table table;
|
||||
|
||||
VERIFY3U(size, <=, DMU_MAX_ACCESS);
|
||||
ASSERT3U(offset, <, PAGE_SIZE);
|
||||
ASSERT3P(pages, !=, NULL);
|
||||
|
||||
/*
|
||||
* Even if this buf is filesystem metadata, we only track that we
|
||||
* own the underlying data buffer, which is not true in this case.
|
||||
* Therefore, we don't ever use ABD_FLAG_META here.
|
||||
*/
|
||||
abd_t *abd = abd_alloc_struct(0);
|
||||
abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
|
||||
abd->abd_size = size;
|
||||
|
||||
while (sg_alloc_table_from_pages(&table, pages, npages, offset,
|
||||
size, __GFP_NOWARN | GFP_NOIO) != 0) {
|
||||
ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
|
||||
schedule_timeout_interruptible(1);
|
||||
}
|
||||
|
||||
if ((offset + size) <= PAGE_SIZE) {
|
||||
/*
|
||||
* Since there is only one entry, this ABD can be represented
|
||||
* as a linear buffer. All single-page (4K) ABD's constructed
|
||||
* from a user page can be represented this way as long as the
|
||||
* page is mapped to a virtual address. This allows us to
|
||||
* apply an offset in to the mapped page.
|
||||
*
|
||||
* Note that kmap() must be used, not kmap_atomic(), because
|
||||
* the mapping needs to bet set up on all CPUs. Using kmap()
|
||||
* also enables the user of highmem pages when required.
|
||||
*/
|
||||
abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
|
||||
abd->abd_u.abd_linear.abd_sgl = table.sgl;
|
||||
zfs_kmap(sg_page(table.sgl));
|
||||
ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
|
||||
} else {
|
||||
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
|
||||
abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
|
||||
|
||||
ABD_SCATTER(abd).abd_offset = offset;
|
||||
ABD_SCATTER(abd).abd_sgl = table.sgl;
|
||||
ABD_SCATTER(abd).abd_nents = table.nents;
|
||||
|
||||
ASSERT0(ABD_SCATTER(abd).abd_offset);
|
||||
}
|
||||
|
||||
return (abd);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -746,6 +819,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
|
|||
ABD_SCATTER(abd).abd_offset = new_offset;
|
||||
ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
|
||||
|
||||
if (abd_is_from_pages(sabd))
|
||||
abd->abd_flags |= ABD_FLAG_FROM_PAGES;
|
||||
|
||||
return (abd);
|
||||
}
|
||||
|
||||
|
@ -873,6 +949,115 @@ abd_cache_reap_now(void)
|
|||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* Borrow a raw buffer from an ABD without copying the contents of the ABD
|
||||
* into the buffer. If the ABD is scattered, this will allocate a raw buffer
|
||||
* whose contents are undefined. To copy over the existing data in the ABD, use
|
||||
* abd_borrow_buf_copy() instead.
|
||||
*/
|
||||
void *
|
||||
abd_borrow_buf(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf;
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, 0);
|
||||
/*
|
||||
* In the event the ABD is composed of a single user page from Direct
|
||||
* I/O we can not direclty return the raw buffer. This is a consequence
|
||||
* of not being able to write protect the page and the contents of the
|
||||
* page can be changed at any time by the user.
|
||||
*/
|
||||
if (abd_is_from_pages(abd)) {
|
||||
buf = zio_buf_alloc(n);
|
||||
} else if (abd_is_linear(abd)) {
|
||||
buf = abd_to_buf(abd);
|
||||
} else {
|
||||
buf = zio_buf_alloc(n);
|
||||
}
|
||||
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
return (buf);
|
||||
}
|
||||
|
||||
void *
|
||||
abd_borrow_buf_copy(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf = abd_borrow_buf(abd, n);
|
||||
|
||||
/*
|
||||
* In the event the ABD is composed of a single user page from Direct
|
||||
* I/O we must make sure copy the data over into the newly allocated
|
||||
* buffer. This is a consequence of the fact that we can not write
|
||||
* protect the user page and there is a risk the contents of the page
|
||||
* could be changed by the user at any moment.
|
||||
*/
|
||||
if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
|
||||
abd_copy_to_buf(buf, abd, n);
|
||||
}
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a borrowed raw buffer to an ABD. If the ABD is scatterd, this will
|
||||
* not change the contents of the ABD. If you want any changes you made to
|
||||
* buf to be copied back to abd, use abd_return_buf_copy() instead. If the
|
||||
* ABD is not constructed from user pages for Direct I/O then an ASSERT
|
||||
* checks to make sure the contents of buffer have not changed since it was
|
||||
* borrowed. We can not ASSERT that the contents of the buffer have not changed
|
||||
* if it is composed of user pages because the pages can not be placed under
|
||||
* write protection and the user could have possibly changed the contents in
|
||||
* the pages at any time.
|
||||
*/
|
||||
void
|
||||
abd_return_buf(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, n);
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
if (abd_is_from_pages(abd)) {
|
||||
zio_buf_free(buf, n);
|
||||
} else if (abd_is_linear(abd)) {
|
||||
ASSERT3P(buf, ==, abd_to_buf(abd));
|
||||
} else if (abd_is_gang(abd)) {
|
||||
#ifdef ZFS_DEBUG
|
||||
/*
|
||||
* We have to be careful with gang ABD's that we do not ASSERT0
|
||||
* for any ABD's that contain user pages from Direct I/O. In
|
||||
* order to handle this, we just iterate through the gang ABD
|
||||
* and only verify ABDs that are not from user pages.
|
||||
*/
|
||||
void *cmp_buf = buf;
|
||||
|
||||
for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
|
||||
cabd != NULL;
|
||||
cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
|
||||
if (!abd_is_from_pages(cabd)) {
|
||||
ASSERT0(abd_cmp_buf(cabd, cmp_buf,
|
||||
cabd->abd_size));
|
||||
}
|
||||
cmp_buf = (char *)cmp_buf + cabd->abd_size;
|
||||
}
|
||||
#endif
|
||||
zio_buf_free(buf, n);
|
||||
} else {
|
||||
ASSERT0(abd_cmp_buf(abd, buf, n));
|
||||
zio_buf_free(buf, n);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
|
||||
abd_copy_from_buf(abd, buf, n);
|
||||
}
|
||||
abd_return_buf(abd, buf, n);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
|
||||
* It yields the next page struct and data offset and size within it, without
|
||||
|
|
|
@ -25,14 +25,35 @@
|
|||
|
||||
#include <sys/zfs_racct.h>
|
||||
|
||||
#ifdef _KERNEL
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
|
||||
void
|
||||
zfs_racct_read(uint64_t size, uint64_t iops)
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
(void) size, (void) iops;
|
||||
task_io_account_read(size);
|
||||
spa_iostats_read_add(spa, size, iops, flags);
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(uint64_t size, uint64_t iops)
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
(void) size, (void) iops;
|
||||
task_io_account_write(size);
|
||||
spa_iostats_write_add(spa, size, iops, flags);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void
|
||||
zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
(void) spa, (void) size, (void) iops, (void) flags;
|
||||
}
|
||||
|
||||
void
|
||||
zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
(void) spa, (void) size, (void) iops, (void) flags;
|
||||
}
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
|
|
@ -41,12 +41,19 @@
|
|||
|
||||
#ifdef _KERNEL
|
||||
|
||||
#include <sys/errno.h>
|
||||
#include <sys/vmem.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/uio_impl.h>
|
||||
#include <sys/sysmacros.h>
|
||||
#include <sys/string.h>
|
||||
#include <sys/zfs_refcount.h>
|
||||
#include <sys/zfs_debug.h>
|
||||
#include <linux/kmap_compat.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mman.h>
|
||||
|
||||
/*
|
||||
* Move "n" bytes at byte address "p"; "rw" indicates the direction
|
||||
|
@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove);
|
|||
int
|
||||
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
|
||||
{
|
||||
if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
|
||||
/* There's never a need to fault in kernel pages */
|
||||
if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
|
||||
(uio->uio_extflg & UIO_DIRECT)) {
|
||||
/*
|
||||
* There's never a need to fault in kernel pages or Direct I/O
|
||||
* write pages. Direct I/O write pages have been pinned in so
|
||||
* there is never a time for these pages a fault will occur.
|
||||
*/
|
||||
return (0);
|
||||
#if defined(HAVE_VFS_IOV_ITER)
|
||||
} else if (uio->uio_segflg == UIO_ITER) {
|
||||
|
@ -437,9 +449,288 @@ zfs_uioskip(zfs_uio_t *uio, size_t n)
|
|||
uio->uio_iovcnt--;
|
||||
}
|
||||
}
|
||||
|
||||
uio->uio_loffset += n;
|
||||
uio->uio_resid -= n;
|
||||
}
|
||||
EXPORT_SYMBOL(zfs_uioskip);
|
||||
|
||||
/*
|
||||
* Check if the uio is page-aligned in memory.
|
||||
*/
|
||||
boolean_t
|
||||
zfs_uio_page_aligned(zfs_uio_t *uio)
|
||||
{
|
||||
boolean_t aligned = B_TRUE;
|
||||
|
||||
if (uio->uio_segflg == UIO_USERSPACE ||
|
||||
uio->uio_segflg == UIO_SYSSPACE) {
|
||||
const struct iovec *iov = uio->uio_iov;
|
||||
size_t skip = uio->uio_skip;
|
||||
|
||||
for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
|
||||
uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
|
||||
size_t size = iov->iov_len - skip;
|
||||
if ((addr & (PAGE_SIZE - 1)) ||
|
||||
(size & (PAGE_SIZE - 1))) {
|
||||
aligned = B_FALSE;
|
||||
break;
|
||||
}
|
||||
skip = 0;
|
||||
}
|
||||
#if defined(HAVE_VFS_IOV_ITER)
|
||||
} else if (uio->uio_segflg == UIO_ITER) {
|
||||
unsigned long alignment =
|
||||
iov_iter_alignment(uio->uio_iter);
|
||||
aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
|
||||
#endif
|
||||
} else {
|
||||
/* Currently not supported */
|
||||
aligned = B_FALSE;
|
||||
}
|
||||
|
||||
return (aligned);
|
||||
}
|
||||
|
||||
|
||||
#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
|
||||
#define ZFS_MARKEED_PAGE 0x0
|
||||
#define IS_ZFS_MARKED_PAGE(_p) 0
|
||||
#define zfs_mark_page(_p)
|
||||
#define zfs_unmark_page(_p)
|
||||
#define IS_ZERO_PAGE(_p) 0
|
||||
|
||||
#else
|
||||
/*
|
||||
* Mark pages to know if they were allocated to replace ZERO_PAGE() for
|
||||
* Direct I/O writes.
|
||||
*/
|
||||
#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */
|
||||
#define IS_ZFS_MARKED_PAGE(_p) \
|
||||
(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
|
||||
#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))
|
||||
|
||||
static inline void
|
||||
zfs_mark_page(struct page *page)
|
||||
{
|
||||
ASSERT3P(page, !=, NULL);
|
||||
get_page(page);
|
||||
SetPagePrivate(page);
|
||||
set_page_private(page, ZFS_MARKED_PAGE);
|
||||
}
|
||||
|
||||
static inline void
|
||||
zfs_unmark_page(struct page *page)
|
||||
{
|
||||
ASSERT3P(page, !=, NULL);
|
||||
set_page_private(page, 0UL);
|
||||
ClearPagePrivate(page);
|
||||
put_page(page);
|
||||
}
|
||||
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
|
||||
|
||||
static void
|
||||
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
|
||||
{
|
||||
ASSERT3P(uio->uio_dio.pages, !=, NULL);
|
||||
|
||||
for (long i = 0; i < uio->uio_dio.npages; i++) {
|
||||
struct page *p = uio->uio_dio.pages[i];
|
||||
lock_page(p);
|
||||
|
||||
if (IS_ZERO_PAGE(p)) {
|
||||
/*
|
||||
* If the user page points the kernels ZERO_PAGE() a
|
||||
* new zero filled page will just be allocated so the
|
||||
* contents of the page can not be changed by the user
|
||||
* while a Direct I/O write is taking place.
|
||||
*/
|
||||
gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
|
||||
__GFP_ZERO | GFP_KERNEL;
|
||||
|
||||
ASSERT0(IS_ZFS_MARKED_PAGE(p));
|
||||
unlock_page(p);
|
||||
put_page(p);
|
||||
|
||||
p = __page_cache_alloc(gfp_zero_page);
|
||||
zfs_mark_page(p);
|
||||
} else {
|
||||
unlock_page(p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
|
||||
{
|
||||
|
||||
ASSERT(uio->uio_extflg & UIO_DIRECT);
|
||||
ASSERT3P(uio->uio_dio.pages, !=, NULL);
|
||||
|
||||
for (long i = 0; i < uio->uio_dio.npages; i++) {
|
||||
struct page *p = uio->uio_dio.pages[i];
|
||||
|
||||
if (IS_ZFS_MARKED_PAGE(p)) {
|
||||
zfs_unmark_page(p);
|
||||
__free_page(p);
|
||||
continue;
|
||||
}
|
||||
|
||||
put_page(p);
|
||||
}
|
||||
|
||||
vmem_free(uio->uio_dio.pages,
|
||||
uio->uio_dio.npages * sizeof (struct page *));
|
||||
}
|
||||
|
||||
/*
|
||||
* zfs_uio_iov_step() is just a modified version of the STEP function of Linux's
|
||||
* iov_iter_get_pages().
|
||||
*/
|
||||
static int
|
||||
zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
|
||||
long *numpages)
|
||||
{
|
||||
unsigned long addr = (unsigned long)(v.iov_base);
|
||||
size_t len = v.iov_len;
|
||||
unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
|
||||
long res = zfs_get_user_pages(
|
||||
P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n, rw == UIO_READ,
|
||||
&uio->uio_dio.pages[uio->uio_dio.npages]);
|
||||
if (res < 0) {
|
||||
return (-res);
|
||||
} else if (len != (res * PAGE_SIZE)) {
|
||||
return (EFAULT);
|
||||
}
|
||||
|
||||
ASSERT3S(len, ==, res * PAGE_SIZE);
|
||||
*numpages = res;
|
||||
return (0);
|
||||
}
|
||||
|
||||
static int
|
||||
zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
|
||||
{
|
||||
const struct iovec *iovp = uio->uio_iov;
|
||||
size_t skip = uio->uio_skip;
|
||||
size_t len = uio->uio_resid - skip;
|
||||
|
||||
ASSERT(uio->uio_segflg != UIO_SYSSPACE);
|
||||
|
||||
for (int i = 0; i < uio->uio_iovcnt; i++) {
|
||||
struct iovec iov;
|
||||
long numpages = 0;
|
||||
|
||||
if (iovp->iov_len == 0) {
|
||||
iovp++;
|
||||
skip = 0;
|
||||
continue;
|
||||
}
|
||||
iov.iov_len = MIN(len, iovp->iov_len - skip);
|
||||
iov.iov_base = iovp->iov_base + skip;
|
||||
int error = zfs_uio_iov_step(iov, rw, uio, &numpages);
|
||||
|
||||
if (error)
|
||||
return (SET_ERROR(error));
|
||||
|
||||
uio->uio_dio.npages += numpages;
|
||||
len -= iov.iov_len;
|
||||
skip = 0;
|
||||
iovp++;
|
||||
}
|
||||
|
||||
ASSERT0(len);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
#if defined(HAVE_VFS_IOV_ITER)
|
||||
static int
|
||||
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
|
||||
{
|
||||
size_t skip = uio->uio_skip;
|
||||
size_t wanted = uio->uio_resid - uio->uio_skip;
|
||||
ssize_t rollback = 0;
|
||||
ssize_t cnt;
|
||||
unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);
|
||||
|
||||
while (wanted) {
|
||||
#if defined(HAVE_IOV_ITER_GET_PAGES2)
|
||||
cnt = iov_iter_get_pages2(uio->uio_iter,
|
||||
&uio->uio_dio.pages[uio->uio_dio.npages],
|
||||
wanted, maxpages, &skip);
|
||||
#else
|
||||
cnt = iov_iter_get_pages(uio->uio_iter,
|
||||
&uio->uio_dio.pages[uio->uio_dio.npages],
|
||||
wanted, maxpages, &skip);
|
||||
#endif
|
||||
if (cnt < 0) {
|
||||
iov_iter_revert(uio->uio_iter, rollback);
|
||||
return (SET_ERROR(-cnt));
|
||||
}
|
||||
uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
|
||||
rollback += cnt;
|
||||
wanted -= cnt;
|
||||
skip = 0;
|
||||
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
|
||||
/*
|
||||
* iov_iter_get_pages2() advances the iov_iter on success.
|
||||
*/
|
||||
iov_iter_advance(uio->uio_iter, cnt);
|
||||
#endif
|
||||
|
||||
}
|
||||
ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
|
||||
iov_iter_revert(uio->uio_iter, rollback);
|
||||
|
||||
return (0);
|
||||
}
|
||||
#endif /* HAVE_VFS_IOV_ITER */
|
||||
|
||||
/*
|
||||
* This function pins user pages. In the event that the user pages were not
|
||||
* successfully pinned an error value is returned.
|
||||
*
|
||||
* On success, 0 is returned.
|
||||
*/
|
||||
int
|
||||
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
|
||||
{
|
||||
int error = 0;
|
||||
long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
|
||||
size_t size = npages * sizeof (struct page *);
|
||||
|
||||
if (uio->uio_segflg == UIO_USERSPACE) {
|
||||
uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
|
||||
error = zfs_uio_get_dio_pages_iov(uio, rw);
|
||||
#if defined(HAVE_VFS_IOV_ITER)
|
||||
} else if (uio->uio_segflg == UIO_ITER) {
|
||||
uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
|
||||
error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
|
||||
#endif
|
||||
} else {
|
||||
return (SET_ERROR(EOPNOTSUPP));
|
||||
}
|
||||
|
||||
ASSERT3S(uio->uio_dio.npages, >=, 0);
|
||||
|
||||
if (error) {
|
||||
for (long i = 0; i < uio->uio_dio.npages; i++)
|
||||
put_page(uio->uio_dio.pages[i]);
|
||||
vmem_free(uio->uio_dio.pages, size);
|
||||
return (error);
|
||||
} else {
|
||||
ASSERT3S(uio->uio_dio.npages, ==, npages);
|
||||
}
|
||||
|
||||
if (rw == UIO_WRITE) {
|
||||
zfs_uio_dio_check_for_zero_page(uio);
|
||||
}
|
||||
|
||||
uio->uio_extflg |= UIO_DIRECT;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
|
|
@ -59,6 +59,7 @@
|
|||
#include <sys/objlist.h>
|
||||
#include <sys/zpl.h>
|
||||
#include <linux/vfs_compat.h>
|
||||
#include <linux/fs.h>
|
||||
#include "zfs_comutil.h"
|
||||
|
||||
enum {
|
||||
|
|
|
@ -296,6 +296,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
|
|||
|
||||
struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
||||
if (pp) {
|
||||
|
||||
/*
|
||||
* If filemap_fault() retries there exists a window
|
||||
* where the page will be unlocked and not up to date.
|
||||
|
@ -3866,7 +3867,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
|||
}
|
||||
|
||||
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
|
||||
for_sync ? zfs_putpage_sync_commit_cb :
|
||||
B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
|
||||
zfs_putpage_async_commit_cb, pp);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
@ -4009,6 +4010,7 @@ zfs_inactive(struct inode *ip)
|
|||
static int
|
||||
zfs_fillpage(struct inode *ip, struct page *pp)
|
||||
{
|
||||
znode_t *zp = ITOZ(ip);
|
||||
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
||||
loff_t i_size = i_size_read(ip);
|
||||
u_offset_t io_off = page_offset(pp);
|
||||
|
@ -4020,7 +4022,7 @@ zfs_fillpage(struct inode *ip, struct page *pp)
|
|||
io_len = i_size - io_off;
|
||||
|
||||
void *va = kmap(pp);
|
||||
int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off,
|
||||
int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
|
||||
io_len, va, DMU_READ_PREFETCH);
|
||||
if (io_len != PAGE_SIZE)
|
||||
memset((char *)va + io_len, 0, PAGE_SIZE - io_len);
|
||||
|
@ -4058,11 +4060,49 @@ zfs_getpage(struct inode *ip, struct page *pp)
|
|||
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
||||
znode_t *zp = ITOZ(ip);
|
||||
int error;
|
||||
loff_t i_size = i_size_read(ip);
|
||||
u_offset_t io_off = page_offset(pp);
|
||||
size_t io_len = PAGE_SIZE;
|
||||
|
||||
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
|
||||
return (error);
|
||||
|
||||
ASSERT3U(io_off, <, i_size);
|
||||
|
||||
if (io_off + io_len > i_size)
|
||||
io_len = i_size - io_off;
|
||||
|
||||
/*
|
||||
* It is important to hold the rangelock here because it is possible
|
||||
* a Direct I/O write or block clone might be taking place at the same
|
||||
* time that a page is being faulted in through filemap_fault(). With
|
||||
* Direct I/O writes and block cloning db->db_data will be set to NULL
|
||||
* with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
|
||||
* rangelock is not held, then there is a race between faulting in a
|
||||
* page and writing out a Direct I/O write or block cloning. Without
|
||||
* the rangelock a NULL pointer dereference can occur in
|
||||
* dmu_read_impl() for db->db_data during the memcpy operation when
|
||||
* zfs_fillpage() calls dmu_read().
|
||||
*/
|
||||
zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
|
||||
io_off, io_len, RL_READER);
|
||||
if (lr == NULL) {
|
||||
/*
|
||||
* It is important to drop the page lock before grabbing the
|
||||
* rangelock to avoid another deadlock between here and
|
||||
* zfs_write() -> update_pages(). update_pages() holds both the
|
||||
* rangelock and the page lock.
|
||||
*/
|
||||
get_page(pp);
|
||||
unlock_page(pp);
|
||||
lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
|
||||
io_len, RL_READER);
|
||||
lock_page(pp);
|
||||
put_page(pp);
|
||||
}
|
||||
error = zfs_fillpage(ip, pp);
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
if (error == 0)
|
||||
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
|
||||
|
||||
|
|
|
@ -322,14 +322,14 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
|
|||
crhold(cr);
|
||||
cookie = spl_fstrans_mark();
|
||||
|
||||
int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
|
||||
ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
|
||||
filp->f_flags | zfs_io_flags(kiocb), cr);
|
||||
|
||||
spl_fstrans_unmark(cookie);
|
||||
crfree(cr);
|
||||
|
||||
if (error < 0)
|
||||
return (error);
|
||||
if (ret < 0)
|
||||
return (ret);
|
||||
|
||||
ssize_t read = count - uio.uio_resid;
|
||||
kiocb->ki_pos += read;
|
||||
|
@ -384,14 +384,14 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
|
|||
crhold(cr);
|
||||
cookie = spl_fstrans_mark();
|
||||
|
||||
int error = -zfs_write(ITOZ(ip), &uio,
|
||||
ret = -zfs_write(ITOZ(ip), &uio,
|
||||
filp->f_flags | zfs_io_flags(kiocb), cr);
|
||||
|
||||
spl_fstrans_unmark(cookie);
|
||||
crfree(cr);
|
||||
|
||||
if (error < 0)
|
||||
return (error);
|
||||
if (ret < 0)
|
||||
return (ret);
|
||||
|
||||
ssize_t wrote = count - uio.uio_resid;
|
||||
kiocb->ki_pos += wrote;
|
||||
|
@ -422,14 +422,14 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
|
|||
crhold(cr);
|
||||
cookie = spl_fstrans_mark();
|
||||
|
||||
int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
|
||||
filp->f_flags | zfs_io_flags(kiocb), cr);
|
||||
ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
|
||||
filp->f_flags | zfs_io_flags(kiocb), cr);
|
||||
|
||||
spl_fstrans_unmark(cookie);
|
||||
crfree(cr);
|
||||
|
||||
if (error < 0)
|
||||
return (error);
|
||||
if (ret < 0)
|
||||
return (ret);
|
||||
|
||||
ssize_t read = count - uio.uio_resid;
|
||||
kiocb->ki_pos += read;
|
||||
|
@ -454,7 +454,7 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
|
|||
if (ret)
|
||||
return (ret);
|
||||
|
||||
ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode));
|
||||
ret = geeric_write_checks(filep, &pos, &count, S_ISBLK(ip->i_mode));
|
||||
if (ret)
|
||||
return (ret);
|
||||
|
||||
|
@ -467,53 +467,57 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
|
|||
crhold(cr);
|
||||
cookie = spl_fstrans_mark();
|
||||
|
||||
int error = -zfs_write(ITOZ(ip), &uio,
|
||||
ret = -zfs_write(ITOZ(ip), &uio,
|
||||
filp->f_flags | zfs_io_flags(kiocb), cr);
|
||||
|
||||
spl_fstrans_unmark(cookie);
|
||||
crfree(cr);
|
||||
|
||||
if (error < 0)
|
||||
return (error);
|
||||
if (ret < 0)
|
||||
return (ret);
|
||||
|
||||
ssize_t wrote = count - uio.uio_resid;
|
||||
kiocb->ki_pos += wrote;
|
||||
|
||||
return (wrote);
|
||||
}
|
||||
|
||||
#endif /* HAVE_VFS_RW_ITERATE */
|
||||
|
||||
#if defined(HAVE_VFS_RW_ITERATE)
|
||||
static ssize_t
|
||||
zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
|
||||
zpl_direct_IO_impl(void)
|
||||
{
|
||||
if (rw == WRITE)
|
||||
return (zpl_iter_write(kiocb, iter));
|
||||
else
|
||||
return (zpl_iter_read(kiocb, iter));
|
||||
/*
|
||||
* All O_DIRECT requests should be handled by
|
||||
* zpl_{iter/aio}_{write/read}(). There is no way kernel generic code
|
||||
* should call the direct_IO address_space_operations function. We set
|
||||
* this code path to be fatal if it is executed.
|
||||
*/
|
||||
PANIC(0);
|
||||
return (0);
|
||||
}
|
||||
|
||||
#if defined(HAVE_VFS_RW_ITERATE)
|
||||
#if defined(HAVE_VFS_DIRECT_IO_ITER)
|
||||
static ssize_t
|
||||
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
|
||||
{
|
||||
return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
|
||||
return (zpl_direct_IO_impl());
|
||||
}
|
||||
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
|
||||
static ssize_t
|
||||
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
|
||||
{
|
||||
ASSERT3S(pos, ==, kiocb->ki_pos);
|
||||
return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
|
||||
return (zpl_direct_IO_impl());
|
||||
}
|
||||
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
|
||||
static ssize_t
|
||||
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
|
||||
{
|
||||
ASSERT3S(pos, ==, kiocb->ki_pos);
|
||||
return (zpl_direct_IO_impl(rw, kiocb, iter));
|
||||
return (zpl_direct_IO_impl());
|
||||
}
|
||||
#else
|
||||
#error "Unknown direct IO interface"
|
||||
#error "Unknown Direct I/O interface"
|
||||
#endif
|
||||
|
||||
#else /* HAVE_VFS_RW_ITERATE */
|
||||
|
@ -523,26 +527,16 @@ static ssize_t
|
|||
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
|
||||
loff_t pos, unsigned long nr_segs)
|
||||
{
|
||||
if (rw == WRITE)
|
||||
return (zpl_aio_write(kiocb, iov, nr_segs, pos));
|
||||
else
|
||||
return (zpl_aio_read(kiocb, iov, nr_segs, pos));
|
||||
return (zpl_direct_IO_impl());
|
||||
}
|
||||
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
|
||||
static ssize_t
|
||||
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
|
||||
{
|
||||
const struct iovec *iovp = iov_iter_iovec(iter);
|
||||
unsigned long nr_segs = iter->nr_segs;
|
||||
|
||||
ASSERT3S(pos, ==, kiocb->ki_pos);
|
||||
if (rw == WRITE)
|
||||
return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
|
||||
else
|
||||
return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
|
||||
return (zpl_direct_IO_impl());
|
||||
}
|
||||
#else
|
||||
#error "Unknown direct IO interface"
|
||||
#error "Unknown Direct I/O interface"
|
||||
#endif
|
||||
|
||||
#endif /* HAVE_VFS_RW_ITERATE */
|
||||
|
@ -627,6 +621,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
|
|||
error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
|
||||
(size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
|
||||
spl_fstrans_unmark(cookie);
|
||||
|
||||
if (error)
|
||||
return (error);
|
||||
|
||||
|
|
|
@ -395,6 +395,13 @@ zfs_prop_init(void)
|
|||
{ NULL }
|
||||
};
|
||||
|
||||
static const zprop_index_t direct_table[] = {
|
||||
{ "disabled", ZFS_DIRECT_DISABLED },
|
||||
{ "standard", ZFS_DIRECT_STANDARD },
|
||||
{ "always", ZFS_DIRECT_ALWAYS },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
struct zfs_mod_supported_features *sfeatures =
|
||||
zfs_mod_list_supported(ZFS_SYSFS_DATASET_PROPERTIES);
|
||||
|
||||
|
@ -479,6 +486,10 @@ zfs_prop_init(void)
|
|||
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
|
||||
"default | full | geom | dev | none", "VOLMODE", volmode_table,
|
||||
sfeatures);
|
||||
zprop_register_index(ZFS_PROP_DIRECT, "direct",
|
||||
ZFS_DIRECT_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
|
||||
"disabled | standard | always", "DIRECT", direct_table,
|
||||
sfeatures);
|
||||
|
||||
/* inherit index (boolean) properties */
|
||||
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
|
||||
|
|
|
@ -89,8 +89,8 @@
|
|||
* functions.
|
||||
*
|
||||
* As an additional feature, linear and scatter ABD's can be stitched together
|
||||
* by using the gang ABD type (abd_alloc_gang_abd()). This allows for
|
||||
* multiple ABDs to be viewed as a singular ABD.
|
||||
* by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs
|
||||
* to be viewed as a singular ABD.
|
||||
*
|
||||
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
|
||||
* B_FALSE.
|
||||
|
@ -109,11 +109,15 @@ void
|
|||
abd_verify(abd_t *abd)
|
||||
{
|
||||
#ifdef ZFS_DEBUG
|
||||
if (abd_is_from_pages(abd)) {
|
||||
ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
|
||||
} else {
|
||||
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
|
||||
}
|
||||
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
|
||||
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
|
||||
ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
|
||||
ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD));
|
||||
ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
|
||||
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
|
||||
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
|
||||
if (abd_is_linear(abd)) {
|
||||
|
@ -136,7 +140,7 @@ abd_verify(abd_t *abd)
|
|||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
void
|
||||
abd_init_struct(abd_t *abd)
|
||||
{
|
||||
list_link_init(&abd->abd_gang_link);
|
||||
|
@ -238,6 +242,7 @@ abd_free_linear(abd_t *abd)
|
|||
abd_free_linear_page(abd);
|
||||
return;
|
||||
}
|
||||
|
||||
if (abd->abd_flags & ABD_FLAG_META) {
|
||||
zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
|
||||
} else {
|
||||
|
@ -520,6 +525,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
|
|||
*/
|
||||
abd->abd_flags |= ABD_FLAG_LINEAR;
|
||||
|
||||
/*
|
||||
* User pages from Direct I/O requests may be in a single page
|
||||
* (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
|
||||
* that here for abd. This is required because we have to be
|
||||
* careful when borrowing the buffer from the ABD because we
|
||||
* can not place user pages under write protection on Linux.
|
||||
* See the comments in abd_os.c for abd_borrow_buf(),
|
||||
* abd_borrow_buf_copy(), abd_return_buf() and
|
||||
* abd_return_buf_copy().
|
||||
*/
|
||||
if (abd_is_from_pages(sabd)) {
|
||||
abd->abd_flags |= ABD_FLAG_FROM_PAGES |
|
||||
ABD_FLAG_LINEAR_PAGE;
|
||||
}
|
||||
|
||||
ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
|
||||
} else if (abd_is_gang(sabd)) {
|
||||
size_t left = size;
|
||||
|
@ -648,70 +668,6 @@ abd_to_buf(abd_t *abd)
|
|||
return (ABD_LINEAR_BUF(abd));
|
||||
}
|
||||
|
||||
/*
|
||||
* Borrow a raw buffer from an ABD without copying the contents of the ABD
|
||||
* into the buffer. If the ABD is scattered, this will allocate a raw buffer
|
||||
* whose contents are undefined. To copy over the existing data in the ABD, use
|
||||
* abd_borrow_buf_copy() instead.
|
||||
*/
|
||||
void *
|
||||
abd_borrow_buf(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf;
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, n);
|
||||
if (abd_is_linear(abd)) {
|
||||
buf = abd_to_buf(abd);
|
||||
} else {
|
||||
buf = zio_buf_alloc(n);
|
||||
}
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
return (buf);
|
||||
}
|
||||
|
||||
void *
|
||||
abd_borrow_buf_copy(abd_t *abd, size_t n)
|
||||
{
|
||||
void *buf = abd_borrow_buf(abd, n);
|
||||
if (!abd_is_linear(abd)) {
|
||||
abd_copy_to_buf(buf, abd, n);
|
||||
}
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
|
||||
* not change the contents of the ABD and will ASSERT that you didn't modify
|
||||
* the buffer since it was borrowed. If you want any changes you made to buf to
|
||||
* be copied back to abd, use abd_return_buf_copy() instead.
|
||||
*/
|
||||
void
|
||||
abd_return_buf(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
abd_verify(abd);
|
||||
ASSERT3U(abd->abd_size, >=, n);
|
||||
#ifdef ZFS_DEBUG
|
||||
(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
|
||||
#endif
|
||||
if (abd_is_linear(abd)) {
|
||||
ASSERT3P(buf, ==, abd_to_buf(abd));
|
||||
} else {
|
||||
ASSERT0(abd_cmp_buf(abd, buf, n));
|
||||
zio_buf_free(buf, n);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
|
||||
{
|
||||
if (!abd_is_linear(abd)) {
|
||||
abd_copy_from_buf(abd, buf, n);
|
||||
}
|
||||
abd_return_buf(abd, buf, n);
|
||||
}
|
||||
|
||||
void
|
||||
abd_release_ownership_of_buf(abd_t *abd)
|
||||
{
|
||||
|
|
|
@ -5961,7 +5961,7 @@ top:
|
|||
ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
|
||||
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
|
||||
metadata, misses);
|
||||
zfs_racct_read(size, 1);
|
||||
zfs_racct_read(spa, size, 1, 0);
|
||||
}
|
||||
|
||||
/* Check if the spa even has l2 configured */
|
||||
|
|
|
@ -217,8 +217,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
|
|||
}
|
||||
|
||||
void
|
||||
dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
|
||||
int64_t nwritten)
|
||||
dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten)
|
||||
{
|
||||
ASSERT3S(nwritten, >=, 0);
|
||||
|
||||
|
@ -230,8 +229,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
|
|||
}
|
||||
|
||||
void
|
||||
dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
|
||||
int64_t nread)
|
||||
dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread)
|
||||
{
|
||||
ASSERT3S(nread, >=, 0);
|
||||
|
||||
|
|
|
@ -628,7 +628,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
|
|||
* L2ARC.
|
||||
*/
|
||||
boolean_t
|
||||
dbuf_is_l2cacheable(dmu_buf_impl_t *db)
|
||||
dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp)
|
||||
{
|
||||
if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
|
||||
(db->db_objset->os_secondary_cache ==
|
||||
|
@ -636,10 +636,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db)
|
|||
if (l2arc_exclude_special == 0)
|
||||
return (B_TRUE);
|
||||
|
||||
blkptr_t *bp = db->db_blkptr;
|
||||
if (bp == NULL || BP_IS_HOLE(bp))
|
||||
/*
|
||||
* bp must be checked in the event it was passed from
|
||||
* dbuf_read_impl() as the result of the BP being set from
|
||||
* a Direct I/O write in dbuf_read(). See comments in
|
||||
* dbuf_read().
|
||||
*/
|
||||
blkptr_t *db_bp = bp == NULL ? db->db_blkptr : bp;
|
||||
|
||||
if (db_bp == NULL || BP_IS_HOLE(db_bp))
|
||||
return (B_FALSE);
|
||||
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
|
||||
uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva);
|
||||
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
|
||||
vdev_t *vd = NULL;
|
||||
|
||||
|
@ -1380,6 +1387,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
|||
|
||||
mutex_enter(&db->db_mtx);
|
||||
ASSERT3U(db->db_state, ==, DB_READ);
|
||||
|
||||
/*
|
||||
* All reads are synchronous, so we must have a hold on the dbuf
|
||||
*/
|
||||
|
@ -1570,12 +1578,11 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
|||
*/
|
||||
static int
|
||||
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
db_lock_type_t dblt, const void *tag)
|
||||
db_lock_type_t dblt, blkptr_t *bp, const void *tag)
|
||||
{
|
||||
zbookmark_phys_t zb;
|
||||
uint32_t aflags = ARC_FLAG_NOWAIT;
|
||||
int err, zio_flags;
|
||||
blkptr_t bp, *bpp = NULL;
|
||||
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
|
@ -1589,43 +1596,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|||
goto early_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we have a pending block clone, we don't want to read the
|
||||
* underlying block, but the content of the block being cloned,
|
||||
* pointed by the dirty record, so we have the most recent data.
|
||||
* If there is no dirty record, then we hit a race in a sync
|
||||
* process when the dirty record is already removed, while the
|
||||
* dbuf is not yet destroyed. Such case is equivalent to uncached.
|
||||
*/
|
||||
if (db->db_state == DB_NOFILL) {
|
||||
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
||||
if (dr != NULL) {
|
||||
if (!dr->dt.dl.dr_brtwrite) {
|
||||
err = EIO;
|
||||
goto early_unlock;
|
||||
}
|
||||
bp = dr->dt.dl.dr_overridden_by;
|
||||
bpp = &bp;
|
||||
}
|
||||
}
|
||||
|
||||
if (bpp == NULL && db->db_blkptr != NULL) {
|
||||
bp = *db->db_blkptr;
|
||||
bpp = &bp;
|
||||
}
|
||||
|
||||
err = dbuf_read_hole(db, dn, bpp);
|
||||
err = dbuf_read_hole(db, dn, bp);
|
||||
if (err == 0)
|
||||
goto early_unlock;
|
||||
|
||||
ASSERT(bpp != NULL);
|
||||
ASSERT(bp != NULL);
|
||||
|
||||
/*
|
||||
* Any attempt to read a redacted block should result in an error. This
|
||||
* will never happen under normal conditions, but can be useful for
|
||||
* debugging purposes.
|
||||
*/
|
||||
if (BP_IS_REDACTED(bpp)) {
|
||||
if (BP_IS_REDACTED(bp)) {
|
||||
ASSERT(dsl_dataset_feature_is_active(
|
||||
db->db_objset->os_dsl_dataset,
|
||||
SPA_FEATURE_REDACTED_DATASETS));
|
||||
|
@ -1640,9 +1622,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|||
* All bps of an encrypted os should have the encryption bit set.
|
||||
* If this is not true it indicates tampering and we report an error.
|
||||
*/
|
||||
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
|
||||
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
|
||||
spa_log_error(db->db_objset->os_spa, &zb,
|
||||
BP_GET_LOGICAL_BIRTH(bpp));
|
||||
BP_GET_LOGICAL_BIRTH(bp));
|
||||
err = SET_ERROR(EIO);
|
||||
goto early_unlock;
|
||||
}
|
||||
|
@ -1653,7 +1635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|||
|
||||
if (!DBUF_IS_CACHEABLE(db))
|
||||
aflags |= ARC_FLAG_UNCACHED;
|
||||
else if (dbuf_is_l2cacheable(db))
|
||||
else if (dbuf_is_l2cacheable(db, bp))
|
||||
aflags |= ARC_FLAG_L2CACHE;
|
||||
|
||||
dbuf_add_ref(db, NULL);
|
||||
|
@ -1661,17 +1643,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
|||
zio_flags = (flags & DB_RF_CANFAIL) ?
|
||||
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
|
||||
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
|
||||
zio_flags |= ZIO_FLAG_RAW;
|
||||
|
||||
/*
|
||||
* The zio layer will copy the provided blkptr later, but we have our
|
||||
* own copy so that we can release the parent's rwlock. We have to
|
||||
* do that so that if dbuf_read_done is called synchronously (on
|
||||
* The zio layer will copy the provided blkptr later, but we need to
|
||||
* do this now so that we can release the parent's rwlock. We have to
|
||||
* do that now so that if dbuf_read_done is called synchronously (on
|
||||
* an l1 cache hit) we don't acquire the db_mtx while holding the
|
||||
* parent's rwlock, which would be a lock ordering violation.
|
||||
*/
|
||||
blkptr_t copy = *bp;
|
||||
dmu_buf_unlock_parent(db, dblt, tag);
|
||||
return (arc_read(zio, db->db_objset->os_spa, bpp,
|
||||
return (arc_read(zio, db->db_objset->os_spa, ©,
|
||||
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
|
||||
&aflags, &zb));
|
||||
|
||||
|
@ -1844,13 +1828,30 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
|||
ASSERT(db->db_state == DB_UNCACHED ||
|
||||
db->db_state == DB_NOFILL);
|
||||
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
||||
blkptr_t *bp;
|
||||
|
||||
/*
|
||||
* If a block clone or Direct I/O write has occurred we will
|
||||
* get the dirty record's overridden BP so we get the most
|
||||
* recent data.
|
||||
*/
|
||||
err = dmu_buf_get_bp_from_dbuf(db, &bp);
|
||||
|
||||
if (!err) {
|
||||
if (pio == NULL && (db->db_state == DB_NOFILL ||
|
||||
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
|
||||
(bp != NULL && !BP_IS_HOLE(bp)))) {
|
||||
spa_t *spa = dn->dn_objset->os_spa;
|
||||
pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
pio =
|
||||
zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
need_wait = B_TRUE;
|
||||
}
|
||||
err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
|
||||
|
||||
err =
|
||||
dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
|
||||
} else {
|
||||
mutex_exit(&db->db_mtx);
|
||||
dmu_buf_unlock_parent(db, dblt, FTAG);
|
||||
}
|
||||
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
}
|
||||
|
@ -1918,6 +1919,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
|
|||
uint64_t txg = dr->dr_txg;
|
||||
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
|
||||
/*
|
||||
* This assert is valid because dmu_sync() expects to be called by
|
||||
* a zilog's get_data while holding a range lock. This call only
|
||||
|
@ -1936,16 +1938,20 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
|
|||
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
|
||||
zio_free(db->db_objset->os_spa, txg, bp);
|
||||
|
||||
if (dr->dt.dl.dr_brtwrite) {
|
||||
if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
|
||||
ASSERT0P(dr->dt.dl.dr_data);
|
||||
dr->dt.dl.dr_data = db->db_buf;
|
||||
}
|
||||
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
||||
dr->dt.dl.dr_nopwrite = B_FALSE;
|
||||
dr->dt.dl.dr_brtwrite = B_FALSE;
|
||||
dr->dt.dl.dr_diowrite = B_FALSE;
|
||||
dr->dt.dl.dr_has_raw_params = B_FALSE;
|
||||
|
||||
/*
|
||||
* In the event that Direct I/O was used, we do not
|
||||
* need to release the buffer from the ARC.
|
||||
*
|
||||
* Release the already-written buffer, so we leave it in
|
||||
* a consistent dirty state. Note that all callers are
|
||||
* modifying the buffer, so they will immediately do
|
||||
|
@ -2084,6 +2090,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
|
|||
*/
|
||||
dmu_buf_will_dirty(&db->db, tx);
|
||||
|
||||
VERIFY3P(db->db_buf, !=, NULL);
|
||||
|
||||
/* create the data buffer for the new block */
|
||||
buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
|
||||
|
||||
|
@ -2532,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|||
{
|
||||
uint64_t txg = tx->tx_txg;
|
||||
boolean_t brtwrite;
|
||||
boolean_t diowrite;
|
||||
|
||||
ASSERT(txg != 0);
|
||||
|
||||
|
@ -2557,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|||
ASSERT(dr->dr_dbuf == db);
|
||||
|
||||
brtwrite = dr->dt.dl.dr_brtwrite;
|
||||
diowrite = dr->dt.dl.dr_diowrite;
|
||||
if (brtwrite) {
|
||||
ASSERT3B(diowrite, ==, B_FALSE);
|
||||
/*
|
||||
* We are freeing a block that we cloned in the same
|
||||
* transaction group.
|
||||
|
@ -2598,11 +2609,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|||
if (db->db_state != DB_NOFILL && !brtwrite) {
|
||||
dbuf_unoverride(dr);
|
||||
|
||||
if (dr->dt.dl.dr_data != db->db_buf) {
|
||||
ASSERT(db->db_buf != NULL);
|
||||
ASSERT(dr->dt.dl.dr_data != NULL);
|
||||
if (dr->dt.dl.dr_data != db->db_buf)
|
||||
arc_buf_destroy(dr->dt.dl.dr_data, db);
|
||||
}
|
||||
}
|
||||
|
||||
kmem_free(dr, sizeof (dbuf_dirty_record_t));
|
||||
|
||||
|
@ -2610,7 +2622,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
|
|||
db->db_dirtycnt -= 1;
|
||||
|
||||
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
|
||||
ASSERT(db->db_state == DB_NOFILL || brtwrite ||
|
||||
ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
|
||||
arc_released(db->db_buf));
|
||||
dbuf_destroy(db);
|
||||
return (B_TRUE);
|
||||
|
@ -2670,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
|
|||
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
|
||||
* want to make sure dbuf_read() will read the pending cloned block and
|
||||
* not the underlying block that is being replaced. dbuf_undirty() will
|
||||
* do dbuf_unoverride(), so we will end up with cloned block content,
|
||||
* without overridden BP.
|
||||
* do brt_pending_remove() before removing the dirty record.
|
||||
*/
|
||||
(void) dbuf_read(db, NULL, flags);
|
||||
if (undirty) {
|
||||
|
@ -2701,8 +2712,87 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|||
return (dr != NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Normally the db_blkptr points to the most recent on-disk content for the
|
||||
* dbuf (and anything newer will be cached in the dbuf). However, a pending
|
||||
* block clone or not yet synced Direct I/O write will have a dirty record BP
|
||||
* pointing to the most recent data.
|
||||
*/
|
||||
int
|
||||
dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
|
||||
{
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
int error = 0;
|
||||
|
||||
if (db->db_level != 0) {
|
||||
*bp = db->db_blkptr;
|
||||
return (0);
|
||||
}
|
||||
|
||||
*bp = db->db_blkptr;
|
||||
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
|
||||
if (dr && db->db_state == DB_NOFILL) {
|
||||
/* Block clone */
|
||||
if (!dr->dt.dl.dr_brtwrite)
|
||||
error = EIO;
|
||||
else
|
||||
*bp = &dr->dt.dl.dr_overridden_by;
|
||||
} else if (dr && db->db_state == DB_UNCACHED) {
|
||||
/* Direct I/O write */
|
||||
if (dr->dt.dl.dr_diowrite)
|
||||
*bp = &dr->dt.dl.dr_overridden_by;
|
||||
}
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Direct I/O reads can read directly from the ARC, but the data has
|
||||
* to be untransformed in order to copy it over into user pages.
|
||||
*/
|
||||
int
|
||||
dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa)
|
||||
{
|
||||
int err = 0;
|
||||
DB_DNODE_ENTER(db);
|
||||
dnode_t *dn = DB_DNODE(db);
|
||||
|
||||
ASSERT3S(db->db_state, ==, DB_CACHED);
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
|
||||
/*
|
||||
* Ensure that this block's dnode has been decrypted if
|
||||
* the caller has requested decrypted data.
|
||||
*/
|
||||
err = dbuf_read_verify_dnode_crypt(db, dn, 0);
|
||||
|
||||
/*
|
||||
* If the arc buf is compressed or encrypted and the caller
|
||||
* requested uncompressed data, we need to untransform it
|
||||
* before returning. We also call arc_untransform() on any
|
||||
* unauthenticated blocks, which will verify their MAC if
|
||||
* the key is now available.
|
||||
*/
|
||||
if (err == 0 && db->db_buf != NULL &&
|
||||
(arc_is_encrypted(db->db_buf) ||
|
||||
arc_is_unauthenticated(db->db_buf) ||
|
||||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
||||
zbookmark_phys_t zb;
|
||||
|
||||
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
|
||||
db->db.db_object, db->db_level, db->db_blkid);
|
||||
dbuf_fix_old_data(db, spa_syncing_txg(spa));
|
||||
err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
|
||||
dbuf_set_data(db, db->db_buf);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_hits);
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
void
|
||||
dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
|
||||
ASSERT0(db->db_level);
|
||||
|
@ -2710,14 +2800,41 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|||
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
|
||||
|
||||
/*
|
||||
* Block cloning: We are going to clone into this block, so undirty
|
||||
* modifications done to this block so far in this txg. This includes
|
||||
* writes and clones into this block.
|
||||
* Block clones and Direct I/O writes always happen in open-context.
|
||||
*/
|
||||
ASSERT(!dmu_tx_is_syncing(tx));
|
||||
ASSERT0(db->db_level);
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
DBUF_VERIFY(db);
|
||||
VERIFY(!dbuf_undirty(db, tx));
|
||||
|
||||
/*
|
||||
* We are going to clone or issue a Direct I/O write on this block, so
|
||||
* undirty modifications done to this block so far in this txg. This
|
||||
* includes writes and clones into this block.
|
||||
*
|
||||
* If there is a dirty record for this txg from a previous Direct
|
||||
* I/O write then space accounting cleanup takes place. It is important
|
||||
* to go ahead and free the space accounting through dbuf_undirty() ->
|
||||
* dbuf_unoverride() -> zio_free(). Space accounting for determining
|
||||
* if a write can occur in zfs_write() happens through dmu_tx_assign().
|
||||
* This can cause an issue with Direct I/O writes in the case of
|
||||
* overwriting the same block, because all DVA allocations are being
|
||||
* done in open-context. Constantly allowing Direct I/O overwrites to
|
||||
* the same block can exhaust the pool's available space leading to
|
||||
* ENOSPC errors at the DVA allocation part of the ZIO pipeline, which
|
||||
* will eventually suspend the pool. By cleaning up space accounting
|
||||
* now, the ENOSPC error can be avoided.
|
||||
*
|
||||
* Since we are undirtying the record in open-context, we must have a
|
||||
* hold on the db, so it should never be evicted after calling
|
||||
* dbuf_undirty().
|
||||
*/
|
||||
VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE);
|
||||
ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
|
||||
|
||||
if (db->db_buf != NULL) {
|
||||
/*
|
||||
* If there is an associated ARC buffer with this dbuf we can
|
||||
|
@ -2728,6 +2845,11 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|||
if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
|
||||
arc_buf_destroy(db->db_buf, db);
|
||||
|
||||
/*
|
||||
* Setting the dbuf's data pointers to NULL will force all
|
||||
* future reads down to the devices to get the most up to date
|
||||
* version of the data after a Direct I/O write has completed.
|
||||
*/
|
||||
db->db_buf = NULL;
|
||||
dbuf_clear_data(db);
|
||||
}
|
||||
|
@ -2736,7 +2858,8 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
|
|||
ASSERT3P(db->db.db_data, ==, NULL);
|
||||
|
||||
db->db_state = DB_NOFILL;
|
||||
DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
|
||||
DTRACE_SET_STATE(db,
|
||||
"allocating NOFILL buffer for clone or direct I/O write");
|
||||
|
||||
DBUF_VERIFY(db);
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
@ -2773,22 +2896,29 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
|
|||
dmu_tx_private_ok(tx));
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
if (db->db_state == DB_NOFILL) {
|
||||
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
|
||||
if (db->db_state == DB_NOFILL ||
|
||||
(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
|
||||
/*
|
||||
* Block cloning: We will be completely overwriting a block
|
||||
* cloned in this transaction group, so let's undirty the
|
||||
* pending clone and mark the block as uncached. This will be
|
||||
* as if the clone was never done. But if the fill can fail
|
||||
* we should have a way to return back to the cloned data.
|
||||
* If the fill can fail we should have a way to return back to
|
||||
* the cloned or Direct I/O write data.
|
||||
*/
|
||||
if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
|
||||
if (canfail && dr) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
dmu_buf_will_dirty(db_fake, tx);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Block cloning: We will be completely overwriting a block
|
||||
* cloned in this transaction group, so let's undirty the
|
||||
* pending clone and mark the block as uncached. This will be
|
||||
* as if the clone was never done.
|
||||
*/
|
||||
if (dr && dr->dt.dl.dr_brtwrite) {
|
||||
VERIFY(!dbuf_undirty(db, tx));
|
||||
db->db_state = DB_UNCACHED;
|
||||
}
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dbuf_noread(db);
|
||||
|
@ -4080,7 +4210,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
|
|||
} else {
|
||||
mutex_exit(&db->db_mtx);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#pragma weak dmu_buf_refcount = dbuf_refcount
|
||||
|
@ -4540,24 +4669,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|||
|
||||
mutex_enter(&db->db_mtx);
|
||||
/*
|
||||
* To be synced, we must be dirtied. But we
|
||||
* might have been freed after the dirty.
|
||||
* To be synced, we must be dirtied. But we might have been freed
|
||||
* after the dirty.
|
||||
*/
|
||||
if (db->db_state == DB_UNCACHED) {
|
||||
/* This buffer has been freed since it was dirtied */
|
||||
ASSERT(db->db.db_data == NULL);
|
||||
ASSERT3P(db->db.db_data, ==, NULL);
|
||||
} else if (db->db_state == DB_FILL) {
|
||||
/* This buffer was freed and is now being re-filled */
|
||||
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
|
||||
} else if (db->db_state == DB_READ) {
|
||||
/*
|
||||
* This buffer has a clone we need to write, and an in-flight
|
||||
* read on the BP we're about to clone. Its safe to issue the
|
||||
* write here because the read has already been issued and the
|
||||
* contents won't change.
|
||||
* This buffer was either cloned or had a Direct I/O write
|
||||
* occur and has an in-flight read on the BP. It is safe to
|
||||
* issue the write here, because the read has already been
|
||||
* issued and the contents won't change.
|
||||
*
|
||||
* We can verify the case of both the clone and Direct I/O
|
||||
* write by making sure the first dirty record for the dbuf
|
||||
* has no ARC buffer associated with it.
|
||||
*/
|
||||
ASSERT(dr->dt.dl.dr_brtwrite &&
|
||||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
|
||||
dbuf_dirty_record_t *dr_head =
|
||||
list_head(&db->db_dirty_records);
|
||||
ASSERT3P(db->db_buf, ==, NULL);
|
||||
ASSERT3P(db->db.db_data, ==, NULL);
|
||||
ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
|
||||
ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
|
||||
} else {
|
||||
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
|
||||
}
|
||||
|
@ -4608,8 +4745,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
|
|||
dbuf_check_blkptr(dn, db);
|
||||
|
||||
/*
|
||||
* If this buffer is in the middle of an immediate write,
|
||||
* wait for the synchronous IO to complete.
|
||||
* If this buffer is in the middle of an immediate write, wait for the
|
||||
* synchronous IO to complete.
|
||||
*
|
||||
* This is also valid even with Direct I/O writes setting a dirty
|
||||
* record's override state to DR_IN_DMU_SYNC, because all
|
||||
* Direct I/O writes happen in open-context.
|
||||
*/
|
||||
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
|
||||
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
|
||||
|
@ -4913,8 +5054,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
|
|||
if (db->db_level == 0) {
|
||||
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
|
||||
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
|
||||
|
||||
/* no dr_data if this is a NO_FILL or Direct I/O */
|
||||
if (dr->dt.dl.dr_data != NULL &&
|
||||
dr->dt.dl.dr_data != db->db_buf) {
|
||||
ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
|
||||
ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
|
||||
arc_buf_destroy(dr->dt.dl.dr_data, db);
|
||||
}
|
||||
} else {
|
||||
|
@ -5180,7 +5325,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
|||
dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
|
||||
/*
|
||||
* The BP for this block has been provided by open context
|
||||
* (by dmu_sync() or dmu_buf_write_embedded()).
|
||||
* (by dmu_sync(), dmu_write_direct(),
|
||||
* or dmu_buf_write_embedded()).
|
||||
*/
|
||||
abd_t *contents = (data != NULL) ?
|
||||
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
|
||||
|
@ -5219,7 +5365,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
|
|||
|
||||
dr->dr_zio = arc_write(pio, os->os_spa, txg,
|
||||
&dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
|
||||
dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
|
||||
dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready,
|
||||
children_ready_cb, dbuf_write_done, db,
|
||||
ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
|
||||
}
|
||||
|
@ -5239,7 +5385,7 @@ EXPORT_SYMBOL(dbuf_dirty);
|
|||
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
|
||||
EXPORT_SYMBOL(dmu_buf_will_dirty);
|
||||
EXPORT_SYMBOL(dmu_buf_is_dirty);
|
||||
EXPORT_SYMBOL(dmu_buf_will_clone);
|
||||
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
|
||||
EXPORT_SYMBOL(dmu_buf_will_not_fill);
|
||||
EXPORT_SYMBOL(dmu_buf_will_fill);
|
||||
EXPORT_SYMBOL(dmu_buf_fill_done);
|
||||
|
|
149
module/zfs/dmu.c
149
module/zfs/dmu.c
|
@ -609,8 +609,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
|||
dbp[i] = &db->db;
|
||||
}
|
||||
|
||||
if (!read)
|
||||
zfs_racct_write(length, nblks);
|
||||
/*
|
||||
* If we are doing O_DIRECT we still hold the dbufs, even for reads,
|
||||
* but we do not issue any reads here. We do not want to account for
|
||||
* writes in this case.
|
||||
*
|
||||
* O_DIRECT write/read accounting takes place in
|
||||
* dmu_{write/read}_abd().
|
||||
*/
|
||||
if (!read && ((flags & DMU_DIRECTIO) == 0))
|
||||
zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
|
||||
|
||||
if (zs)
|
||||
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
|
||||
|
@ -897,7 +905,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
|
|||
|
||||
/*
|
||||
* Get the next "chunk" of file data to free. We traverse the file from
|
||||
* the end so that the file gets shorter over time (if we crashes in the
|
||||
* the end so that the file gets shorter over time (if we crash in the
|
||||
* middle, this will leave us in a better state). We find allocated file
|
||||
* data by simply searching the allocated level 1 indirects.
|
||||
*
|
||||
|
@ -1178,6 +1186,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
|
|||
size = newsz;
|
||||
}
|
||||
|
||||
if (size == 0)
|
||||
return (0);
|
||||
|
||||
/* Allow Direct I/O when requested and properly aligned */
|
||||
if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
|
||||
zfs_dio_aligned(offset, size, PAGESIZE)) {
|
||||
abd_t *data = abd_get_from_buf(buf, size);
|
||||
err = dmu_read_abd(dn, offset, size, data, flags);
|
||||
abd_free(data);
|
||||
return (err);
|
||||
}
|
||||
|
||||
while (size > 0) {
|
||||
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
|
||||
int i;
|
||||
|
@ -1286,22 +1306,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
|
|||
}
|
||||
|
||||
/*
|
||||
* Note: Lustre is an external consumer of this interface.
|
||||
* This interface is not used internally by ZFS but is provided for
|
||||
* use by Lustre which is built on the DMU interfaces.
|
||||
*/
|
||||
void
|
||||
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx)
|
||||
int
|
||||
dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx, uint32_t flags)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs;
|
||||
int error;
|
||||
|
||||
if (size == 0)
|
||||
return;
|
||||
return (0);
|
||||
|
||||
/* Allow Direct I/O when requested and properly aligned */
|
||||
if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
|
||||
zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
|
||||
abd_t *data = abd_get_from_buf((void *)buf, size);
|
||||
error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
|
||||
abd_free(data);
|
||||
return (error);
|
||||
}
|
||||
|
||||
VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
|
||||
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
|
||||
dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
const void *buf, dmu_tx_t *tx)
|
||||
{
|
||||
return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -1365,6 +1404,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
|||
dmu_buf_t **dbp;
|
||||
int numbufs, i, err;
|
||||
|
||||
if (uio->uio_extflg & UIO_DIRECT)
|
||||
return (dmu_read_uio_direct(dn, uio, size));
|
||||
|
||||
/*
|
||||
* NB: we could do this block-at-a-time, but it's nice
|
||||
* to be reading in parallel.
|
||||
|
@ -1453,23 +1495,52 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
|
|||
dmu_buf_t **dbp;
|
||||
int numbufs;
|
||||
int err = 0;
|
||||
int i;
|
||||
uint64_t write_size;
|
||||
|
||||
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
|
||||
top:
|
||||
write_size = size;
|
||||
|
||||
/*
|
||||
* We only allow Direct I/O writes to happen if we are block
|
||||
* sized aligned. Otherwise, we pass the write off to the ARC.
|
||||
*/
|
||||
if ((uio->uio_extflg & UIO_DIRECT) &&
|
||||
(write_size >= dn->dn_datablksz)) {
|
||||
if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
|
||||
dn->dn_datablksz)) {
|
||||
return (dmu_write_uio_direct(dn, uio, size, tx));
|
||||
} else if (write_size > dn->dn_datablksz &&
|
||||
zfs_dio_offset_aligned(zfs_uio_offset(uio),
|
||||
dn->dn_datablksz)) {
|
||||
err = dmu_write_uio_direct(dn, uio, dn->dn_datablksz,
|
||||
tx);
|
||||
if (err == 0) {
|
||||
size -= dn->dn_datablksz;
|
||||
goto top;
|
||||
} else {
|
||||
return (err);
|
||||
}
|
||||
} else {
|
||||
write_size =
|
||||
P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
|
||||
}
|
||||
}
|
||||
|
||||
err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
|
||||
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
for (i = 0; i < numbufs; i++) {
|
||||
for (int i = 0; i < numbufs; i++) {
|
||||
uint64_t tocpy;
|
||||
int64_t bufoff;
|
||||
dmu_buf_t *db = dbp[i];
|
||||
|
||||
ASSERT(size > 0);
|
||||
ASSERT(write_size > 0);
|
||||
|
||||
offset_t off = zfs_uio_offset(uio);
|
||||
bufoff = off - db->db_offset;
|
||||
tocpy = MIN(db->db_size - bufoff, size);
|
||||
tocpy = MIN(db->db_size - bufoff, write_size);
|
||||
|
||||
ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
|
||||
|
||||
|
@ -1489,10 +1560,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
|
|||
if (err)
|
||||
break;
|
||||
|
||||
write_size -= tocpy;
|
||||
size -= tocpy;
|
||||
}
|
||||
|
||||
IMPLY(err == 0, write_size == 0);
|
||||
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
|
||||
if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
|
||||
goto top;
|
||||
}
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
@ -1731,7 +1810,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
|
|||
* same size as the dbuf.
|
||||
*/
|
||||
if (offset == db->db.db_offset && blksz == db->db.db_size) {
|
||||
zfs_racct_write(blksz, 1);
|
||||
zfs_racct_write(os->os_spa, blksz, 1, 0);
|
||||
dbuf_assign_arcbuf(db, buf, tx);
|
||||
dbuf_rele(db, FTAG);
|
||||
} else {
|
||||
|
@ -1761,23 +1840,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
|
|||
return (err);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
dbuf_dirty_record_t *dsa_dr;
|
||||
dmu_sync_cb_t *dsa_done;
|
||||
zgd_t *dsa_zgd;
|
||||
dmu_tx_t *dsa_tx;
|
||||
} dmu_sync_arg_t;
|
||||
|
||||
static void
|
||||
void
|
||||
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
|
||||
{
|
||||
(void) buf;
|
||||
dmu_sync_arg_t *dsa = varg;
|
||||
dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
|
||||
if (zio->io_error == 0) {
|
||||
dbuf_dirty_record_t *dr = dsa->dsa_dr;
|
||||
blkptr_t *bp = zio->io_bp;
|
||||
|
||||
if (BP_IS_HOLE(bp)) {
|
||||
dmu_buf_t *db = NULL;
|
||||
if (dr)
|
||||
db = &(dr->dr_dbuf->db);
|
||||
else
|
||||
db = dsa->dsa_zgd->zgd_db;
|
||||
/*
|
||||
* A block of zeros may compress to a hole, but the
|
||||
* block size still needs to be known for replay.
|
||||
|
@ -1796,7 +1874,7 @@ dmu_sync_late_arrival_ready(zio_t *zio)
|
|||
dmu_sync_ready(zio, NULL, zio->io_private);
|
||||
}
|
||||
|
||||
static void
|
||||
void
|
||||
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
||||
{
|
||||
(void) buf;
|
||||
|
@ -1809,7 +1887,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
|||
* Record the vdev(s) backing this blkptr so they can be flushed after
|
||||
* the writes for the lwb have completed.
|
||||
*/
|
||||
if (zio->io_error == 0) {
|
||||
if (zgd && zio->io_error == 0) {
|
||||
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
|
||||
}
|
||||
|
||||
|
@ -1848,9 +1926,11 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
|
|||
} else {
|
||||
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
|
||||
}
|
||||
|
||||
cv_broadcast(&db->db_changed);
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
if (dsa->dsa_done)
|
||||
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
|
||||
|
||||
kmem_free(dsa, sizeof (*dsa));
|
||||
|
@ -2120,9 +2200,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
|
|||
dsa->dsa_tx = NULL;
|
||||
|
||||
zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
|
||||
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db),
|
||||
&zp, dmu_sync_ready, NULL, dmu_sync_done, dsa,
|
||||
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
|
||||
dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
|
||||
dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
|
||||
dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
|
||||
&zb));
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
@ -2385,6 +2466,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
|
|||
zp->zp_nopwrite = nopwrite;
|
||||
zp->zp_encrypt = encrypt;
|
||||
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
|
||||
zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
|
||||
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
|
||||
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
|
||||
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
|
||||
|
@ -2594,7 +2676,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
|
|||
ASSERT(db->db_blkid != DMU_SPILL_BLKID);
|
||||
ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
|
||||
|
||||
dmu_buf_will_clone(dbuf, tx);
|
||||
dmu_buf_will_clone_or_dio(dbuf, tx);
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
|
||||
|
@ -2817,8 +2899,15 @@ EXPORT_SYMBOL(dmu_free_long_range);
|
|||
EXPORT_SYMBOL(dmu_free_long_object);
|
||||
EXPORT_SYMBOL(dmu_read);
|
||||
EXPORT_SYMBOL(dmu_read_by_dnode);
|
||||
EXPORT_SYMBOL(dmu_read_uio);
|
||||
EXPORT_SYMBOL(dmu_read_uio_dbuf);
|
||||
EXPORT_SYMBOL(dmu_read_uio_dnode);
|
||||
EXPORT_SYMBOL(dmu_write);
|
||||
EXPORT_SYMBOL(dmu_write_by_dnode);
|
||||
EXPORT_SYMBOL(dmu_write_by_dnode_flags);
|
||||
EXPORT_SYMBOL(dmu_write_uio);
|
||||
EXPORT_SYMBOL(dmu_write_uio_dbuf);
|
||||
EXPORT_SYMBOL(dmu_write_uio_dnode);
|
||||
EXPORT_SYMBOL(dmu_prealloc);
|
||||
EXPORT_SYMBOL(dmu_object_info);
|
||||
EXPORT_SYMBOL(dmu_object_info_from_dnode);
|
||||
|
|
|
@ -0,0 +1,395 @@
|
|||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or https://opensource.org/licenses/CDDL-1.0.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/dmu_impl.h>
|
||||
#include <sys/dbuf.h>
|
||||
#include <sys/dnode.h>
|
||||
#include <sys/zfs_context.h>
|
||||
#include <sys/zfs_racct.h>
|
||||
#include <sys/dsl_dataset.h>
|
||||
#include <sys/dmu_objset.h>
|
||||
|
||||
static abd_t *
|
||||
make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
|
||||
uint64_t size)
|
||||
{
|
||||
size_t buf_size = db->db.db_size;
|
||||
abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
|
||||
size_t buf_off = 0;
|
||||
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
|
||||
if (offset > db->db.db_offset) {
|
||||
size_t pre_size = offset - db->db.db_offset;
|
||||
pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
|
||||
buf_size -= pre_size;
|
||||
buf_off = 0;
|
||||
} else {
|
||||
buf_off = db->db.db_offset - offset;
|
||||
size -= buf_off;
|
||||
}
|
||||
|
||||
if (size < buf_size) {
|
||||
size_t post_size = buf_size - size;
|
||||
post_buf = abd_alloc_for_io(post_size, B_TRUE);
|
||||
buf_size -= post_size;
|
||||
}
|
||||
|
||||
ASSERT3U(buf_size, >, 0);
|
||||
abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);
|
||||
|
||||
if (pre_buf || post_buf) {
|
||||
mbuf = abd_alloc_gang();
|
||||
if (pre_buf)
|
||||
abd_gang_add(mbuf, pre_buf, B_TRUE);
|
||||
abd_gang_add(mbuf, buf, B_TRUE);
|
||||
if (post_buf)
|
||||
abd_gang_add(mbuf, post_buf, B_TRUE);
|
||||
} else {
|
||||
mbuf = buf;
|
||||
}
|
||||
|
||||
return (mbuf);
|
||||
}
|
||||
|
||||
static void
|
||||
dmu_read_abd_done(zio_t *zio)
|
||||
{
|
||||
abd_free(zio->io_abd);
|
||||
}
|
||||
|
||||
static void
|
||||
dmu_write_direct_ready(zio_t *zio)
|
||||
{
|
||||
dmu_sync_ready(zio, NULL, zio->io_private);
|
||||
}
|
||||
|
||||
static void
|
||||
dmu_write_direct_done(zio_t *zio)
|
||||
{
|
||||
dmu_sync_arg_t *dsa = zio->io_private;
|
||||
dbuf_dirty_record_t *dr = dsa->dsa_dr;
|
||||
dmu_buf_impl_t *db = dr->dr_dbuf;
|
||||
|
||||
abd_free(zio->io_abd);
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
ASSERT3P(db->db_buf, ==, NULL);
|
||||
ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
|
||||
ASSERT3P(db->db.db_data, ==, NULL);
|
||||
db->db_state = DB_UNCACHED;
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dmu_sync_done(zio, NULL, zio->io_private);
|
||||
|
||||
if (zio->io_error != 0) {
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
|
||||
ASSERT3U(zio->io_error, ==, EIO);
|
||||
|
||||
/*
|
||||
* In the event of an I/O error this block has been freed in
|
||||
* zio_done() through zio_dva_unallocate(). Calling
|
||||
* dmu_sync_done() above set dr_override_state to
|
||||
* DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
|
||||
* dbuf_unoverride(), it will skip doing zio_free() to free
|
||||
* this block as that was already taken care of.
|
||||
*
|
||||
* Since we are undirtying the record in open-context, we must
|
||||
* have a hold on the db, so it should never be evicted after
|
||||
* calling dbuf_undirty().
|
||||
*/
|
||||
mutex_enter(&db->db_mtx);
|
||||
VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
|
||||
mutex_exit(&db->db_mtx);
|
||||
}
|
||||
|
||||
kmem_free(zio->io_bp, sizeof (blkptr_t));
|
||||
zio->io_bp = NULL;
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
|
||||
{
|
||||
objset_t *os = db->db_objset;
|
||||
dsl_dataset_t *ds = dmu_objset_ds(os);
|
||||
zbookmark_phys_t zb;
|
||||
dbuf_dirty_record_t *dr_head;
|
||||
|
||||
SET_BOOKMARK(&zb, ds->ds_object,
|
||||
db->db.db_object, db->db_level, db->db_blkid);
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
zio_prop_t zp;
|
||||
dmu_write_policy(os, DB_DNODE(db), db->db_level,
|
||||
WP_DMU_SYNC | WP_DIRECT_WR, &zp);
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
/*
|
||||
* Dirty this dbuf with DB_NOFILL since we will not have any data
|
||||
* associated with the dbuf.
|
||||
*/
|
||||
dmu_buf_will_clone_or_dio(&db->db, tx);
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
|
||||
uint64_t txg = dmu_tx_get_txg(tx);
|
||||
ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
|
||||
ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
|
||||
|
||||
dr_head = list_head(&db->db_dirty_records);
|
||||
ASSERT3U(dr_head->dr_txg, ==, txg);
|
||||
dr_head->dt.dl.dr_diowrite = B_TRUE;
|
||||
dr_head->dr_accounted = db->db.db_size;
|
||||
|
||||
blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
|
||||
if (db->db_blkptr != NULL) {
|
||||
/*
|
||||
* Fill in bp with the current block pointer so that
|
||||
* the nopwrite code can check if we're writing the same
|
||||
* data that's already on disk.
|
||||
*/
|
||||
*bp = *db->db_blkptr;
|
||||
} else {
|
||||
memset(bp, 0, sizeof (blkptr_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Disable nopwrite if the current block pointer could change
|
||||
* before this TXG syncs.
|
||||
*/
|
||||
if (list_next(&db->db_dirty_records, dr_head) != NULL)
|
||||
zp.zp_nopwrite = B_FALSE;
|
||||
|
||||
ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
|
||||
dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
|
||||
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);
|
||||
|
||||
dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
|
||||
dsa->dsa_dr = dr_head;
|
||||
dsa->dsa_tx = tx;
|
||||
|
||||
zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
|
||||
db->db.db_size, db->db.db_size, &zp,
|
||||
dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
|
||||
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
|
||||
|
||||
if (pio == NULL)
|
||||
return (zio_wait(zio));
|
||||
|
||||
zio_nowait(zio);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
abd_t *data, uint32_t flags, dmu_tx_t *tx)
|
||||
{
|
||||
dmu_buf_t **dbp;
|
||||
spa_t *spa = dn->dn_objset->os_spa;
|
||||
int numbufs, err;
|
||||
|
||||
ASSERT(flags & DMU_DIRECTIO);
|
||||
|
||||
err = dmu_buf_hold_array_by_dnode(dn, offset,
|
||||
size, B_FALSE, FTAG, &numbufs, &dbp, flags);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
|
||||
for (int i = 0; i < numbufs && err == 0; i++) {
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
|
||||
|
||||
abd_t *abd = abd_get_offset_size(data,
|
||||
db->db.db_offset - offset, dn->dn_datablksz);
|
||||
|
||||
zfs_racct_write(spa, db->db.db_size, 1, flags);
|
||||
err = dmu_write_direct(pio, db, abd, tx);
|
||||
ASSERT0(err);
|
||||
}
|
||||
|
||||
err = zio_wait(pio);
|
||||
|
||||
/*
|
||||
* The dbuf must be held until the Direct I/O write has completed in
|
||||
* the event there was any errors and dbuf_undirty() was called.
|
||||
*/
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
|
||||
abd_t *data, uint32_t flags)
|
||||
{
|
||||
objset_t *os = dn->dn_objset;
|
||||
spa_t *spa = os->os_spa;
|
||||
dmu_buf_t **dbp;
|
||||
int numbufs, err;
|
||||
|
||||
ASSERT(flags & DMU_DIRECTIO);
|
||||
|
||||
err = dmu_buf_hold_array_by_dnode(dn, offset,
|
||||
size, B_FALSE, FTAG, &numbufs, &dbp, flags);
|
||||
if (err)
|
||||
return (err);
|
||||
|
||||
zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||
|
||||
for (int i = 0; i < numbufs; i++) {
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
|
||||
abd_t *mbuf;
|
||||
zbookmark_phys_t zb;
|
||||
blkptr_t *bp;
|
||||
|
||||
mutex_enter(&db->db_mtx);
|
||||
|
||||
SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
|
||||
db->db.db_object, db->db_level, db->db_blkid);
|
||||
|
||||
/*
|
||||
* If there is another read for this dbuf, we will wait for
|
||||
* that to complete first before checking the db_state below.
|
||||
*/
|
||||
while (db->db_state == DB_READ)
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
|
||||
err = dmu_buf_get_bp_from_dbuf(db, &bp);
|
||||
if (err) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
goto error;
|
||||
}
|
||||
|
||||
/*
|
||||
* There is no need to read if this is a hole or the data is
|
||||
* cached. This will not be considered a direct read for IO
|
||||
* accounting in the same way that an ARC hit is not counted.
|
||||
*/
|
||||
if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
|
||||
size_t aoff = offset < db->db.db_offset ?
|
||||
db->db.db_offset - offset : 0;
|
||||
size_t boff = offset > db->db.db_offset ?
|
||||
offset - db->db.db_offset : 0;
|
||||
size_t len = MIN(size - aoff, db->db.db_size - boff);
|
||||
|
||||
if (db->db_state == DB_CACHED) {
|
||||
/*
|
||||
* We need to untransformed the ARC buf data
|
||||
* before we copy it over.
|
||||
*/
|
||||
err = dmu_buf_untransform_direct(db, spa);
|
||||
ASSERT0(err);
|
||||
abd_copy_from_buf_off(data,
|
||||
(char *)db->db.db_data + boff, aoff, len);
|
||||
} else {
|
||||
abd_zero_off(data, aoff, len);
|
||||
}
|
||||
|
||||
mutex_exit(&db->db_mtx);
|
||||
continue;
|
||||
}
|
||||
|
||||
mbuf = make_abd_for_dbuf(db, data, offset, size);
|
||||
ASSERT3P(mbuf, !=, NULL);
|
||||
|
||||
/*
|
||||
* The dbuf mutex (db_mtx) must be held when creating the ZIO
|
||||
* for the read. The BP returned from
|
||||
* dmu_buf_get_bp_from_dbuf() could be from a pending block
|
||||
* clone or a yet to be synced Direct I/O write that is in the
|
||||
* dbuf's dirty record. When zio_read() is called, zio_create()
|
||||
* will make a copy of the BP. However, if zio_read() is called
|
||||
* without the mutex being held then the dirty record from the
|
||||
* dbuf could be freed in dbuf_write_done() resulting in garbage
|
||||
* being set for the zio BP.
|
||||
*/
|
||||
zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
|
||||
dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
|
||||
ZIO_FLAG_CANFAIL, &zb);
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
zfs_racct_read(spa, db->db.db_size, 1, flags);
|
||||
zio_nowait(cio);
|
||||
}
|
||||
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
|
||||
return (zio_wait(rio));
|
||||
|
||||
error:
|
||||
dmu_buf_rele_array(dbp, numbufs, FTAG);
|
||||
(void) zio_wait(rio);
|
||||
return (err);
|
||||
}
|
||||
|
||||
#ifdef _KERNEL
|
||||
int
|
||||
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
|
||||
{
|
||||
offset_t offset = zfs_uio_offset(uio);
|
||||
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
|
||||
int err;
|
||||
|
||||
ASSERT(uio->uio_extflg & UIO_DIRECT);
|
||||
ASSERT3U(page_index, <, uio->uio_dio.npages);
|
||||
|
||||
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
|
||||
offset & (PAGESIZE - 1), size);
|
||||
err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
|
||||
abd_free(data);
|
||||
|
||||
if (err == 0)
|
||||
zfs_uioskip(uio, size);
|
||||
|
||||
return (err);
|
||||
}
|
||||
|
||||
int
|
||||
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
|
||||
{
|
||||
offset_t offset = zfs_uio_offset(uio);
|
||||
offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
|
||||
int err;
|
||||
|
||||
ASSERT(uio->uio_extflg & UIO_DIRECT);
|
||||
ASSERT3U(page_index, <, uio->uio_dio.npages);
|
||||
|
||||
abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
|
||||
offset & (PAGESIZE - 1), size);
|
||||
err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
|
||||
abd_free(data);
|
||||
|
||||
if (err == 0)
|
||||
zfs_uioskip(uio, size);
|
||||
|
||||
return (err);
|
||||
}
|
||||
#endif /* _KERNEL */
|
||||
|
||||
EXPORT_SYMBOL(dmu_read_uio_direct);
|
||||
EXPORT_SYMBOL(dmu_write_uio_direct);
|
|
@ -350,6 +350,20 @@ smallblk_changed_cb(void *arg, uint64_t newval)
|
|||
os->os_zpl_special_smallblock = newval;
|
||||
}
|
||||
|
||||
static void
|
||||
direct_changed_cb(void *arg, uint64_t newval)
|
||||
{
|
||||
objset_t *os = arg;
|
||||
|
||||
/*
|
||||
* Inheritance and range checking should have been done by now.
|
||||
*/
|
||||
ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||
|
||||
newval == ZFS_DIRECT_ALWAYS);
|
||||
|
||||
os->os_direct = newval;
|
||||
}
|
||||
|
||||
static void
|
||||
logbias_changed_cb(void *arg, uint64_t newval)
|
||||
{
|
||||
|
@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
|
|||
ZFS_PROP_SPECIAL_SMALL_BLOCKS),
|
||||
smallblk_changed_cb, os);
|
||||
}
|
||||
if (err == 0) {
|
||||
err = dsl_prop_register(ds,
|
||||
zfs_prop_to_name(ZFS_PROP_DIRECT),
|
||||
direct_changed_cb, os);
|
||||
}
|
||||
}
|
||||
if (err != 0) {
|
||||
arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
|
||||
|
|
|
@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = {
|
|||
{ "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
|
||||
{ "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
|
||||
{ "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
|
||||
{ "arc_read_count", KSTAT_DATA_UINT64 },
|
||||
{ "arc_read_bytes", KSTAT_DATA_UINT64 },
|
||||
{ "arc_write_count", KSTAT_DATA_UINT64 },
|
||||
{ "arc_write_bytes", KSTAT_DATA_UINT64 },
|
||||
{ "direct_read_count", KSTAT_DATA_UINT64 },
|
||||
{ "direct_read_bytes", KSTAT_DATA_UINT64 },
|
||||
{ "direct_write_count", KSTAT_DATA_UINT64 },
|
||||
{ "direct_write_bytes", KSTAT_DATA_UINT64 },
|
||||
};
|
||||
|
||||
#define SPA_IOSTATS_ADD(stat, val) \
|
||||
|
@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type,
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
|
||||
kstat_t *ksp = shk->kstat;
|
||||
|
||||
if (ksp == NULL)
|
||||
return;
|
||||
|
||||
spa_iostats_t *iostats = ksp->ks_data;
|
||||
if (flags & DMU_DIRECTIO) {
|
||||
SPA_IOSTATS_ADD(direct_read_count, iops);
|
||||
SPA_IOSTATS_ADD(direct_read_bytes, size);
|
||||
} else {
|
||||
SPA_IOSTATS_ADD(arc_read_count, iops);
|
||||
SPA_IOSTATS_ADD(arc_read_bytes, size);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags)
|
||||
{
|
||||
spa_history_kstat_t *shk = &spa->spa_stats.iostats;
|
||||
kstat_t *ksp = shk->kstat;
|
||||
|
||||
if (ksp == NULL)
|
||||
return;
|
||||
|
||||
spa_iostats_t *iostats = ksp->ks_data;
|
||||
if (flags & DMU_DIRECTIO) {
|
||||
SPA_IOSTATS_ADD(direct_write_count, iops);
|
||||
SPA_IOSTATS_ADD(direct_write_bytes, size);
|
||||
} else {
|
||||
SPA_IOSTATS_ADD(arc_write_count, iops);
|
||||
SPA_IOSTATS_ADD(arc_write_bytes, size);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
spa_iostats_update(kstat_t *ksp, int rw)
|
||||
{
|
||||
|
|
|
@ -117,6 +117,11 @@ static unsigned int zfs_slow_io_events_per_second = 20;
|
|||
*/
|
||||
static unsigned int zfs_deadman_events_per_second = 1;
|
||||
|
||||
/*
|
||||
* Rate limit direct write IO verify failures to this many per second.
|
||||
*/
|
||||
static unsigned int zfs_dio_write_verify_events_per_second = 20;
|
||||
|
||||
/*
|
||||
* Rate limit checksum events after this many checksum errors per second.
|
||||
*/
|
||||
|
@ -153,6 +158,17 @@ int zfs_nocacheflush = 0;
|
|||
uint_t zfs_vdev_max_auto_ashift = 14;
|
||||
uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
|
||||
|
||||
/*
|
||||
* VDEV checksum verification for Direct I/O writes. This is necessary for
|
||||
* Linux, because anonymous pages can not be placed under write protection
|
||||
* during Direct I/O writes.
|
||||
*/
|
||||
#if !defined(__FreeBSD__)
|
||||
uint_t zfs_vdev_direct_write_verify = 1;
|
||||
#else
|
||||
uint_t zfs_vdev_direct_write_verify = 0;
|
||||
#endif
|
||||
|
||||
void
|
||||
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
|
||||
{
|
||||
|
@ -673,6 +689,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
|
|||
1);
|
||||
zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
|
||||
1);
|
||||
zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
|
||||
&zfs_dio_write_verify_events_per_second, 1);
|
||||
zfs_ratelimit_init(&vd->vdev_checksum_rl,
|
||||
&zfs_checksum_events_per_second, 1);
|
||||
|
||||
|
@ -1182,6 +1200,7 @@ vdev_free(vdev_t *vd)
|
|||
|
||||
zfs_ratelimit_fini(&vd->vdev_delay_rl);
|
||||
zfs_ratelimit_fini(&vd->vdev_deadman_rl);
|
||||
zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
|
||||
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
|
||||
|
||||
if (vd == spa->spa_root_vdev)
|
||||
|
@ -4475,6 +4494,7 @@ vdev_clear(spa_t *spa, vdev_t *vd)
|
|||
vd->vdev_stat.vs_read_errors = 0;
|
||||
vd->vdev_stat.vs_write_errors = 0;
|
||||
vd->vdev_stat.vs_checksum_errors = 0;
|
||||
vd->vdev_stat.vs_dio_verify_errors = 0;
|
||||
vd->vdev_stat.vs_slow_ios = 0;
|
||||
|
||||
for (int c = 0; c < vd->vdev_children; c++)
|
||||
|
@ -6503,7 +6523,14 @@ ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
|
|||
ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
|
||||
"Rate limit hung IO (deadman) events to this many per second");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
|
||||
"Rate Direct I/O write verify events to this many per second");
|
||||
|
||||
/* BEGIN CSTYLED */
|
||||
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
|
||||
"Direct I/O writes will perform for checksum verification before "
|
||||
"commiting write");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
|
||||
"Rate limit checksum events to this many checksum errors per second "
|
||||
"(do not set below ZED threshold).");
|
||||
|
|
|
@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
|
|||
/* IO delays */
|
||||
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
|
||||
|
||||
/* Direct I/O write verify errors */
|
||||
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS,
|
||||
vs->vs_dio_verify_errors);
|
||||
|
||||
/* Add extended stats nvlist to main nvlist */
|
||||
fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
|
||||
|
||||
|
|
|
@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
|
|||
DATA_TYPE_UINT64, vs->vs_checksum_errors,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
|
||||
DATA_TYPE_UINT64, vs->vs_slow_ios,
|
||||
FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
|
||||
DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
|
||||
NULL);
|
||||
}
|
||||
|
||||
|
|
|
@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768;
|
|||
void
|
||||
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
||||
znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
|
||||
zil_callback_t callback, void *callback_data)
|
||||
boolean_t o_direct, zil_callback_t callback, void *callback_data)
|
||||
{
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
|
||||
uint32_t blocksize = zp->z_blksz;
|
||||
|
@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
|
|||
return;
|
||||
}
|
||||
|
||||
if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
|
||||
if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
|
||||
write_state = WR_INDIRECT;
|
||||
else if (!spa_has_slogs(zilog->zl_spa) &&
|
||||
resid >= zfs_immediate_write_sz)
|
||||
|
|
|
@ -74,6 +74,14 @@ int zfs_bclone_enabled = 1;
|
|||
*/
|
||||
static int zfs_bclone_wait_dirty = 0;
|
||||
|
||||
/*
|
||||
* Enable Direct I/O. If this setting is 0, then all I/O requests will be
|
||||
* directed through the ARC acting as though the dataset property direct was
|
||||
* set to disabled.
|
||||
*/
|
||||
static int zfs_dio_enabled = 1;
|
||||
|
||||
|
||||
/*
|
||||
* Maximum bytes to read per chunk in zfs_read().
|
||||
*/
|
||||
|
@ -202,6 +210,77 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
|
|||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if Direct I/O has been requested (either via the O_DIRECT flag or
|
||||
* the "direct" dataset property). When inherited by the property only apply
|
||||
* the O_DIRECT flag to correctly aligned IO requests. The rationale for this
|
||||
* is it allows the property to be safely set on a dataset without forcing
|
||||
* all of the applications to be aware of the alignment restrictions. When
|
||||
* O_DIRECT is explicitly requested by an application return EINVAL if the
|
||||
* request is unaligned. In all cases, if the range for this request has
|
||||
* been mmap'ed then we will perform buffered I/O to keep the mapped region
|
||||
* synchronized with the ARC.
|
||||
*
|
||||
* It is possible that a file's pages could be mmap'ed after it is checked
|
||||
* here. If so, that is handled accordingly in zfs_write(). See comments in the
|
||||
* following area for how this is handled:
|
||||
* zfs_write() -> update_pages()
|
||||
*/
|
||||
static int
|
||||
zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
|
||||
int *ioflagp)
|
||||
{
|
||||
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
||||
objset_t *os = zfsvfs->z_os;
|
||||
int ioflag = *ioflagp;
|
||||
int error = 0;
|
||||
|
||||
if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
|
||||
zn_has_cached_data(zp, zfs_uio_offset(uio),
|
||||
zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
|
||||
/*
|
||||
* Direct I/O is disabled or the region is mmap'ed. In either
|
||||
* case the I/O request will just directed through the ARC.
|
||||
*/
|
||||
ioflag &= ~O_DIRECT;
|
||||
goto out;
|
||||
} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
|
||||
zfs_uio_page_aligned(uio) &&
|
||||
zfs_uio_aligned(uio, PAGE_SIZE)) {
|
||||
if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
|
||||
(rw == UIO_READ)) {
|
||||
ioflag |= O_DIRECT;
|
||||
}
|
||||
} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
|
||||
/*
|
||||
* Direct I/O was requested through the direct=always, but it
|
||||
* is not properly PAGE_SIZE aligned. The request will be
|
||||
* directed through the ARC.
|
||||
*/
|
||||
ioflag &= ~O_DIRECT;
|
||||
}
|
||||
|
||||
if (ioflag & O_DIRECT) {
|
||||
if (!zfs_uio_page_aligned(uio) ||
|
||||
!zfs_uio_aligned(uio, PAGE_SIZE)) {
|
||||
error = SET_ERROR(EINVAL);
|
||||
goto out;
|
||||
}
|
||||
|
||||
error = zfs_uio_get_dio_pages_alloc(uio, rw);
|
||||
if (error) {
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
|
||||
ASSERT0(error);
|
||||
|
||||
out:
|
||||
*ioflagp = ioflag;
|
||||
return (error);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read bytes from specified file into supplied buffer.
|
||||
*
|
||||
|
@ -286,24 +365,58 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
error = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ASSERT(zfs_uio_offset(uio) < zp->z_size);
|
||||
|
||||
/*
|
||||
* Setting up Direct I/O if requested.
|
||||
*/
|
||||
error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
|
||||
if (error) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
#if defined(__linux__)
|
||||
ssize_t start_offset = zfs_uio_offset(uio);
|
||||
#endif
|
||||
ssize_t chunk_size = zfs_vnops_read_chunk_size;
|
||||
ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
|
||||
ssize_t start_resid = n;
|
||||
ssize_t dio_remaining_resid = 0;
|
||||
|
||||
if (uio->uio_extflg & UIO_DIRECT) {
|
||||
/*
|
||||
* All pages for an O_DIRECT request have already been mapped
|
||||
* so there's no compelling reason to handle this uio in
|
||||
* smaller chunks.
|
||||
*/
|
||||
chunk_size = DMU_MAX_ACCESS;
|
||||
|
||||
/*
|
||||
* In the event that the O_DIRECT request is reading the entire
|
||||
* file, it is possible that the file's length is not page-size
|
||||
* aligned. However, lower layers expect that the Direct I/O
|
||||
* request is page-aligned. In this case, as much of the file
|
||||
* that can be read using Direct I/O happens and the remaining
|
||||
* amount will be read through the ARC.
|
||||
*
|
||||
* This is still consistent with the semantics of Direct I/O in
|
||||
* ZFS as at a minimum the I/O request must be page-aligned.
|
||||
*/
|
||||
dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
|
||||
if (dio_remaining_resid != 0)
|
||||
n -= dio_remaining_resid;
|
||||
}
|
||||
|
||||
while (n > 0) {
|
||||
ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
|
||||
P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
|
||||
ssize_t nbytes = MIN(n, chunk_size -
|
||||
P2PHASE(zfs_uio_offset(uio), chunk_size));
|
||||
#ifdef UIO_NOCOPY
|
||||
if (zfs_uio_segflg(uio) == UIO_NOCOPY)
|
||||
error = mappedread_sf(zp, nbytes, uio);
|
||||
else
|
||||
#endif
|
||||
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
|
||||
zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) {
|
||||
zfs_uio_offset(uio) + nbytes - 1)) {
|
||||
error = mappedread(zp, nbytes, uio);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
||||
|
@ -332,12 +445,39 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
n -= nbytes;
|
||||
}
|
||||
|
||||
int64_t nread = start_resid - n;
|
||||
int64_t nread = start_resid;
|
||||
if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
|
||||
dio_remaining_resid != 0) {
|
||||
/*
|
||||
* Temporarily remove the UIO_DIRECT flag from the UIO so the
|
||||
* remainder of the file can be read using the ARC.
|
||||
*/
|
||||
uio->uio_extflg &= ~UIO_DIRECT;
|
||||
|
||||
if (zn_has_cached_data(zp, zfs_uio_offset(uio),
|
||||
zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
|
||||
error = mappedread(zp, dio_remaining_resid, uio);
|
||||
} else {
|
||||
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
|
||||
dio_remaining_resid);
|
||||
}
|
||||
uio->uio_extflg |= UIO_DIRECT;
|
||||
|
||||
if (error != 0)
|
||||
n -= dio_remaining_resid;
|
||||
}
|
||||
nread -= n;
|
||||
|
||||
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
|
||||
task_io_account_read(nread);
|
||||
out:
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
/*
|
||||
* Cleanup for Direct I/O if requested.
|
||||
*/
|
||||
if (uio->uio_extflg & UIO_DIRECT)
|
||||
zfs_uio_free_dio_pages(uio, UIO_READ);
|
||||
|
||||
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (error);
|
||||
|
@ -422,6 +562,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
int error = 0, error1;
|
||||
ssize_t start_resid = zfs_uio_resid(uio);
|
||||
uint64_t clear_setid_bits_txg = 0;
|
||||
boolean_t o_direct_defer = B_FALSE;
|
||||
|
||||
/*
|
||||
* Fasttrack empty write
|
||||
|
@ -474,6 +615,15 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
return (SET_ERROR(EINVAL));
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting up Direct I/O if requested.
|
||||
*/
|
||||
error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
|
||||
if (error) {
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (SET_ERROR(error));
|
||||
}
|
||||
|
||||
/*
|
||||
* Pre-fault the pages to ensure slow (eg NFS) pages
|
||||
* don't hold up txg.
|
||||
|
@ -484,6 +634,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
return (SET_ERROR(EFAULT));
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* If in append mode, set the io offset pointer to eof.
|
||||
*/
|
||||
|
@ -504,6 +655,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
woff = zp->z_size;
|
||||
}
|
||||
zfs_uio_setoffset(uio, woff);
|
||||
/*
|
||||
* We need to update the starting offset as well because it is
|
||||
* set previously in the ZPL (Linux) and VNOPS (FreeBSD)
|
||||
* layers.
|
||||
*/
|
||||
zfs_uio_setsoffset(uio, woff);
|
||||
} else {
|
||||
/*
|
||||
* Note that if the file block size will change as a result of
|
||||
|
@ -513,6 +670,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
|
||||
}
|
||||
|
||||
|
||||
if (zn_rlimit_fsize_uio(zp, uio)) {
|
||||
zfs_rangelock_exit(lr);
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
|
@ -539,6 +697,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
|
||||
const uint64_t projid = zp->z_projid;
|
||||
|
||||
/*
|
||||
* In the event we are increasing the file block size
|
||||
* (lr_length == UINT64_MAX), we will direct the write to the ARC.
|
||||
* Because zfs_grow_blocksize() will read from the ARC in order to
|
||||
* grow the dbuf, we avoid doing Direct I/O here as that would cause
|
||||
* data written to disk to be overwritten by data in the ARC during
|
||||
* the sync phase. Besides writing data twice to disk, we also
|
||||
* want to avoid consistency concerns between data in the ARC and
|
||||
* on disk while growing the file's blocksize.
|
||||
*
|
||||
* We will only temporarily remove Direct I/O and put it back after
|
||||
* we have grown the blocksize. We do this in the event a request
|
||||
* is larger than max_blksz, so further requests to
|
||||
* dmu_write_uio_dbuf() will still issue the requests using Direct
|
||||
* IO.
|
||||
*
|
||||
* As an example:
|
||||
* The first block to file is being written as a 4k request with
|
||||
* a recordsize of 1K. The first 1K issued in the loop below will go
|
||||
* through the ARC; however, the following 3 1K requests will
|
||||
* use Direct I/O.
|
||||
*/
|
||||
if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
|
||||
uio->uio_extflg &= ~UIO_DIRECT;
|
||||
o_direct_defer = B_TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the file in reasonable size chunks. Each chunk is written
|
||||
* in a separate transaction; this keeps the intent log records small
|
||||
|
@ -580,6 +765,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
ssize_t nbytes = n;
|
||||
if (n >= blksz && woff >= zp->z_size &&
|
||||
P2PHASE(woff, blksz) == 0 &&
|
||||
!(uio->uio_extflg & UIO_DIRECT) &&
|
||||
(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
|
||||
/*
|
||||
* This write covers a full block. "Borrow" a buffer
|
||||
|
@ -705,9 +891,30 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
zfs_uioskip(uio, nbytes);
|
||||
tx_bytes = nbytes;
|
||||
}
|
||||
/*
|
||||
* There is a window where a file's pages can be mmap'ed after
|
||||
* zfs_setup_direct() is called. This is due to the fact that
|
||||
* the rangelock in this function is acquired after calling
|
||||
* zfs_setup_direct(). This is done so that
|
||||
* zfs_uio_prefaultpages() does not attempt to fault in pages
|
||||
* on Linux for Direct I/O requests. This is not necessary as
|
||||
* the pages are pinned in memory and can not be faulted out.
|
||||
* Ideally, the rangelock would be held before calling
|
||||
* zfs_setup_direct() and zfs_uio_prefaultpages(); however,
|
||||
* this can lead to a deadlock as zfs_getpage() also acquires
|
||||
* the rangelock as a RL_WRITER and prefaulting the pages can
|
||||
* lead to zfs_getpage() being called.
|
||||
*
|
||||
* In the case of the pages being mapped after
|
||||
* zfs_setup_direct() is called, the call to update_pages()
|
||||
* will still be made to make sure there is consistency between
|
||||
* the ARC and the Linux page cache. This is an unfortunate
|
||||
* situation as the data will be read back into the ARC after
|
||||
* the Direct I/O write has completed, but this is the penalty
|
||||
* for writing to a mmap'ed region of a file using Direct I/O.
|
||||
*/
|
||||
if (tx_bytes &&
|
||||
zn_has_cached_data(zp, woff, woff + tx_bytes - 1) &&
|
||||
!(ioflag & O_DIRECT)) {
|
||||
zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
|
||||
update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
|
||||
}
|
||||
|
||||
|
@ -756,10 +963,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
* the TX_WRITE records logged here.
|
||||
*/
|
||||
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
|
||||
NULL, NULL);
|
||||
uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
|
||||
NULL);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
/*
|
||||
* Direct I/O was deferred in order to grow the first block.
|
||||
* At this point it can be re-enabled for subsequent writes.
|
||||
*/
|
||||
if (o_direct_defer) {
|
||||
ASSERT(ioflag & O_DIRECT);
|
||||
uio->uio_extflg |= UIO_DIRECT;
|
||||
o_direct_defer = B_FALSE;
|
||||
}
|
||||
|
||||
if (error != 0)
|
||||
break;
|
||||
ASSERT3S(tx_bytes, ==, nbytes);
|
||||
|
@ -767,9 +985,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
pfbytes -= nbytes;
|
||||
}
|
||||
|
||||
if (o_direct_defer) {
|
||||
ASSERT(ioflag & O_DIRECT);
|
||||
uio->uio_extflg |= UIO_DIRECT;
|
||||
o_direct_defer = B_FALSE;
|
||||
}
|
||||
|
||||
zfs_znode_update_vfs(zp);
|
||||
zfs_rangelock_exit(lr);
|
||||
|
||||
/*
|
||||
* Cleanup for Direct I/O if requested.
|
||||
*/
|
||||
if (uio->uio_extflg & UIO_DIRECT)
|
||||
zfs_uio_free_dio_pages(uio, UIO_WRITE);
|
||||
|
||||
/*
|
||||
* If we're in replay mode, or we made no progress, or the
|
||||
* uio data is inaccessible return an error. Otherwise, it's
|
||||
|
@ -784,9 +1014,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
if (commit)
|
||||
zil_commit(zilog, zp->z_id);
|
||||
|
||||
const int64_t nwritten = start_resid - zfs_uio_resid(uio);
|
||||
int64_t nwritten = start_resid - zfs_uio_resid(uio);
|
||||
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
|
||||
task_io_account_write(nwritten);
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (0);
|
||||
|
@ -846,7 +1075,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
|||
uint64_t object = lr->lr_foid;
|
||||
uint64_t offset = lr->lr_offset;
|
||||
uint64_t size = lr->lr_length;
|
||||
dmu_buf_t *db;
|
||||
zgd_t *zgd;
|
||||
int error = 0;
|
||||
uint64_t zp_gen;
|
||||
|
@ -890,8 +1118,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
|||
* we don't have to write the data twice.
|
||||
*/
|
||||
if (buf != NULL) { /* immediate write */
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
|
||||
size, RL_READER);
|
||||
/* test for truncation needs to be done while range locked */
|
||||
if (offset >= zp->z_size) {
|
||||
error = SET_ERROR(ENOENT);
|
||||
|
@ -911,10 +1139,11 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
|||
for (;;) {
|
||||
uint64_t blkoff;
|
||||
size = zp->z_blksz;
|
||||
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
||||
blkoff = ISP2(size) ? P2PHASE(offset, size) :
|
||||
offset;
|
||||
offset -= blkoff;
|
||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||
offset, size, RL_READER);
|
||||
zgd->zgd_lr = zfs_rangelock_enter(
|
||||
&zp->z_rangelock, offset, size, RL_READER);
|
||||
if (zp->z_blksz == size)
|
||||
break;
|
||||
offset += blkoff;
|
||||
|
@ -929,18 +1158,44 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
|||
zil_fault_io = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
dmu_buf_t *dbp;
|
||||
if (error == 0)
|
||||
error = dmu_buf_hold_noread(os, object, offset, zgd,
|
||||
&db);
|
||||
&dbp);
|
||||
|
||||
if (error == 0) {
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
zgd->zgd_db = dbp;
|
||||
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
|
||||
boolean_t direct_write = B_FALSE;
|
||||
mutex_enter(&db->db_mtx);
|
||||
dbuf_dirty_record_t *dr =
|
||||
dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
|
||||
if (dr != NULL && dr->dt.dl.dr_diowrite)
|
||||
direct_write = B_TRUE;
|
||||
mutex_exit(&db->db_mtx);
|
||||
|
||||
zgd->zgd_db = db;
|
||||
/*
|
||||
* All Direct I/O writes will have already completed and
|
||||
* the block pointer can be immediately stored in the
|
||||
* log record.
|
||||
*/
|
||||
if (direct_write) {
|
||||
/*
|
||||
* A Direct I/O write always covers an entire
|
||||
* block.
|
||||
*/
|
||||
ASSERT3U(dbp->db_size, ==, zp->z_blksz);
|
||||
lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
|
||||
zfs_get_done(zgd, 0);
|
||||
return (0);
|
||||
}
|
||||
|
||||
blkptr_t *bp = &lr->lr_blkptr;
|
||||
zgd->zgd_bp = bp;
|
||||
|
||||
ASSERT(db->db_offset == offset);
|
||||
ASSERT(db->db_size == size);
|
||||
ASSERT3U(dbp->db_offset, ==, offset);
|
||||
ASSERT3U(dbp->db_size, ==, size);
|
||||
|
||||
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
||||
zfs_get_done, zgd);
|
||||
|
@ -975,7 +1230,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
|||
return (error);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
zfs_get_done(zgd_t *zgd, int error)
|
||||
{
|
||||
|
@ -1559,3 +1813,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
|
|||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
|
||||
"Wait for dirty blocks when cloning");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
|
||||
"Enable Direct I/O");
|
||||
|
|
106
module/zfs/zio.c
106
module/zfs/zio.c
|
@ -803,6 +803,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
|
|||
pio->io_reexecute |= zio->io_reexecute;
|
||||
ASSERT3U(*countp, >, 0);
|
||||
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
|
||||
ASSERT3U(*errorp, ==, EIO);
|
||||
ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
|
||||
pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
|
||||
}
|
||||
|
||||
(*countp)--;
|
||||
|
||||
if (*countp == 0 && pio->io_stall == countp) {
|
||||
|
@ -1282,20 +1288,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
|
|||
zio_flag_t flags, const zbookmark_phys_t *zb)
|
||||
{
|
||||
zio_t *zio;
|
||||
enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ?
|
||||
ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ?
|
||||
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;
|
||||
|
||||
ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
|
||||
zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
|
||||
zp->zp_compress >= ZIO_COMPRESS_OFF &&
|
||||
zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
|
||||
DMU_OT_IS_VALID(zp->zp_type) &&
|
||||
zp->zp_level < 32 &&
|
||||
zp->zp_copies > 0 &&
|
||||
zp->zp_copies <= spa_max_replication(spa));
|
||||
|
||||
zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
|
||||
ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
|
||||
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
|
||||
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
|
||||
ZIO_STAGE_OPEN, pipeline);
|
||||
|
||||
zio->io_ready = ready;
|
||||
zio->io_children_ready = children_ready;
|
||||
|
@ -1572,6 +1572,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
|
|||
*/
|
||||
pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
|
||||
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
|
||||
} else if (type == ZIO_TYPE_WRITE &&
|
||||
pio->io_prop.zp_direct_write == B_TRUE) {
|
||||
/*
|
||||
* By default we only will verify checksums for Direct I/O
|
||||
* writes for Linux. FreeBSD is able to place user pages under
|
||||
* write protection before issuing them to the ZIO pipeline.
|
||||
*
|
||||
* Checksum validation errors will only be reported through
|
||||
* the top-level VDEV, which is set by this child ZIO.
|
||||
*/
|
||||
ASSERT3P(bp, !=, NULL);
|
||||
ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
|
||||
pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
|
||||
}
|
||||
|
||||
if (vd->vdev_ops->vdev_op_leaf) {
|
||||
|
@ -3104,6 +3117,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
|
|||
zp.zp_nopwrite = B_FALSE;
|
||||
zp.zp_encrypt = gio->io_prop.zp_encrypt;
|
||||
zp.zp_byteorder = gio->io_prop.zp_byteorder;
|
||||
zp.zp_direct_write = B_FALSE;
|
||||
memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
|
||||
memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
|
||||
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
|
||||
|
@ -4509,6 +4523,19 @@ zio_vdev_io_assess(zio_t *zio)
|
|||
zio->io_vsd = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* If a Direct I/O write checksum verify error has occurred then this
|
||||
* I/O should not attempt to be issued again. Instead the EIO will
|
||||
* be returned.
|
||||
*/
|
||||
if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
|
||||
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
|
||||
ASSERT3U(zio->io_error, ==, EIO);
|
||||
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
|
||||
return (zio);
|
||||
}
|
||||
|
||||
|
||||
if (zio_injection_enabled && zio->io_error == 0)
|
||||
zio->io_error = zio_handle_fault_injection(zio, EIO);
|
||||
|
||||
|
@ -4822,6 +4849,49 @@ zio_checksum_verify(zio_t *zio)
|
|||
return (zio);
|
||||
}
|
||||
|
||||
/*
 * Checksum verification step for the vdev child ZIO of a Direct I/O write.
 *
 * Verification is skipped when zfs_vdev_direct_write_verify is 0 or when
 * the ZIO already carries an error.  Otherwise any error returned by
 * zio_checksum_error() is stored in io_error.  For ECKSUM specifically the
 * vdev's vs_dio_verify_errors counter is incremented, the error is turned
 * into EIO, ZIO_FLAG_DIO_CHKSUM_ERR is set, ZIO_FLAG_DONT_PROPAGATE is
 * cleared, and an FM_EREPORT_ZFS_DIO_VERIFY ereport is posted.
 *
 * Always returns the ZIO that was passed in.
 */
static zio_t *
zio_dio_checksum_verify(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	int error;

	ASSERT3P(zio->io_vd, !=, NULL);
	ASSERT3P(zio->io_bp, !=, NULL);
	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
	ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);

	/* Nothing to verify: disabled by tunable, or the I/O already failed. */
	if (zfs_vdev_direct_write_verify == 0 || zio->io_error != 0)
		return (zio);

	error = zio_checksum_error(zio, NULL);
	if (error == 0)
		return (zio);

	zio->io_error = error;
	if (error != ECKSUM)
		return (zio);

	/* Count the verify failure against the vdev. */
	mutex_enter(&zio->io_vd->vdev_stat_lock);
	zio->io_vd->vdev_stat.vs_dio_verify_errors++;
	mutex_exit(&zio->io_vd->vdev_stat_lock);

	zio->io_error = SET_ERROR(EIO);
	zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;

	/*
	 * The EIO error must be propagated up to the logical parent ZIO in
	 * zio_notify_parent() so it can be returned to dmu_write_abd().
	 */
	zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE;

	(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY,
	    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);

	return (zio);
}
|
||||
|
||||
|
||||
/*
|
||||
* Called by RAID-Z to ensure we don't compute the checksum twice.
|
||||
*/
|
||||
|
@ -5152,7 +5222,8 @@ zio_done(zio_t *zio)
|
|||
* device is currently unavailable.
|
||||
*/
|
||||
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
|
||||
!vdev_is_dead(zio->io_vd)) {
|
||||
!vdev_is_dead(zio->io_vd) &&
|
||||
!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
|
||||
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
|
||||
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
|
||||
if (ret != EALREADY) {
|
||||
|
@ -5167,6 +5238,7 @@ zio_done(zio_t *zio)
|
|||
|
||||
if ((zio->io_error == EIO || !(zio->io_flags &
|
||||
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
|
||||
!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
|
||||
zio == zio->io_logical) {
|
||||
/*
|
||||
* For logical I/O requests, tell the SPA to log the
|
||||
|
@ -5188,7 +5260,8 @@ zio_done(zio_t *zio)
|
|||
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
|
||||
|
||||
if (IO_IS_ALLOCATING(zio) &&
|
||||
!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
|
||||
!(zio->io_flags & ZIO_FLAG_CANFAIL) &&
|
||||
!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
|
||||
if (zio->io_error != ENOSPC)
|
||||
zio->io_reexecute |= ZIO_REEXECUTE_NOW;
|
||||
else
|
||||
|
@ -5238,6 +5311,14 @@ zio_done(zio_t *zio)
|
|||
zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
|
||||
|
||||
if (zio->io_reexecute) {
|
||||
/*
|
||||
* A Direct I/O write that has a checksum verify error should
|
||||
* not attempt to reexecute. Instead, EAGAIN should just be
|
||||
* propagated back up so the write can be attempted again
|
||||
* through the ARC.
|
||||
*/
|
||||
ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
|
||||
|
||||
/*
|
||||
* This is a logical I/O that wants to reexecute.
|
||||
*
|
||||
|
@ -5398,6 +5479,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
|
|||
zio_vdev_io_done,
|
||||
zio_vdev_io_assess,
|
||||
zio_checksum_verify,
|
||||
zio_dio_checksum_verify,
|
||||
zio_done
|
||||
};
|
||||
|
||||
|
|
|
@ -693,6 +693,14 @@ tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
|
|||
'zfs_unallow_007_neg', 'zfs_unallow_008_neg']
|
||||
tags = ['functional', 'delegate']
|
||||
|
||||
[tests/functional/direct]
|
||||
tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
|
||||
'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block',
|
||||
'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites',
|
||||
'dio_property', 'dio_random', 'dio_recordsize', 'dio_unaligned_block',
|
||||
'dio_unaligned_filesize']
|
||||
tags = ['functional', 'direct']
|
||||
|
||||
[tests/functional/exec]
|
||||
tests = ['exec_001_pos', 'exec_002_neg']
|
||||
tags = ['functional', 'exec']
|
||||
|
@ -735,7 +743,7 @@ pre =
|
|||
tags = ['functional', 'inheritance']
|
||||
|
||||
[tests/functional/io]
|
||||
tests = ['sync', 'psync', 'posixaio', 'mmap']
|
||||
tests = ['mmap', 'posixaio', 'psync', 'sync']
|
||||
tags = ['functional', 'io']
|
||||
|
||||
[tests/functional/inuse]
|
||||
|
|
|
@ -30,3 +30,7 @@ tags = ['functional', 'cli_root', 'zfs_jail']
|
|||
tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive',
|
||||
'pam_short_password']
|
||||
tags = ['functional', 'pam']
|
||||
|
||||
[tests/functional/direct:FreeBSD]
|
||||
tests = ['dio_write_stable_pages']
|
||||
tags = ['functional', 'direct']
|
||||
|
|
|
@ -102,6 +102,10 @@ tags = ['functional', 'compression']
|
|||
tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos']
|
||||
tags = ['functional', 'devices']
|
||||
|
||||
[tests/functional/direct:Linux]
|
||||
tests = ['dio_write_verify']
|
||||
tags = ['functional', 'direct']
|
||||
|
||||
[tests/functional/events:Linux]
|
||||
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
|
||||
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
/getversion
|
||||
/largest_file
|
||||
/libzfs_input_check
|
||||
/manipulate_user_buffer
|
||||
/mkbusy
|
||||
/mkfile
|
||||
/mkfiles
|
||||
|
|
|
@ -60,6 +60,8 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/libzfs_input_check
|
|||
libzfs_core.la \
|
||||
libnvpair.la
|
||||
|
||||
scripts_zfs_tests_bin_PROGRAMS += %D%/manipulate_user_buffer
|
||||
%C%_manipulate_user_buffer_LDADD = -lpthread
|
||||
|
||||
scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree
|
||||
%C%_mkfile_LDADD = $(LTLIBINTL)
|
||||
|
|
|
@ -0,0 +1,272 @@
|
|||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or https://opensource.org/licenses/CDDL-1.0.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2022 by Triad National Security, LLC.
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <pthread.h>
|
||||
#include <assert.h>
|
||||
|
||||
/*
 * Smaller of two values.  The whole expansion is parenthesised so the macro
 * can be used inside a larger expression (e.g. 2 * MIN(a, b)).  Each
 * argument may be evaluated twice, so do not pass expressions with side
 * effects.
 */
#ifndef MIN
#define	MIN(a, b)	(((a) < (b)) ? (a) : (b))
#endif
|
||||
|
||||
/* Command-line settings; parse_options() overwrites these defaults. */
static char *outputfile = NULL;		/* -o: path of the file to write */
static int blocksize = 131072;		/* -b: bytes per pwrite(), 128K */
static int wr_err_expected = 0;		/* -e: pwrite() may fail with EIO */
static int numblocks = 100;		/* -n: number of blocks to write */
static char *execname = NULL;		/* argv[0], used in messages */
static int print_usage = 0;		/* -h: print usage and exit */
static int randompattern = 0;		/* -p: fill buffer with rand() data */

/* State shared by main() and both worker threads. */
static int ofd;				/* descriptor of outputfile */
static char *buf = NULL;		/* the buffer written and scribbled on */

/* Argument block handed to both threads. */
typedef struct {
	/* Set to 1 by the writer once every block has been written. */
	int entire_file_written;
} pthread_args_t;
|
||||
|
||||
static void
|
||||
usage(void)
|
||||
{
|
||||
(void) fprintf(stderr,
|
||||
"usage %s -o outputfile [-b blocksize] [-e wr_error_expected]\n"
|
||||
" [-n numblocks] [-p randpattern] [-h help]\n"
|
||||
"\n"
|
||||
"Testing whether checksum verify works correctly for O_DIRECT.\n"
|
||||
"when manipulating the contents of a userspace buffer.\n"
|
||||
"\n"
|
||||
" outputfile: File to write to.\n"
|
||||
" blocksize: Size of each block to write (must be at \n"
|
||||
" least >= 512).\n"
|
||||
" wr_err_expected: Whether pwrite() is expected to return EIO\n"
|
||||
" while manipulating the contents of the\n"
|
||||
" buffer.\n"
|
||||
" numblocks: Total number of blocksized blocks to\n"
|
||||
" write.\n"
|
||||
" randpattern: Fill data buffer with random data. Default\n"
|
||||
" behavior is to fill the buffer with the \n"
|
||||
" known data pattern (0xdeadbeef).\n"
|
||||
" help: Print usage information and exit.\n"
|
||||
"\n"
|
||||
" Required parameters:\n"
|
||||
" outputfile\n"
|
||||
"\n"
|
||||
" Default Values:\n"
|
||||
" blocksize -> 131072\n"
|
||||
" wr_err_expexted -> false\n"
|
||||
" numblocks -> 100\n"
|
||||
" randpattern -> false\n",
|
||||
execname);
|
||||
(void) exit(1);
|
||||
}
|
||||
|
||||
static void
|
||||
parse_options(int argc, char *argv[])
|
||||
{
|
||||
int c;
|
||||
int errflag = 0;
|
||||
extern char *optarg;
|
||||
extern int optind, optopt;
|
||||
execname = argv[0];
|
||||
|
||||
while ((c = getopt(argc, argv, "b:ehn:o:p")) != -1) {
|
||||
switch (c) {
|
||||
case 'b':
|
||||
blocksize = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'e':
|
||||
wr_err_expected = 1;
|
||||
break;
|
||||
|
||||
case 'h':
|
||||
print_usage = 1;
|
||||
break;
|
||||
|
||||
case 'n':
|
||||
numblocks = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'o':
|
||||
outputfile = optarg;
|
||||
break;
|
||||
|
||||
case 'p':
|
||||
randompattern = 1;
|
||||
break;
|
||||
|
||||
case ':':
|
||||
(void) fprintf(stderr,
|
||||
"Option -%c requires an opertand\n",
|
||||
optopt);
|
||||
errflag++;
|
||||
break;
|
||||
case '?':
|
||||
default:
|
||||
(void) fprintf(stderr,
|
||||
"Unrecognized option: -%c\n", optopt);
|
||||
errflag++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (errflag || print_usage == 1)
|
||||
(void) usage();
|
||||
|
||||
if (blocksize < 512 || outputfile == NULL || numblocks <= 0) {
|
||||
(void) fprintf(stderr,
|
||||
"Required paramater(s) missing or invalid.\n");
|
||||
(void) usage();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Write blocksize * numblocks to the file using O_DIRECT.
|
||||
*/
|
||||
static void *
|
||||
write_thread(void *arg)
|
||||
{
|
||||
size_t offset = 0;
|
||||
int total_data = blocksize * numblocks;
|
||||
int left = total_data;
|
||||
ssize_t wrote = 0;
|
||||
pthread_args_t *args = (pthread_args_t *)arg;
|
||||
|
||||
while (!args->entire_file_written) {
|
||||
wrote = pwrite(ofd, buf, blocksize, offset);
|
||||
if (wrote != blocksize) {
|
||||
if (wr_err_expected)
|
||||
assert(errno == EIO);
|
||||
else
|
||||
exit(2);
|
||||
}
|
||||
|
||||
offset = ((offset + blocksize) % total_data);
|
||||
left -= blocksize;
|
||||
|
||||
if (left == 0)
|
||||
args->entire_file_written = 1;
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the buffers contents with random data.
|
||||
*/
|
||||
static void *
|
||||
manipulate_buf_thread(void *arg)
|
||||
{
|
||||
size_t rand_offset;
|
||||
char rand_char;
|
||||
pthread_args_t *args = (pthread_args_t *)arg;
|
||||
|
||||
while (!args->entire_file_written) {
|
||||
rand_offset = (rand() % blocksize);
|
||||
rand_char = (rand() % (126 - 33) + 33);
|
||||
buf[rand_offset] = rand_char;
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
const char *datapattern = "0xdeadbeef";
|
||||
int ofd_flags = O_WRONLY | O_CREAT | O_DIRECT;
|
||||
mode_t mode = S_IRUSR | S_IWUSR;
|
||||
pthread_t write_thr;
|
||||
pthread_t manipul_thr;
|
||||
int left = blocksize;
|
||||
int offset = 0;
|
||||
int rc;
|
||||
pthread_args_t args = { 0 };
|
||||
|
||||
parse_options(argc, argv);
|
||||
|
||||
ofd = open(outputfile, ofd_flags, mode);
|
||||
if (ofd == -1) {
|
||||
(void) fprintf(stderr, "%s, %s\n", execname, outputfile);
|
||||
perror("open");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
int err = posix_memalign((void **)&buf, sysconf(_SC_PAGE_SIZE),
|
||||
blocksize);
|
||||
if (err != 0) {
|
||||
(void) fprintf(stderr,
|
||||
"%s: %s\n", execname, strerror(err));
|
||||
exit(2);
|
||||
}
|
||||
|
||||
if (!randompattern) {
|
||||
/* Putting known data pattern in buffer */
|
||||
while (left) {
|
||||
size_t amt = MIN(strlen(datapattern), left);
|
||||
memcpy(&buf[offset], datapattern, amt);
|
||||
offset += amt;
|
||||
left -= amt;
|
||||
}
|
||||
} else {
|
||||
/* Putting random data in buffer */
|
||||
for (int i = 0; i < blocksize; i++)
|
||||
buf[i] = rand();
|
||||
}
|
||||
|
||||
/*
|
||||
* Writing using O_DIRECT while manipulating the buffer conntents until
|
||||
* the entire file is written.
|
||||
*/
|
||||
if ((rc = pthread_create(&manipul_thr, NULL, manipulate_buf_thread,
|
||||
&args))) {
|
||||
fprintf(stderr, "error: pthreads_create, manipul_thr, "
|
||||
"rc: %d\n", rc);
|
||||
exit(2);
|
||||
}
|
||||
|
||||
if ((rc = pthread_create(&write_thr, NULL, write_thread, &args))) {
|
||||
fprintf(stderr, "error: pthreads_create, write_thr, "
|
||||
"rc: %d\n", rc);
|
||||
exit(2);
|
||||
}
|
||||
|
||||
pthread_join(write_thr, NULL);
|
||||
pthread_join(manipul_thr, NULL);
|
||||
|
||||
assert(args.entire_file_written == 1);
|
||||
|
||||
(void) close(ofd);
|
||||
|
||||
free(buf);
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -21,12 +21,19 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static int alignment = 0;
|
||||
static int bsize = 0;
|
||||
static int count = 0;
|
||||
static char *ifile = NULL;
|
||||
static char *ofile = NULL;
|
||||
static off_t stride = 0;
|
||||
static off_t stride = 1;
|
||||
static off_t seek = 0;
|
||||
static int seekbytes = 0;
|
||||
static int if_o_direct = 0;
|
||||
static int of_o_direct = 0;
|
||||
static int skip = 0;
|
||||
static int skipbytes = 0;
|
||||
static int entire_file = 0;
|
||||
static const char *execname = "stride_dd";
|
||||
|
||||
static void usage(void);
|
||||
|
@ -36,8 +43,10 @@ static void
|
|||
usage(void)
|
||||
{
|
||||
(void) fprintf(stderr,
|
||||
"usage: %s -i inputfile -o outputfile -b blocksize -c count \n"
|
||||
" -s stride [ -k seekblocks]\n"
|
||||
"usage: %s -i inputfile -o outputfile -b blocksize [-c count]\n"
|
||||
" [-s stride] [-k seekblocks] [-K seekbytes]\n"
|
||||
" [-a alignment] [-d if_o_direct] [-D of_o_direct]\n"
|
||||
" [-p skipblocks] [-P skipbytes] [-e entire_file]\n"
|
||||
"\n"
|
||||
"Simplified version of dd that supports the stride option.\n"
|
||||
"A stride of n means that for each block written, n - 1 blocks\n"
|
||||
|
@ -48,13 +57,44 @@ usage(void)
|
|||
" inputfile: File to read from\n"
|
||||
" outputfile: File to write to\n"
|
||||
" blocksize: Size of each block to read/write\n"
|
||||
" count: Number of blocks to read/write\n"
|
||||
" stride: Read/write a block then skip (stride - 1) blocks\n"
|
||||
" seekblocks: Number of blocks to skip at start of output\n",
|
||||
" count: Number of blocks to read/write (Required"
|
||||
" unless -e is used)\n"
|
||||
" stride: Read/write a block then skip (stride - 1) blocks"
|
||||
"\n"
|
||||
" seekblocks: Number of blocks to skip at start of output\n"
|
||||
" seekbytes: Treat seekblocks as byte count\n"
|
||||
" alignment: Alignment passed to posix_memalign() (default"
|
||||
" PAGE_SIZE)\n"
|
||||
" if_o_direct: Use O_DIRECT with inputfile (default no O_DIRECT)"
|
||||
"\n"
|
||||
" of_o_direct: Use O_DIRECT with outputfile (default no "
|
||||
" O_DIRECT)\n"
|
||||
" skipblocks: Number of blocks to skip at start of input "
|
||||
" (default 0)\n"
|
||||
" skipbytes: Treat skipblocks as byte count\n"
|
||||
" entire_file: When used the entire inputfile will be read and"
|
||||
" count will be ignored\n",
|
||||
execname);
|
||||
(void) exit(1);
|
||||
}
|
||||
|
||||
/*
 * posix_memalign() only allows for alignments which are positive, powers of
 * two and a multiple of sizeof (void *).
 *
 * Returns 1 (after printing a message to stderr) when the given alignment
 * violates any of those rules, 0 when it is acceptable.  Zero is rejected
 * explicitly: it passes the power-of-two bit test but is not positive.
 */
static int
invalid_alignment(int alignment)
{
	if ((alignment <= 0) || (alignment & (alignment - 1)) ||
	    ((alignment % sizeof (void *)))) {
		(void) fprintf(stderr,
		    "Alignment must be a positive, power of two, and multiple "
		    "of sizeof (void *).\n");
		return (1);
	}
	return (0);
}
|
||||
|
||||
static void
|
||||
parse_options(int argc, char *argv[])
|
||||
{
|
||||
|
@ -62,12 +102,17 @@ parse_options(int argc, char *argv[])
|
|||
int errflag = 0;
|
||||
|
||||
execname = argv[0];
|
||||
alignment = sysconf(_SC_PAGE_SIZE);
|
||||
|
||||
extern char *optarg;
|
||||
extern int optind, optopt;
|
||||
|
||||
while ((c = getopt(argc, argv, ":b:c:i:o:s:k:")) != -1) {
|
||||
while ((c = getopt(argc, argv, "a:b:c:deDi:o:s:k:Kp:P")) != -1) {
|
||||
switch (c) {
|
||||
case 'a':
|
||||
alignment = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'b':
|
||||
bsize = atoi(optarg);
|
||||
break;
|
||||
|
@ -76,6 +121,18 @@ parse_options(int argc, char *argv[])
|
|||
count = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'd':
|
||||
if_o_direct = 1;
|
||||
break;
|
||||
|
||||
case 'e':
|
||||
entire_file = 1;
|
||||
break;
|
||||
|
||||
case 'D':
|
||||
of_o_direct = 1;
|
||||
break;
|
||||
|
||||
case 'i':
|
||||
ifile = optarg;
|
||||
break;
|
||||
|
@ -92,6 +149,18 @@ parse_options(int argc, char *argv[])
|
|||
seek = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'K':
|
||||
seekbytes = 1;
|
||||
break;
|
||||
|
||||
case 'p':
|
||||
skip = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 'P':
|
||||
skipbytes = 1;
|
||||
break;
|
||||
|
||||
case ':':
|
||||
(void) fprintf(stderr,
|
||||
"Option -%c requires an operand\n", optopt);
|
||||
|
@ -111,64 +180,59 @@ parse_options(int argc, char *argv[])
|
|||
}
|
||||
}
|
||||
|
||||
if (bsize <= 0 || count <= 0 || stride <= 0 || ifile == NULL ||
|
||||
ofile == NULL || seek < 0) {
|
||||
if (bsize <= 0 || stride <= 0 || ifile == NULL || ofile == NULL ||
|
||||
seek < 0 || invalid_alignment(alignment) || skip < 0) {
|
||||
(void) fprintf(stderr,
|
||||
"Required parameter(s) missing or invalid.\n");
|
||||
(void) usage();
|
||||
}
|
||||
|
||||
if (count <= 0 && entire_file == 0) {
|
||||
(void) fprintf(stderr,
|
||||
"Required parameter(s) missing or invalid.\n");
|
||||
(void) usage();
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
static void
|
||||
read_entire_file(int ifd, int ofd, void *buf)
|
||||
{
|
||||
int i;
|
||||
int ifd;
|
||||
int ofd;
|
||||
void *buf;
|
||||
int c;
|
||||
|
||||
parse_options(argc, argv);
|
||||
|
||||
ifd = open(ifile, O_RDONLY);
|
||||
if (ifd == -1) {
|
||||
(void) fprintf(stderr, "%s: %s: ", execname, ifile);
|
||||
perror("open");
|
||||
do {
|
||||
c = read(ifd, buf, bsize);
|
||||
if (c < 0) {
|
||||
perror("read");
|
||||
exit(2);
|
||||
} else if (c != 0) {
|
||||
c = write(ofd, buf, bsize);
|
||||
if (c < 0) {
|
||||
perror("write");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
ofd = open(ofile, O_WRONLY | O_CREAT, 0666);
|
||||
if (ofd == -1) {
|
||||
(void) fprintf(stderr, "%s: %s: ", execname, ofile);
|
||||
perror("open");
|
||||
}
|
||||
if (stride > 1) {
|
||||
if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) {
|
||||
perror("input lseek");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
/*
|
||||
* We use valloc because some character block devices expect a
|
||||
* page-aligned buffer.
|
||||
*/
|
||||
int err = posix_memalign(&buf, 4096, bsize);
|
||||
if (err != 0) {
|
||||
(void) fprintf(stderr,
|
||||
"%s: %s\n", execname, strerror(err));
|
||||
exit(2);
|
||||
}
|
||||
|
||||
if (seek > 0) {
|
||||
if (lseek(ofd, seek * bsize, SEEK_CUR) == -1) {
|
||||
if (lseek(ofd, (stride - 1) * bsize, SEEK_CUR) == -1) {
|
||||
perror("output lseek");
|
||||
exit(2);
|
||||
}
|
||||
}
|
||||
} while (c != 0);
|
||||
}
|
||||
|
||||
static void
|
||||
read_on_count(int ifd, int ofd, void *buf)
|
||||
{
|
||||
int i;
|
||||
int c;
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
c = read(ifd, buf, bsize);
|
||||
if (c != bsize) {
|
||||
|
||||
perror("read");
|
||||
exit(2);
|
||||
}
|
||||
if (c != bsize) {
|
||||
if (c < 0) {
|
||||
perror("read");
|
||||
|
@ -205,6 +269,71 @@ main(int argc, char *argv[])
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
int ifd;
|
||||
int ofd;
|
||||
int ifd_flags = O_RDONLY;
|
||||
int ofd_flags = O_WRONLY | O_CREAT;
|
||||
void *buf;
|
||||
|
||||
parse_options(argc, argv);
|
||||
|
||||
if (if_o_direct)
|
||||
ifd_flags |= O_DIRECT;
|
||||
|
||||
if (of_o_direct)
|
||||
ofd_flags |= O_DIRECT;
|
||||
|
||||
ifd = open(ifile, ifd_flags);
|
||||
if (ifd == -1) {
|
||||
(void) fprintf(stderr, "%s: %s: ", execname, ifile);
|
||||
perror("open");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
ofd = open(ofile, ofd_flags, 0666);
|
||||
if (ofd == -1) {
|
||||
(void) fprintf(stderr, "%s: %s: ", execname, ofile);
|
||||
perror("open");
|
||||
exit(2);
|
||||
}
|
||||
|
||||
/*
|
||||
* We use valloc because some character block devices expect a
|
||||
* page-aligned buffer.
|
||||
*/
|
||||
int err = posix_memalign(&buf, alignment, bsize);
|
||||
if (err != 0) {
|
||||
(void) fprintf(stderr,
|
||||
"%s: %s\n", execname, strerror(err));
|
||||
exit(2);
|
||||
}
|
||||
|
||||
if (skip > 0) {
|
||||
int skipamt = skipbytes == 1 ? skip : skip * bsize;
|
||||
if (lseek(ifd, skipamt, SEEK_CUR) == -1) {
|
||||
perror("input lseek");
|
||||
exit(2);
|
||||
}
|
||||
}
|
||||
|
||||
if (seek > 0) {
|
||||
int seekamt = seekbytes == 1 ? seek : seek * bsize;
|
||||
if (lseek(ofd, seekamt, SEEK_CUR) == -1) {
|
||||
perror("output lseek");
|
||||
exit(2);
|
||||
}
|
||||
}
|
||||
|
||||
if (entire_file == 1)
|
||||
read_entire_file(ifd, ofd, buf);
|
||||
else
|
||||
read_on_count(ifd, ofd, buf);
|
||||
|
||||
free(buf);
|
||||
|
||||
(void) close(ofd);
|
||||
|
|
|
@ -200,6 +200,7 @@ export ZFSTEST_FILES='badsend
|
|||
getversion
|
||||
largest_file
|
||||
libzfs_input_check
|
||||
manipulate_user_buffer
|
||||
mkbusy
|
||||
mkfile
|
||||
mkfiles
|
||||
|
|
|
@ -3474,6 +3474,18 @@ function md5digest
|
|||
esac
|
||||
}
|
||||
|
||||
#
# Succeed (return 0) only when both files produce the same MD5 digest,
# as computed by md5digest.
#
function cmp_md5s {
	typeset lhs=$1
	typeset rhs=$2

	[[ "$(md5digest $lhs)" == "$(md5digest $rhs)" ]]
}
|
||||
|
||||
#
|
||||
# Compute SHA256 digest for given file or stdin if no file given.
|
||||
# Note: file path must not contain spaces
|
||||
|
|
|
@ -93,6 +93,7 @@ VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift
|
|||
VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift
|
||||
VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift
|
||||
VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count
|
||||
VDEV_DIRECT_WR_VERIFY vdev.direct_write_verify zfs_vdev_direct_write_verify
|
||||
VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip
|
||||
VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
|
||||
VOL_MODE vol.mode zvol_volmode
|
||||
|
@ -100,6 +101,7 @@ VOL_RECURSIVE vol.recursive UNSUPPORTED
|
|||
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
|
||||
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
|
||||
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
|
||||
DIO_ENABLED dio_enabled zfs_dio_enabled
|
||||
XATTR_COMPAT xattr_compat zfs_xattr_compat
|
||||
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
|
||||
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
|
||||
|
|
|
@ -265,6 +265,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
|
|||
functional/delegate/delegate_common.kshlib \
|
||||
functional/devices/devices.cfg \
|
||||
functional/devices/devices_common.kshlib \
|
||||
functional/direct/dio.cfg \
|
||||
functional/direct/dio.kshlib \
|
||||
functional/events/events.cfg \
|
||||
functional/events/events_common.kshlib \
|
||||
functional/fault/fault.cfg \
|
||||
|
@ -1458,6 +1460,26 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
|||
functional/devices/devices_002_neg.ksh \
|
||||
functional/devices/devices_003_pos.ksh \
|
||||
functional/devices/setup.ksh \
|
||||
functional/direct/dio_aligned_block.ksh \
|
||||
functional/direct/dio_async_always.ksh \
|
||||
functional/direct/dio_async_fio_ioengines.ksh \
|
||||
functional/direct/dio_compression.ksh \
|
||||
functional/direct/dio_dedup.ksh \
|
||||
functional/direct/dio_encryption.ksh \
|
||||
functional/direct/dio_grow_block.ksh \
|
||||
functional/direct/dio_max_recordsize.ksh \
|
||||
functional/direct/dio_mixed.ksh \
|
||||
functional/direct/dio_mmap.ksh \
|
||||
functional/direct/dio_overwrites.ksh \
|
||||
functional/direct/dio_property.ksh \
|
||||
functional/direct/dio_random.ksh \
|
||||
functional/direct/dio_recordsize.ksh \
|
||||
functional/direct/dio_unaligned_block.ksh \
|
||||
functional/direct/dio_unaligned_filesize.ksh \
|
||||
functional/direct/dio_write_verify.ksh \
|
||||
functional/direct/dio_write_stable_pages.ksh \
|
||||
functional/direct/setup.ksh \
|
||||
functional/direct/cleanup.ksh \
|
||||
functional/dos_attributes/cleanup.ksh \
|
||||
functional/dos_attributes/read_dos_attrs_001.ksh \
|
||||
functional/dos_attributes/setup.ksh \
|
||||
|
|
|
@ -75,7 +75,7 @@ export PERF_COMPPERCENT=66
|
|||
export PERF_COMPCHUNK=0
|
||||
export BLOCKSIZE=128K
|
||||
export SYNC_TYPE=0
|
||||
export DIRECT=1
|
||||
export DIRECT=0
|
||||
export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))
|
||||
|
||||
log_must set_tunable32 L2ARC_WRITE_MAX $(( $VCACHE_SZ * 2 ))
|
||||
|
|
|
@ -36,7 +36,7 @@ export PERF_COMPPERCENT=66
|
|||
export PERF_COMPCHUNK=0
|
||||
export BLOCKSIZE=128K
|
||||
export SYNC_TYPE=0
|
||||
export DIRECT=1
|
||||
export DIRECT=0
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
|
|
|
@ -37,7 +37,7 @@ export PERF_COMPPERCENT=66
|
|||
export PERF_COMPCHUNK=0
|
||||
export BLOCKSIZE=128K
|
||||
export SYNC_TYPE=0
|
||||
export DIRECT=1
|
||||
export DIRECT=0
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
|
|
|
@ -37,7 +37,7 @@ export PERF_COMPPERCENT=66
|
|||
export PERF_COMPCHUNK=0
|
||||
export BLOCKSIZE=128K
|
||||
export SYNC_TYPE=0
|
||||
export DIRECT=1
|
||||
export DIRECT=0
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
verify_runnable "global"

default_cleanup_noexit

# Restore DIO_ENABLED, but only where that tunable exists.
tunable_exists DIO_ENABLED && log_must restore_tunable DIO_ENABLED

log_pass
|
|
@ -0,0 +1,26 @@
|
|||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
# Paths under $TEST_BASE_DIR for the three DIO vdev files, plus the
# space-separated list of all of them.
DIO_VDEV1="$TEST_BASE_DIR/file1"
DIO_VDEV2="$TEST_BASE_DIR/file2"
DIO_VDEV3="$TEST_BASE_DIR/file3"
DIO_VDEVS="$DIO_VDEV1 $DIO_VDEV2 $DIO_VDEV3"

# File size and block size settings for the Direct I/O tests.
DIO_FILESIZE="4M"
DIO_BS="128K"
|
|
@ -0,0 +1,331 @@
|
|||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
|
||||
#
# Destroy $TESTPOOL1 if it exists, then remove the files listed in
# $DIO_VDEVS.
#
function dio_cleanup
{
	poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1

	rm -f $DIO_VDEVS
}
|
||||
|
||||
#
# Generate an IO workload using fio and then verify the resulting data.
#
# $1 mode       fio --rw mode for the Direct I/O pass
# $2 size       fio --size
# $3 bs         fio --bs
# $4 mntpnt     directory fio works in
# $5 ioengine   fio --ioengine
# $6 extra_args additional fio arguments (left unquoted so they word-split)
#
function dio_and_verify # mode file-size block-size directory ioengine extra-args
{
	typeset mode=$1
	typeset size=$2
	typeset bs=$3
	typeset mntpnt=$4
	typeset ioengine=$5
	typeset extra_args=$6

	# Invoke an fio workload via Direct I/O and verify with Direct I/O.
	log_must fio --directory=$mntpnt --name=direct-$mode \
	    --rw=$mode --size=$size --bs=$bs --direct=1 --numjobs=1 \
	    --verify=sha1 --ioengine=$ioengine --fallocate=none \
	    --group_reporting --minimal --do_verify=1 $extra_args

	# Now just read back the file without Direct I/O into the ARC as an
	# additional verification step.
	log_must fio --directory=$mntpnt --name=direct-$mode \
	    --rw=read --size=$size --bs=$bs --direct=0 --numjobs=1 \
	    --ioengine=$ioengine --group_reporting --minimal

	# The glob must stay outside the quotes; a quoted "direct-*" is passed
	# to rm literally and the fio files would never be removed.
	log_must rm -f "$mntpnt"/direct-*
}
|
||||
|
||||
#
# Get zpool status -d checksum verify failures
#
# Reads column 6 of `zpool status -dp`.  For "stripe" the column is summed
# over every output line; for "mirror", "raidz" and "draid" the column is
# printed for each line matching the vdev type.  Any other type is a test
# failure.
#
function get_zpool_status_chksum_verify_failures # pool_name vdev_type
{
	typeset pool=$1
	typeset vdev_type=$2
	typeset val

	if [[ "$vdev_type" == "stripe" ]]; then
		val=$(zpool status -dp $pool | \
		    awk '{s+=$6} END {print s}' )
	elif [[ "$vdev_type" == "mirror" || "$vdev_type" == "raidz" ||
	    "$vdev_type" == "draid" ]]; then
		val=$(zpool status -dp $pool | \
		    awk -v d="$vdev_type" '$0 ~ d {print $6}' )
	else
		typeset msg="Unsupported VDEV type in"
		msg="$msg get_zpool_status_chksum_verify_failures():"
		log_fail "$msg $vdev_type"
	fi
	echo "$val"
}
|
||||
|
||||
#
# Get ZED dio_verify events
#
# Prints the number of `zpool events` output lines for the pool that
# contain "dio_verify".
#
function get_zed_dio_verify_events # pool
{
	typeset pool=$1
	typeset val

	val=$(zpool events $pool | grep -c dio_verify)

	echo "$val"
}
|
||||
|
||||
#
# Check for Direct I/O write checksum verify failures as reported by:
#     zpool status -d
#     zpool events
# After reading both counts, clear the pool's errors and the events.
#
# $1 pool
# $2 vdev_type      passed to get_zpool_status_chksum_verify_failures
# $3 expect_errors  non-zero: both counts must be non-zero;
#                   zero: both counts must be zero
#
function check_dio_write_chksum_verify_failures # pool vdev_type expect_errors
{
	typeset pool=$1
	typeset vdev_type=$2
	typeset expect_errors=$3
	typeset note_str="expecting none"
	typeset status_failures
	typeset zed_dio_verify_events
	typeset msg

	if [[ $expect_errors -ne 0 ]]; then
		note_str="expecting some"
	fi

	msg="Checking for Direct I/O write checksum verify errors"
	log_note "$msg $note_str on ZPool: $pool"

	status_failures=$(get_zpool_status_chksum_verify_failures $pool $vdev_type)
	zed_dio_verify_events=$(get_zed_dio_verify_events $pool)

	msg="Checksum verifies in zpool status -d $status_failures."
	msg="$msg ZED dio_verify events $zed_dio_verify_events."

	if [[ $expect_errors -ne 0 ]]; then
		if [[ $status_failures -eq 0 ||
		    $zed_dio_verify_events -eq 0 ]]; then
			zpool status -dp $pool
			zpool events $pool
			log_fail "$msg Neither should be 0."
		fi
	else
		if [[ $status_failures -ne 0 ||
		    $zed_dio_verify_events -ne 0 ]]; then
			zpool status -dp $pool
			zpool events $pool
			log_fail "$msg Both should be zero."
		fi
	fi

	log_must zpool clear $pool
	log_must zpool events -c

}
|
||||
|
||||
#
# Get the value of a counter from
# Linux:   /proc/spl/kstat/zfs/$pool/iostats file.
# FreeBSD: kstat.zfs.$pool.misc.iostats.$stat
#
# Prints the value; fails the test if nothing could be read.
#
function get_iostats_stat # pool stat
{
	typeset pool=$1
	typeset stat=$2
	typeset iostats_file
	typeset val

	if is_linux; then
		iostats_file=/proc/spl/kstat/zfs/$pool/iostats
		# First line containing the stat name; the value is field 3.
		val=$(grep -m1 "$stat" "$iostats_file" | awk '{ print $3 }')
	else
		val=$(sysctl -n kstat.zfs.$pool.misc.iostats.$stat)
	fi
	if [[ -z "$val" ]]; then
		log_fail "Unable to read $stat counter"
	fi

	echo "$val"
}
|
||||
|
||||
#
# Evict any buffered blocks by overwriting them using an O_DIRECT request:
# one block of the given size from /dev/urandom is written to the file with
# stride_dd -D.
#
# $1 pool (accepted but not used)
# $2 file
# $3 size
#
function evict_blocks
{
	typeset target=$2
	typeset nbytes=$3

	log_must stride_dd -i /dev/urandom -o $target -b $nbytes -c 1 -D
}
|
||||
|
||||
#
# Perform FIO Direct I/O writes to a file with the given arguments.
# Then verify the minimum expected number of blocks were written as
# Direct I/O.
#
# The expected minimum is (size / bs) - 1 direct writes, measured as the
# change in the pool's direct_write_count iostat across the fio run.
#
function verify_dio_write_count #pool bs size mnpnt
{
	typeset pool=$1
	typeset bs=$2
	typeset size=$3
	typeset mntpnt=$4
	typeset dio_wr_expected=$(((size / bs) -1))
	typeset prev_dio_wr curr_dio_wr dio_wr_actual

	log_note "Checking for $dio_wr_expected Direct I/O writes"

	prev_dio_wr=$(get_iostats_stat $pool direct_write_count)
	dio_and_verify write $size $bs $mntpnt "sync"
	curr_dio_wr=$(get_iostats_stat $pool direct_write_count)
	dio_wr_actual=$((curr_dio_wr - prev_dio_wr))

	if [[ $dio_wr_actual -lt $dio_wr_expected ]]; then
		# Dump the raw counters before failing.
		if is_linux; then
			cat /proc/spl/kstat/zfs/$pool/iostats
		else
			sysctl kstat.zfs.$pool.misc.iostats
		fi
		log_fail "Direct writes $dio_wr_actual of $dio_wr_expected"
	fi
}
|
||||
|
||||
#
# Write to a file with stride_dd (input /dev/urandom) and check the change in
# the pool's arc_write_count and direct_write_count iostats across the write.
# Each change must be at least the expected minimum passed in; otherwise the
# iostats are dumped and the test fails.  Buffered writes are checked first.
#
# $1 pool   $2 file   $3 bs   $4 count   $5 seek (stride_dd -k)
# $6 extra stride_dd flags   $7 min buffered writes   $8 min direct writes
#
function check_write # pool file bs count seek flags buf_wr dio_wr
{
	typeset pool=$1
	typeset target=$2
	typeset blksz=$3
	typeset nblks=$4
	typeset start=$5
	typeset dd_flags=$6
	typeset buf_wr_expect=$7
	typeset dio_wr_expect=$8

	log_note "Checking $nblks * $blksz write(s) at offset $start, $dd_flags"

	# Snapshot both counters before the write ...
	prev_buf_wr=$(get_iostats_stat $pool arc_write_count)
	prev_dio_wr=$(get_iostats_stat $pool direct_write_count)

	log_must stride_dd -i /dev/urandom -o $target -b $blksz -c $nblks \
	    -k $start $dd_flags

	# ... and again afterwards, keeping the differences.
	curr_buf_wr=$(get_iostats_stat $pool arc_write_count)
	buf_wr_actual=$((curr_buf_wr - prev_buf_wr))

	curr_dio_wr=$(get_iostats_stat $pool direct_write_count)
	dio_wr_actual=$((curr_dio_wr - prev_dio_wr))

	if (( buf_wr_actual < buf_wr_expect )); then
		if ! is_linux; then
			sysctl kstat.zfs.$pool.misc.iostats
		else
			cat /proc/spl/kstat/zfs/$pool/iostats
		fi
		log_fail "Buffered writes $buf_wr_actual of $buf_wr_expect"
	fi

	if (( dio_wr_actual < dio_wr_expect )); then
		if ! is_linux; then
			sysctl kstat.zfs.$pool.misc.iostats
		else
			cat /proc/spl/kstat/zfs/$pool/iostats
		fi
		log_fail "Direct writes $dio_wr_actual of $dio_wr_expect"
	fi
}
|
||||
|
||||
#
# Read from a file with stride_dd (output /dev/null) and check the change in
# the pool's arc_read_count and direct_read_count iostats across the read.
# Each change must be at least the expected minimum passed in; otherwise the
# iostats are dumped and the test fails.  Buffered reads are checked first.
#
# $1 pool   $2 file   $3 bs   $4 count   $5 skip (stride_dd -p)
# $6 extra stride_dd flags   $7 min buffered reads   $8 min direct reads
#
function check_read # pool file bs count skip flags buf_rd dio_rd
{
	typeset pool=$1
	typeset source=$2
	typeset blksz=$3
	typeset nblks=$4
	typeset start=$5
	typeset dd_flags=$6
	typeset buf_rd_expect=$7
	typeset dio_rd_expect=$8

	log_note "Checking $nblks * $blksz read(s) at offset $start, $dd_flags"

	# Snapshot both counters before the read ...
	prev_buf_rd=$(get_iostats_stat $pool arc_read_count)
	prev_dio_rd=$(get_iostats_stat $pool direct_read_count)

	log_must stride_dd -i $source -o /dev/null -b $blksz -c $nblks \
	    -p $start $dd_flags

	# ... and again afterwards, keeping the differences.
	curr_buf_rd=$(get_iostats_stat $pool arc_read_count)
	buf_rd_actual=$((curr_buf_rd - prev_buf_rd))

	curr_dio_rd=$(get_iostats_stat $pool direct_read_count)
	dio_rd_actual=$((curr_dio_rd - prev_dio_rd))

	if (( buf_rd_actual < buf_rd_expect )); then
		if ! is_linux; then
			sysctl kstat.zfs.$pool.misc.iostats
		else
			cat /proc/spl/kstat/zfs/$pool/iostats
		fi
		log_fail "Buffered reads $buf_rd_actual of $buf_rd_expect"
	fi

	if (( dio_rd_actual < dio_rd_expect )); then
		if ! is_linux; then
			sysctl kstat.zfs.$pool.misc.iostats
		else
			cat /proc/spl/kstat/zfs/$pool/iostats
		fi
		log_fail "Direct reads $dio_rd_actual of $dio_rd_expect"
	fi
}
|
||||
|
||||
#
# Print the size of a file in bytes.
#
# Uses `stat -c %s` on Linux and the BSD form `stat -f %z` elsewhere.
# The path is quoted so names containing whitespace work.
#
function get_file_size
{
	typeset filename="$1"
	typeset filesize

	if is_linux; then
		filesize=$(stat -c %s "$filename")
	else
		filesize=$(stat -f %z "$filename")
	fi

	echo $filesize
}
|
||||
|
||||
#
# Shrink a file by the given number of bytes, printing its size before and
# after.
#
# $1 filename
# $2 size      number of bytes to remove from the end of the file
#
function do_truncate_reduce
{
	typeset filename=$1
	typeset size=$2
	typeset filesize

	filesize=$(get_file_size "$filename")
	echo "original filesize: $filesize"
	if is_linux; then
		# Absolute target size on Linux.
		truncate -s $((filesize - size)) "$filename"
	else
		# Relative "-N" form on the non-Linux branch.
		truncate -s -$size "$filename"
	fi
	filesize=$(get_file_size "$filename")
	echo "new filesize after truncate: $filesize"
}
|
|
@ -0,0 +1,115 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify the number of direct/buffered requests for (un)aligned access
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a multi-block file
|
||||
# 2. Perform various (un)aligned accesses and verify the result.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Restore the dataset's saved recordsize and remove the test file.
#
function cleanup
{
	# $rs is only set once the test has read the original recordsize;
	# skip the restore if the test failed before that point.
	if [[ -n $rs ]]; then
		zfs set recordsize=$rs $TESTPOOL/$TESTFS
	fi
	log_must rm -f $tmp_file
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_assert "Verify the number direct/buffered requests for unaligned access"
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
|
||||
rs=$(get_prop recordsize $TESTPOOL/$TESTFS)
|
||||
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
|
||||
|
||||
tmp_file=$mntpnt/tmp_file
|
||||
file_size=$((rs * 8))
|
||||
|
||||
log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1
|
||||
|
||||
# N recordsize aligned writes which do not span blocks
|
||||
check_write $TESTPOOL $tmp_file $rs 1 0 "-D" 0 1
|
||||
check_write $TESTPOOL $tmp_file $rs 2 0 "-D" 0 2
|
||||
check_write $TESTPOOL $tmp_file $rs 4 0 "-D" 0 4
|
||||
check_write $TESTPOOL $tmp_file $rs 8 0 "-D" 0 8
|
||||
|
||||
# 1 recordsize aligned write which spans multiple blocks at various offsets
|
||||
check_write $TESTPOOL $tmp_file $((rs * 2)) 1 0 "-D" 0 2
|
||||
check_write $TESTPOOL $tmp_file $((rs * 2)) 1 1 "-D" 0 2
|
||||
check_write $TESTPOOL $tmp_file $((rs * 2)) 1 2 "-D" 0 2
|
||||
check_write $TESTPOOL $tmp_file $((rs * 2)) 1 3 "-D" 0 2
|
||||
check_write $TESTPOOL $tmp_file $((rs * 4)) 1 0 "-D" 0 4
|
||||
check_write $TESTPOOL $tmp_file $((rs * 4)) 1 1 "-D" 0 4
|
||||
check_write $TESTPOOL $tmp_file $((rs * 8)) 1 0 "-D" 0 8
|
||||
|
||||
# sub-blocksize unaligned writes which do not span blocks.
|
||||
check_write $TESTPOOL $tmp_file $((rs / 2)) 1 0 "-D" 1 0
|
||||
check_write $TESTPOOL $tmp_file $((rs / 2)) 1 1 "-D" 1 0
|
||||
check_write $TESTPOOL $tmp_file $((rs / 2)) 1 2 "-D" 1 0
|
||||
check_write $TESTPOOL $tmp_file $((rs / 2)) 1 3 "-D" 1 0
|
||||
|
||||
# large unaligned writes which span multiple blocks
|
||||
check_write $TESTPOOL $tmp_file $((rs * 2)) 1 $((rs / 2)) "-D -K" 2 1
|
||||
check_write $TESTPOOL $tmp_file $((rs * 4)) 2 $((rs / 4)) "-D -K" 4 6
|
||||
|
||||
# evict any cached blocks by overwriting with O_DIRECT
|
||||
evict_blocks $TESTPOOL $tmp_file $file_size
|
||||
|
||||
# recordsize aligned reads which do not span blocks
|
||||
check_read $TESTPOOL $tmp_file $rs 1 0 "-d" 0 1
|
||||
check_read $TESTPOOL $tmp_file $rs 2 0 "-d" 0 2
|
||||
check_read $TESTPOOL $tmp_file $rs 4 0 "-d" 0 4
|
||||
check_read $TESTPOOL $tmp_file $rs 8 0 "-d" 0 8
|
||||
|
||||
# 1 recordsize aligned read which spans multiple blocks at various offsets
|
||||
check_read $TESTPOOL $tmp_file $((rs * 2)) 1 0 "-d" 0 2
|
||||
check_read $TESTPOOL $tmp_file $((rs * 2)) 1 1 "-d" 0 2
|
||||
check_read $TESTPOOL $tmp_file $((rs * 2)) 1 2 "-d" 0 2
|
||||
check_read $TESTPOOL $tmp_file $((rs * 2)) 1 3 "-d" 0 2
|
||||
check_read $TESTPOOL $tmp_file $((rs * 4)) 1 0 "-d" 0 4
|
||||
check_read $TESTPOOL $tmp_file $((rs * 4)) 1 1 "-d" 0 4
|
||||
check_read $TESTPOOL $tmp_file $((rs * 8)) 1 0 "-d" 0 8
|
||||
|
||||
# sub-blocksize unaligned reads which do not span blocks.
|
||||
check_read $TESTPOOL $tmp_file $((rs / 2)) 1 0 "-d" 0 1
|
||||
check_read $TESTPOOL $tmp_file $((rs / 2)) 1 1 "-d" 0 1
|
||||
check_read $TESTPOOL $tmp_file $((rs / 2)) 1 2 "-d" 0 1
|
||||
check_read $TESTPOOL $tmp_file $((rs / 2)) 1 3 "-d" 0 1
|
||||
|
||||
# large unaligned reads which span multiple blocks
|
||||
check_read $TESTPOOL $tmp_file $((rs * 2)) 1 $((rs / 2)) "-d -P" 0 3
|
||||
check_read $TESTPOOL $tmp_file $((rs * 4)) 1 $((rs / 4)) "-d -P" 0 5
|
||||
|
||||
log_pass "Verify the number direct/buffered requests for (un)aligned access"
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify small async Direct I/O requests
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Use fio to issue small read/write requests. Writes are
|
||||
# smaller than the block size and thus will be buffered,
|
||||
# reads satisfy the minimum alignment and will be direct.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Return the dataset to direct=standard and remove the test file.
#
function cleanup
{
	zfs set direct=standard $TESTPOOL/$TESTFS
	# -f: the file may not exist (or $tmp_file may be unset) if the
	# test failed early; cleanup should not emit an error for that.
	rm -f $tmp_file
}
|
||||
|
||||
log_assert "Verify direct=always mixed small async requests"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_must zfs set direct=always $TESTPOOL/$TESTFS
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
tmp_file=$mntpnt/tmp_file
|
||||
page_size=$(getconf PAGESIZE)
|
||||
file_size=1G
|
||||
runtime=10
|
||||
|
||||
log_must truncate -s $file_size $tmp_file
|
||||
|
||||
log_must fio --filename=$tmp_file --name=always-randrw \
|
||||
--rw=randwrite --bs=$page_size --size=$file_size --numjobs=1 \
|
||||
--ioengine=posixaio --fallocate=none --iodepth=4 --verify=sha1 \
|
||||
--group_reporting --minimal --runtime=$runtime --time_based
|
||||
|
||||
log_pass "Verify direct=always mixed small async requests"
|
|
@ -0,0 +1,106 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2022 by Triad National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/include/properties.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify FIO async engines work using Direct I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Select a FIO async ioengine
|
||||
# 2. Start sequential Direct I/O and verify with buffered I/O
|
||||
# 3. Start mixed Direct I/O and verify with buffered I/O
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Remove every "direct-*" file left under the test mountpoint.
#
function cleanup
{
	# The glob must sit outside the quotes, otherwise rm is handed the
	# literal name "direct-*" and nothing is removed.
	log_must rm -f "$mntpnt"/direct-*
}
|
||||
|
||||
#
# Probe whether the installed fio accepts the io_uring ioengine.
# All fio output is discarded; the function's exit status is fio's own
# exit status from the --parse-only run.
#
function check_fio_ioengine
{
	fio --ioengine=io_uring --parse-only > /dev/null 2>&1
}
|
||||
|
||||
log_assert "Verify FIO async ioengines work using Direct I/O."
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
typeset -a async_ioengine_args=("--iodepth=4" "--iodepth=4 --thread")
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
fio_async_ioengines="posixaio"
|
||||
|
||||
#
# On Linux also exercise libaio and, when both the kernel config and fio
# support it, io_uring.  io_uring is skipped on RHEL 9 variants.
#
if is_linux; then
	fio_async_ioengines+=" libaio"
	if grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r); then
		# Read PLATFORM_ID inside a command substitution so that the
		# other variables defined by os-release do not leak into the
		# test script's environment.
		typeset platform_id=""
		if [ -e /etc/os-release ] ; then
			platform_id=$(. /etc/os-release; echo "$PLATFORM_ID")
		fi

		if [[ "$platform_id" == "platform:el9" ]]; then
			log_note "io_uring disabled on RHEL 9 " \
			    "variants: fails with " \
			    "'Operation not permitted'"
		elif check_fio_ioengine; then
			fio_async_ioengines+=" io_uring"
		else
			log_note "io_uring not supported by fio and " \
			    "will not be tested"
		fi
	else
		log_note "io_uring not supported by kernel will not " \
		    "be tested"
	fi
fi
|
||||
|
||||
for ioengine in $fio_async_ioengines; do
|
||||
for ioengine_args in "${async_ioengine_args[@]}"; do
|
||||
for op in "rw" "randrw" "write"; do
|
||||
log_note "Checking Direct I/O with FIO async ioengine" \
|
||||
" $ioengine with args $ioengine_args --rw=$op"
|
||||
dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "$ioengine" \
|
||||
"$ioengine_args"
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
log_pass "Verfied FIO async ioengines work using Direct I/O"
|
|
@ -0,0 +1,65 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/include/properties.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify compression works using Direct I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Select a random compression algorithm
|
||||
# 2. Start sequential Direct I/O and verify with buffered I/O
|
||||
# 3. Start mixed Direct I/O and verify with buffered I/O
|
||||
# 4. Repeat from 2 for all compression algorithms
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Remove every "direct-*" file under the test mountpoint and turn
# compression back off on the test dataset.
#
function cleanup
{
	# The glob must sit outside the quotes, otherwise rm is handed the
	# literal name "direct-*" and nothing is removed.
	log_must rm -f "$mntpnt"/direct-*
	log_must zfs set compression=off $TESTPOOL/$TESTFS
}
|
||||
|
||||
log_assert "Verify compression works using Direct I/O."
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
compress_args="--buffer_compress_percentage=50"
|
||||
|
||||
for comp in "${compress_prop_vals[@]:1}"; do
|
||||
log_must zfs set compression=$comp $TESTPOOL/$TESTFS
|
||||
for op in "rw" "randrw" "write"; do
|
||||
dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "sync" $compress_args
|
||||
done
|
||||
done
|
||||
|
||||
log_pass "Verfied compression works using Direct I/O"
|
|
@ -0,0 +1,62 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/include/properties.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify deduplication works using Direct I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Enable dedup
|
||||
# 2. Start sequential Direct I/O and verify with buffered I/O
|
||||
# 3. Start mixed Direct IO and verify with buffered I/O
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Remove every "direct-*" file under the test mountpoint and turn
# dedup back off on the test dataset.
#
function cleanup
{
	# The glob must sit outside the quotes, otherwise rm is handed the
	# literal name "direct-*" and nothing is removed.
	log_must rm -f "$mntpnt"/direct-*
	log_must zfs set dedup=off $TESTPOOL/$TESTFS
}
|
||||
|
||||
log_assert "Verify deduplication works using Direct I/O."
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
dedup_args="--dedupe_percentage=50"
|
||||
|
||||
log_must zfs set dedup=on $TESTPOOL/$TESTFS
|
||||
for op in "rw" "randrw" "write"; do
|
||||
dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "sync" $dedup_args
|
||||
done
|
||||
|
||||
log_pass "Verfied deduplication works using Direct I/O"
|
|
@ -0,0 +1,62 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify encryption works using Direct I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create multidisk pool.
|
||||
# 2. Start some mixed readwrite Direct I/O.
|
||||
# 3. Verify the results are as expected using buffered I/O.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
log_assert "Verify encryption works using Direct I/O."
|
||||
|
||||
log_onexit dio_cleanup
|
||||
|
||||
log_must truncate -s $MINVDEVSIZE $DIO_VDEVS
|
||||
|
||||
create_pool $TESTPOOL1 $DIO_VDEVS
|
||||
log_must eval "echo 'password' | zfs create -o encryption=on \
|
||||
-o keyformat=passphrase -o keylocation=prompt -o compression=off \
|
||||
$TESTPOOL1/$TESTFS1"
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1)
|
||||
|
||||
for bs in "4k" "128k" "1m"; do
|
||||
for op in "rw" "randrw" "write"; do
|
||||
dio_and_verify $op $DIO_FILESIZE $bs $mntpnt "sync"
|
||||
done
|
||||
done
|
||||
|
||||
log_pass "Verified encryption works using Direct I/O"
|
|
@ -0,0 +1,86 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify the number of direct/buffered requests when growing a file
|
||||
#
|
||||
# STRATEGY:
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Restore the dataset's saved recordsize and remove the test file.
#
function cleanup
{
	# $rs is only set once the test has read the original recordsize;
	# skip the restore if the test failed before that point.
	if [[ -n $rs ]]; then
		zfs set recordsize=$rs $TESTPOOL/$TESTFS
	fi
	log_must rm -f $tmp_file
}
|
||||
|
||||
log_assert "Verify the number direct/buffered requests when growing a file"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
|
||||
tmp_file=$mntpnt/tmp_file
|
||||
|
||||
rs=$(get_prop recordsize $TESTPOOL/$TESTFS)
|
||||
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
|
||||
|
||||
#
|
||||
# Verify the expected number of buffered and Direct I/O's when growing
|
||||
# the first block of a file up to the maximum recordsize.
|
||||
#
|
||||
for bs in "8192" "16384" "32768" "65536" "131072"; do
|
||||
|
||||
# When O_DIRECT is set the first write to a new file, or when the
|
||||
# block size needs to be grown, it will be done as a buffered write.
|
||||
check_write $TESTPOOL $tmp_file $bs 1 0 "-D" 1 0
|
||||
|
||||
# Overwriting the first block of an existing file with O_DIRECT will
|
||||
# be a buffered write if less than the block size.
|
||||
check_write $TESTPOOL $tmp_file 4096 1 0 "-D" 1 0
|
||||
check_write $TESTPOOL $tmp_file 4096 1 1 "-D" 1 0
|
||||
|
||||
# Overwriting the first block of an existing file with O_DIRECT will
|
||||
# be a direct write as long as the block size matches.
|
||||
check_write $TESTPOOL $tmp_file $bs 1 0 "-D" 0 1
|
||||
|
||||
# Evict any blocks which may be buffered before the read tests.
|
||||
evict_blocks $TESTPOOL $tmp_file $bs
|
||||
|
||||
# Reading the first block of an existing file with O_DIRECT will
|
||||
# be a direct read for part or all of the block size.
|
||||
check_read $TESTPOOL $tmp_file $bs 1 0 "-d" 0 1
|
||||
check_read $TESTPOOL $tmp_file 4096 1 0 "-d" 0 1
|
||||
check_read $TESTPOOL $tmp_file 4096 1 1 "-d" 0 1
|
||||
done
|
||||
|
||||
log_pass "Verify the number direct/buffered requests when growing a file"
|
|
@ -0,0 +1,64 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2022 by Triad National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify max recordsizes are supported for Direct I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a pool from each vdev type with varying recordsizes.
|
||||
# 2. Start sequential Direct I/O and verify with buffered I/O.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
log_assert "Verify max recordsizes are supported for Direct I/O."
|
||||
|
||||
log_onexit dio_cleanup
|
||||
|
||||
log_must truncate -s $MINVDEVSIZE $DIO_VDEVS
|
||||
|
||||
#
# For every vdev layout and each large recordsize, build a fresh pool and
# dataset, run verify_dio_write_count against it with a file four records
# long, then destroy the pool again.
#
# NOTE: the stray ';' that followed 'do' has been dropped; it is rejected
# as a syntax error by stricter shells (e.g. bash).
#
for type in "" "mirror" "raidz" "draid"; do
	for recsize in "2097152" "8388608" "16777216"; do
		create_pool $TESTPOOL1 $type $DIO_VDEVS
		log_must eval "zfs create \
		    -o recordsize=$recsize -o compression=off \
		    $TESTPOOL1/$TESTFS1"

		mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1)

		verify_dio_write_count $TESTPOOL1 $recsize $((4 * recsize)) \
		    $mntpnt

		destroy_pool $TESTPOOL1
	done
done
|
||||
|
||||
log_pass "Verified max recordsizes are supported for Direct I/O."
|
|
@ -0,0 +1,107 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/include/properties.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify mixed buffered and Direct I/O are coherent.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Verify interleaved buffered and Direct I/O
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
# Remove the files named by $src_file, $new_file and $tmp_file.
# rm -f is used, so files that do not exist are not treated as an error.
function cleanup
|
||||
{
|
||||
log_must rm -f $src_file $new_file $tmp_file
|
||||
}
|
||||
|
||||
log_assert "Verify mixed buffered and Direct I/O are coherent."
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
|
||||
src_file=$mntpnt/src_file
|
||||
new_file=$mntpnt/new_file
|
||||
tmp_file=$mntpnt/tmp_file
|
||||
page_size=$(getconf PAGESIZE)
|
||||
file_size=1048576
|
||||
|
||||
log_must stride_dd -i /dev/urandom -o $src_file -b $file_size -c 1
|
||||
|
||||
#
|
||||
# Using mixed input and output block sizes verify that buffered and
|
||||
# Direct I/O can be interleaved and the result will always be coherent.
|
||||
#
|
||||
for ibs in "512" "$page_size" "131072"; do
|
||||
for obs in "512" "$page_size" "131072"; do
|
||||
iblocks=$(($file_size / $ibs))
|
||||
oblocks=$(($file_size / $obs))
|
||||
iflags=""
|
||||
oflags=""
|
||||
|
||||
# Only allow Direct I/O when it is at least page sized.
|
||||
if [[ $ibs -ge $page_size ]]; then
|
||||
iflags="-d"
|
||||
fi
|
||||
|
||||
if [[ $obs -ge $page_size ]]; then
|
||||
oflags="-D"
|
||||
fi
|
||||
|
||||
# Verify buffered write followed by a direct read.
|
||||
log_must stride_dd -i $src_file -o $new_file -b $obs \
|
||||
-c $oblocks
|
||||
log_must stride_dd -i $new_file -o $tmp_file -b $ibs \
|
||||
-c $iblocks $iflags
|
||||
log_must cmp_md5s $new_file $tmp_file
|
||||
log_must rm -f $new_file $tmp_file
|
||||
|
||||
# Verify direct write followed by a buffered read.
|
||||
log_must stride_dd -i $src_file -o $new_file -b $obs \
|
||||
-c $oblocks $oflags
|
||||
log_must stride_dd -i $new_file -o $tmp_file -b $ibs \
|
||||
-c $iblocks
|
||||
log_must cmp_md5s $new_file $tmp_file
|
||||
log_must rm -f $new_file $tmp_file
|
||||
|
||||
# Verify direct write followed by a direct read.
|
||||
log_must stride_dd -i $src_file -o $new_file -b $obs \
|
||||
-c $oblocks $oflags
|
||||
log_must stride_dd -i $new_file -o $tmp_file -b $ibs \
|
||||
-c $iblocks $iflags
|
||||
log_must cmp_md5s $new_file $tmp_file
|
||||
log_must rm -f $new_file $tmp_file
|
||||
done
|
||||
done
|
||||
|
||||
log_pass "Verify mixed buffered and Direct I/O are coherent."
|
|
@ -0,0 +1,92 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify mixed Direct I/O and mmap I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create an empty file.
|
||||
# 2. Start a background Direct I/O random read/write fio to the
|
||||
# file.
|
||||
# 3. Start a background mmap random read/write fio to the file.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
#
# Restore the dataset's saved recordsize and remove the test file.
#
function cleanup
{
	# $rs is only set once the test has read the original recordsize;
	# skip the restore if the test failed before that point.
	if [[ -n $rs ]]; then
		zfs set recordsize=$rs $TESTPOOL/$TESTFS
	fi
	log_must rm -f "$tmp_file"
}
|
||||
|
||||
log_assert "Verify mixed Direct I/O and mmap I/O"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
tmp_file=$mntpnt/file
|
||||
bs=$((128 * 1024))
|
||||
blocks=64
|
||||
size=$((bs * blocks))
|
||||
runtime=60
|
||||
|
||||
rs=$(get_prop recordsize $TESTPOOL/$TESTFS)
|
||||
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
|
||||
|
||||
log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks
|
||||
|
||||
# Direct I/O writes
|
||||
#
# Launch the four fio workloads concurrently against the same file and
# fail the test if any of them exits non-zero.  Wrapping a backgrounded
# command in log_must cannot detect a failure, and a bare "wait" discards
# the jobs' exit statuses, so each PID is recorded and waited on
# individually.
#
typeset -a fio_pids=()

# Direct I/O writes
fio --filename=$tmp_file --name=direct-write \
    --rw=randwrite --size=$size --bs=$bs --direct=1 --numjobs=1 \
    --ioengine=sync --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &
fio_pids+=($!)

# Direct I/O reads
fio --filename=$tmp_file --name=direct-read \
    --rw=randread --size=$size --bs=$bs --direct=1 --numjobs=1 \
    --ioengine=sync --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &
fio_pids+=($!)

# mmap I/O writes
fio --filename=$tmp_file --name=mmap-write \
    --rw=randwrite --size=$size --bs=$bs --numjobs=1 \
    --ioengine=mmap --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &
fio_pids+=($!)

# mmap I/O reads
fio --filename=$tmp_file --name=mmap-read \
    --rw=randread --size=$size --bs=$bs --numjobs=1 \
    --ioengine=mmap --fallocate=none --group_reporting --minimal \
    --runtime=$runtime --time_based --norandommap &
fio_pids+=($!)

# Wait for every job before judging the result so that no fio process is
# still using the file when cleanup runs.
typeset -i fio_failed=0
for fio_pid in "${fio_pids[@]}"; do
	wait $fio_pid || fio_failed=$((fio_failed + 1))
done
if [[ $fio_failed -ne 0 ]]; then
	log_fail "$fio_failed background fio job(s) exited with an error"
fi
|
||||
|
||||
log_pass "Verfied mixed Direct I/O and mmap I/O"
|
|
@ -0,0 +1,70 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2023 by Triad National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify Direct I/O overwrite.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create an empty file.
|
||||
# 2. Start a Direct I/O random write fio to the file.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
zfs set recordsize=$rs $TESTPOOL/$TESTFS
|
||||
log_must rm -f "$tmp_file"
|
||||
}
|
||||
|
||||
log_assert "Verify Direct I/O overwrites"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
tmp_file=$mntpnt/file
|
||||
bs=$((128 * 1024))
|
||||
blocks=64
|
||||
size=$((bs * blocks))
|
||||
runtime=60
|
||||
|
||||
rs=$(get_prop recordsize $TESTPOOL/$TESTFS)
|
||||
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
|
||||
|
||||
log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks
|
||||
|
||||
# Direct I/O overwrites
|
||||
log_must eval "fio --filename=$tmp_file --name=direct-write \
|
||||
--rw=randwrite --size=$size --bs=$bs --direct=1 --numjobs=1 \
|
||||
--ioengine=sync --fallocate=none --group_reporting --minimal \
|
||||
--runtime=$runtime --time_based --norandommap"
|
||||
|
||||
log_pass "Verified Direct I/O overwrites"
|
|
@ -0,0 +1,127 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify the direct=always|disabled|standard property
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Verify direct=always behavior
|
||||
# 2. Verify direct=disabled behavior
|
||||
# 3. Verify direct=standard behavior
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
zfs set direct=standard $TESTPOOL/$TESTFS
|
||||
log_must rm -f $tmp_file
|
||||
}
|
||||
|
||||
log_assert "Verify the direct=always|disabled|standard property"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
rs=$(get_prop recordsize $TESTPOOL/$TESTFS)
|
||||
|
||||
tmp_file=$mntpnt/tmp_file
|
||||
page_size=$(getconf PAGESIZE)
|
||||
file_size=1048576
|
||||
count=8
|
||||
|
||||
#
|
||||
# Check when "direct=always" any aligned IO is done as direct.
|
||||
# Note that the "-D" and "-d" flags are not set in the following calls to
|
||||
# stride_dd.
|
||||
#
|
||||
log_must zfs set direct=always $TESTPOOL/$TESTFS
|
||||
|
||||
log_note "Aligned writes (buffered, then all direct)"
|
||||
check_write $TESTPOOL $tmp_file $rs $count 0 "" 1 $((count - 1))
|
||||
|
||||
log_note "Aligned overwrites"
|
||||
check_write $TESTPOOL $tmp_file $rs $count 0 "" 0 $count
|
||||
|
||||
log_note "Sub-recordsize unaligned overwrites"
|
||||
check_write $TESTPOOL $tmp_file $((rs / 2)) $((2 * count)) 0 "" $((2 * count)) 0
|
||||
|
||||
log_note "Sub-page size aligned overwrites"
|
||||
check_write $TESTPOOL $tmp_file 512 $count 0 "" $count 0
|
||||
evict_blocks $TESTPOOL $tmp_file $file_size
|
||||
|
||||
log_note "Aligned reads"
|
||||
check_read $TESTPOOL $tmp_file $rs $count 0 "" 0 $count
|
||||
|
||||
log_note "Sub-recordsize unaligned reads"
|
||||
check_read $TESTPOOL $tmp_file $((rs / 2)) $((count * 2)) 0 "" 0 $((2 * count))
|
||||
|
||||
log_note "Sub-page size aligned reads (one read then ARC hits)"
|
||||
check_read $TESTPOOL $tmp_file 512 $count 0 "" 1 0
|
||||
|
||||
log_must rm -f $tmp_file
|
||||
|
||||
|
||||
#
|
||||
# Check when "direct=disabled" there are never any direct requests.
|
||||
# Note that the "-D" and "-d" flags are always set in the following calls to
|
||||
# stride_dd.
|
||||
#
|
||||
log_must zfs set direct=disabled $TESTPOOL/$TESTFS
|
||||
|
||||
log_note "Aligned writes (all buffered with an extra for create)"
|
||||
check_write $TESTPOOL $tmp_file $rs $count 0 "-D" $count 0
|
||||
|
||||
log_note "Aligned overwrites"
|
||||
check_write $TESTPOOL $tmp_file $rs $count 0 "-D" $count 0
|
||||
|
||||
log_note "Aligned reads (all ARC hits)"
|
||||
check_read $TESTPOOL $tmp_file $rs $count 0 "-d" 0 0
|
||||
|
||||
log_must rm -f $tmp_file
|
||||
|
||||
|
||||
#
|
||||
# Check when "direct=standard" only requested Direct I/O occur.
|
||||
#
|
||||
log_must zfs set direct=standard $TESTPOOL/$TESTFS
|
||||
|
||||
log_note "Aligned writes/overwrites (buffered / direct)"
|
||||
check_write $TESTPOOL $tmp_file $rs $count 0 "" $count 0
|
||||
check_write $TESTPOOL $tmp_file $rs $count 0 "-D" 0 $count
|
||||
|
||||
log_note "Aligned reads (buffered / direct)"
|
||||
evict_blocks $TESTPOOL $tmp_file $file_size
|
||||
check_read $TESTPOOL $tmp_file $rs $count 0 "" $count 0
|
||||
evict_blocks $TESTPOOL $tmp_file $file_size
|
||||
check_read $TESTPOOL $tmp_file $rs $count 0 "-d" 0 $count
|
||||
|
||||
log_pass "Verify the direct=always|disabled|standard property"
|
|
@ -0,0 +1,82 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify mixed Direct I/O and buffered I/O. A workload of random
|
||||
# but correctly aligned direct read/writes is mixed with a
|
||||
# concurrent workload of entirely unaligned buffered read/writes.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a file pre-filled with zeros.
|
||||
# 2. Start a background fio randomly issuing direct read/writes.
|
||||
# 3. Start a background fio randomly issuing buffered read/writes.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
log_must rm -f "$tmp_file"
|
||||
}
|
||||
|
||||
log_assert "Verify randomly sized mixed Direct I/O and buffered I/O"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
tmp_file=$mntpnt/file
|
||||
bs=$((1024 * 1024))
|
||||
blocks=32
|
||||
size=$((bs * blocks))
|
||||
runtime=10
|
||||
page_size=$(getconf PAGESIZE)
|
||||
|
||||
log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks
|
||||
|
||||
# Direct random read/write page-aligned IO of varying sizes with
|
||||
# occasional calls to fsync(2), mixed with...
|
||||
log_must eval "fio --filename=$tmp_file --name=direct-rwrand \
|
||||
--rw=randrw --size=$size --offset_align=$(getconf PAGESIZE) \
|
||||
--bsrange=$page_size-1m --direct=1 --fsync=32 --numjobs=2 \
|
||||
--ioengine=sync --fallocate=none --verify=sha1 \
|
||||
--group_reporting --minimal --runtime=$runtime --time_based &"
|
||||
|
||||
# Buffered random read/write entirely unaligned IO of varying sizes
|
||||
# with occasional calls to fsync(2).
|
||||
log_must eval "fio --filename=$tmp_file --name=buffered-write \
|
||||
--rw=randrw --size=$size --offset_align=512 --bs_unaligned=1 \
|
||||
--bsrange=$page_size-1m --direct=0 --fsync=32 --numjobs=2 \
|
||||
--ioengine=sync --fallocate=none --verify=sha1 \
|
||||
--group_reporting --minimal --runtime=$runtime --time_based &"
|
||||
|
||||
wait
|
||||
|
||||
log_pass "Verified randomly sized mixed Direct I/O and buffered I/O"
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify different recordsizes are supported for Direct I/O.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a pool from each vdev type with varying recordsizes.
|
||||
# 2. Start sequential Direct I/O and verify with buffered I/O.
|
||||
# 3. Start mixed Direct I/O and verify with buffered I/O.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
log_assert "Verify different recordsizes are supported for Direct I/O."
|
||||
|
||||
log_onexit dio_cleanup
|
||||
|
||||
log_must truncate -s $MINVDEVSIZE $DIO_VDEVS
|
||||
|
||||
for type in "" "mirror" "raidz" "draid"; do
|
||||
for recsize in "1024" "4096" "128k"; do
|
||||
create_pool $TESTPOOL1 $type $DIO_VDEVS
|
||||
log_must eval "zfs create \
|
||||
-o recordsize=$recsize -o compression=off \
|
||||
$TESTPOOL1/$TESTFS1"
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1)
|
||||
|
||||
for bs in "4k" "128k"; do
|
||||
for op in "rw" "randrw" "write"; do
|
||||
dio_and_verify $op $DIO_FILESIZE $bs $mntpnt "sync"
|
||||
done
|
||||
done
|
||||
|
||||
destroy_pool $TESTPOOL1
|
||||
done
|
||||
done
|
||||
|
||||
log_pass "Verified different recordsizes are supported for Direct I/O."
|
|
@ -0,0 +1,78 @@
|
|||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/direct/dio.cfg
|
||||
. $STF_SUITE/tests/functional/direct/dio.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify failure for (un)aligned O_DIRECT
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a multi-block file
|
||||
# 2. Perform (un)aligned write/read verify the result.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
function cleanup
|
||||
{
|
||||
zfs set recordsize=$rs $TESTPOOL/$TESTFS
|
||||
zfs set direct=standard $TESTPOOL/$TESTFS
|
||||
log_must rm -f $tmp_file
|
||||
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_assert "Verify direct requests for (un)aligned access"
|
||||
|
||||
mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
|
||||
|
||||
rs=$(get_prop recordsize $TESTPOOL/$TESTFS)
|
||||
log_must zfs set recordsize=128k $TESTPOOL/$TESTFS
|
||||
|
||||
tmp_file=$mntpnt/tmp_file
|
||||
file_size=$((rs * 8))
|
||||
|
||||
log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1
|
||||
|
||||
log_must zfs set direct=standard $TESTPOOL/$TESTFS
|
||||
# sub-pagesize direct writes/read will always fail if direct=standard.
|
||||
log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D
|
||||
log_mustnot stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d
|
||||
|
||||
log_must zfs set direct=always $TESTPOOL/$TESTFS
|
||||
# sub-pagesize direct writes/read will always pass if direct=always.
|
||||
log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8
|
||||
log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8
|
||||
|
||||
log_must zfs set direct=disabled $TESTPOOL/$TESTFS
|
||||
# sub-pagesize direct writes/read will always pass if direct=disabled.
|
||||
log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D
|
||||
log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d
|
||||
|
||||
log_pass "Verify direct requests for (un)aligned access"
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue