Merge 'openzfs/zfs-2.2.4-staging' into truenas/zfs-2.2-release
Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
This commit is contained in:
commit
a33c8d9ce2
|
@ -8041,6 +8041,17 @@ dump_mos_leaks(spa_t *spa)
|
|||
}
|
||||
}
|
||||
|
||||
if (spa->spa_brt != NULL) {
|
||||
brt_t *brt = spa->spa_brt;
|
||||
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
|
||||
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
|
||||
if (brtvd != NULL && brtvd->bv_initiated) {
|
||||
mos_obj_refd(brtvd->bv_mos_brtvdev);
|
||||
mos_obj_refd(brtvd->bv_mos_entries);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Visit all allocated objects and make sure they are referenced.
|
||||
*/
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
#
|
||||
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
|
||||
# as they flip between FAULTED and ONLINE. If
|
||||
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
|
||||
# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
|
||||
# FAULTED, then power down the slot via sysfs:
|
||||
#
|
||||
# /sys/class/enclosure/<enclosure>/<slot>/power_status
|
||||
|
@ -19,7 +19,7 @@
|
|||
# Exit codes:
|
||||
# 0: slot successfully powered off
|
||||
# 1: enclosure not available
|
||||
# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
|
||||
# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled
|
||||
# 3: vdev was not FAULTED
|
||||
# 4: The enclosure sysfs path passed from ZFS does not exist
|
||||
# 5: Enclosure slot didn't actually turn off after we told it to
|
||||
|
@ -32,7 +32,7 @@ if [ ! -d /sys/class/enclosure ] ; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
|
||||
if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then
|
||||
exit 2
|
||||
fi
|
||||
|
||||
|
|
|
@ -205,6 +205,10 @@ zed_notify()
|
|||
[ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
|
||||
[ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
|
||||
|
||||
zed_notify_ntfy "${subject}" "${pathname}"; rv=$?
|
||||
[ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
|
||||
[ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
|
||||
|
||||
[ "${num_success}" -gt 0 ] && return 0
|
||||
[ "${num_failure}" -gt 0 ] && return 1
|
||||
return 2
|
||||
|
@ -527,6 +531,100 @@ zed_notify_pushover()
|
|||
}
|
||||
|
||||
|
||||
# zed_notify_ntfy (subject, pathname)
|
||||
#
|
||||
# Send a notification via Ntfy.sh <https://ntfy.sh/>.
|
||||
# The ntfy topic (ZED_NTFY_TOPIC) identifies the topic that the notification
|
||||
# will be sent to on the Ntfy.sh server. The ntfy url (ZED_NTFY_URL) defines the
|
||||
# self-hosted or provided hosted ntfy service location. The ntfy access token
|
||||
# <https://docs.ntfy.sh/publish/#access-tokens> (ZED_NTFY_ACCESS_TOKEN) represents an
|
||||
# access token that could be used if a topic is read/write protected. If a
|
||||
# topic can be written to publicly, a ZED_NTFY_ACCESS_TOKEN is not required.
|
||||
#
|
||||
# Requires curl and sed executables to be installed in the standard PATH.
|
||||
#
|
||||
# References
|
||||
# https://docs.ntfy.sh
|
||||
#
|
||||
# Arguments
|
||||
# subject: notification subject
|
||||
# pathname: pathname containing the notification message (OPTIONAL)
|
||||
#
|
||||
# Globals
|
||||
# ZED_NTFY_TOPIC
|
||||
# ZED_NTFY_ACCESS_TOKEN (OPTIONAL)
|
||||
# ZED_NTFY_URL
|
||||
#
|
||||
# Return
|
||||
# 0: notification sent
|
||||
# 1: notification failed
|
||||
# 2: not configured
|
||||
#
|
||||
zed_notify_ntfy()
|
||||
{
|
||||
local subject="$1"
|
||||
local pathname="${2:-"/dev/null"}"
|
||||
local msg_body
|
||||
local msg_out
|
||||
local msg_err
|
||||
|
||||
[ -n "${ZED_NTFY_TOPIC}" ] || return 2
|
||||
local url="${ZED_NTFY_URL:-"https://ntfy.sh"}/${ZED_NTFY_TOPIC}"
|
||||
|
||||
if [ ! -r "${pathname}" ]; then
|
||||
zed_log_err "ntfy cannot read \"${pathname}\""
|
||||
return 1
|
||||
fi
|
||||
|
||||
zed_check_cmd "curl" "sed" || return 1
|
||||
|
||||
# Read the message body in.
|
||||
#
|
||||
msg_body="$(cat "${pathname}")"
|
||||
|
||||
if [ -z "${msg_body}" ]
|
||||
then
|
||||
msg_body=$subject
|
||||
subject=""
|
||||
fi
|
||||
|
||||
# Send the POST request and check for errors.
|
||||
#
|
||||
if [ -n "${ZED_NTFY_ACCESS_TOKEN}" ]; then
|
||||
msg_out="$( \
|
||||
curl \
|
||||
-u ":${ZED_NTFY_ACCESS_TOKEN}" \
|
||||
-H "Title: ${subject}" \
|
||||
-d "${msg_body}" \
|
||||
-H "Priority: high" \
|
||||
"${url}" \
|
||||
2>/dev/null \
|
||||
)"; rv=$?
|
||||
else
|
||||
msg_out="$( \
|
||||
curl \
|
||||
-H "Title: ${subject}" \
|
||||
-d "${msg_body}" \
|
||||
-H "Priority: high" \
|
||||
"${url}" \
|
||||
2>/dev/null \
|
||||
)"; rv=$?
|
||||
fi
|
||||
if [ "${rv}" -ne 0 ]; then
|
||||
zed_log_err "curl exit=${rv}"
|
||||
return 1
|
||||
fi
|
||||
msg_err="$(echo "${msg_out}" \
|
||||
| sed -n -e 's/.*"errors" *:.*\[\(.*\)\].*/\1/p')"
|
||||
if [ -n "${msg_err}" ]; then
|
||||
zed_log_err "ntfy \"${msg_err}"\"
|
||||
return 1
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
|
||||
# zed_rate_limit (tag, [interval])
|
||||
#
|
||||
# Check whether an event of a given type [tag] has already occurred within the
|
||||
|
|
|
@ -146,4 +146,26 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
|
|||
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
|
||||
# help silence misbehaving drives. This assumes your drive enclosure fully
|
||||
# supports slot power control via sysfs.
|
||||
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1
|
||||
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1
|
||||
|
||||
##
|
||||
# Ntfy topic
|
||||
# This defines which topic will receive the ntfy notification.
|
||||
# <https://docs.ntfy.sh/publish/>
|
||||
# Disabled by default; uncomment to enable.
|
||||
#ZED_NTFY_TOPIC=""
|
||||
|
||||
##
|
||||
# Ntfy access token (optional for public topics)
|
||||
# This defines an access token which can be used
|
||||
# to allow you to authenticate when sending to topics
|
||||
# <https://docs.ntfy.sh/publish/#access-tokens>
|
||||
# Disabled by default; uncomment to enable.
|
||||
#ZED_NTFY_ACCESS_TOKEN=""
|
||||
|
||||
##
|
||||
# Ntfy Service URL
|
||||
# This defines which service the ntfy call will be directed toward
|
||||
# <https://docs.ntfy.sh/install/>
|
||||
# https://ntfy.sh by default; uncomment to enable an alternative service url.
|
||||
#ZED_NTFY_URL="https://ntfy.sh"
|
||||
|
|
|
@ -3672,15 +3672,25 @@ zfs_do_list(int argc, char **argv)
|
|||
|
||||
for (char *tok; (tok = strsep(&optarg, ",")); ) {
|
||||
static const char *const type_subopts[] = {
|
||||
"filesystem", "volume",
|
||||
"snapshot", "snap",
|
||||
"filesystem",
|
||||
"fs",
|
||||
"volume",
|
||||
"vol",
|
||||
"snapshot",
|
||||
"snap",
|
||||
"bookmark",
|
||||
"all" };
|
||||
"all"
|
||||
};
|
||||
static const int type_types[] = {
|
||||
ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
|
||||
ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT,
|
||||
ZFS_TYPE_FILESYSTEM,
|
||||
ZFS_TYPE_FILESYSTEM,
|
||||
ZFS_TYPE_VOLUME,
|
||||
ZFS_TYPE_VOLUME,
|
||||
ZFS_TYPE_SNAPSHOT,
|
||||
ZFS_TYPE_SNAPSHOT,
|
||||
ZFS_TYPE_BOOKMARK,
|
||||
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK };
|
||||
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK
|
||||
};
|
||||
|
||||
for (c = 0; c < ARRAY_SIZE(type_subopts); ++c)
|
||||
if (strcmp(tok, type_subopts[c]) == 0) {
|
||||
|
|
|
@ -2161,6 +2161,7 @@ typedef struct status_cbdata {
|
|||
boolean_t cb_explain;
|
||||
boolean_t cb_first;
|
||||
boolean_t cb_dedup_stats;
|
||||
boolean_t cb_print_unhealthy;
|
||||
boolean_t cb_print_status;
|
||||
boolean_t cb_print_slow_ios;
|
||||
boolean_t cb_print_vdev_init;
|
||||
|
@ -2357,6 +2358,35 @@ health_str_to_color(const char *health)
|
|||
return (NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called for each leaf vdev. Returns 0 if the vdev is healthy.
|
||||
* A vdev is unhealthy if any of the following are true:
|
||||
* 1) there are read, write, or checksum errors,
|
||||
* 2) its state is not ONLINE, or
|
||||
* 3) slow IO reporting was requested (-s) and there are slow IOs.
|
||||
*/
|
||||
static int
|
||||
vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data)
|
||||
{
|
||||
status_cbdata_t *cb = data;
|
||||
vdev_stat_t *vs;
|
||||
uint_t vsc;
|
||||
(void) hdl_data;
|
||||
|
||||
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
|
||||
(uint64_t **)&vs, &vsc) != 0)
|
||||
return (1);
|
||||
|
||||
if (vs->vs_checksum_errors || vs->vs_read_errors ||
|
||||
vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY)
|
||||
return (1);
|
||||
|
||||
if (cb->cb_print_slow_ios && vs->vs_slow_ios)
|
||||
return (1);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Print out configuration state as requested by status_callback.
|
||||
*/
|
||||
|
@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
const char *state;
|
||||
const char *type;
|
||||
const char *path = NULL;
|
||||
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
|
||||
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL,
|
||||
*scolor = NULL;
|
||||
|
||||
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
|
||||
&child, &children) != 0)
|
||||
|
@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
state = gettext("AVAIL");
|
||||
}
|
||||
|
||||
/*
|
||||
* If '-e' is specified then top-level vdevs and their children
|
||||
* can be pruned if all of their leaves are healthy.
|
||||
*/
|
||||
if (cb->cb_print_unhealthy && depth > 0 &&
|
||||
for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
printf_color(health_str_to_color(state),
|
||||
"\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
|
||||
name, state);
|
||||
|
@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
if (vs->vs_checksum_errors)
|
||||
ccolor = ANSI_RED;
|
||||
|
||||
if (vs->vs_slow_ios)
|
||||
scolor = ANSI_BLUE;
|
||||
|
||||
if (cb->cb_literal) {
|
||||
fputc(' ', stdout);
|
||||
printf_color(rcolor, "%5llu",
|
||||
|
@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
|||
}
|
||||
|
||||
if (cb->cb_literal)
|
||||
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
|
||||
printf_color(scolor, " %5llu",
|
||||
(u_longlong_t)vs->vs_slow_ios);
|
||||
else
|
||||
printf(" %5s", rbuf);
|
||||
printf_color(scolor, " %5s", rbuf);
|
||||
}
|
||||
if (cb->cb_print_power) {
|
||||
if (children == 0) {
|
||||
|
@ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data)
|
|||
(void) printf(gettext(
|
||||
"errors: No known data errors\n"));
|
||||
} else if (!cbp->cb_verbose) {
|
||||
color_start(ANSI_RED);
|
||||
(void) printf(gettext("errors: %llu data "
|
||||
"errors, use '-v' for a list\n"),
|
||||
(u_longlong_t)nerr);
|
||||
color_end();
|
||||
} else {
|
||||
print_error_log(zhp);
|
||||
}
|
||||
|
@ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data)
|
|||
* [pool] [interval [count]]
|
||||
*
|
||||
* -c CMD For each vdev, run command CMD
|
||||
* -e Display only unhealthy vdevs
|
||||
* -i Display vdev initialization status.
|
||||
* -g Display guid for individual vdev name.
|
||||
* -L Follow links when resolving vdev path name.
|
||||
|
@ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv)
|
|||
};
|
||||
|
||||
/* check options */
|
||||
while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options,
|
||||
while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
|
||||
NULL)) != -1) {
|
||||
switch (c) {
|
||||
case 'c':
|
||||
|
@ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv)
|
|||
}
|
||||
cmd = optarg;
|
||||
break;
|
||||
case 'e':
|
||||
cb.cb_print_unhealthy = B_TRUE;
|
||||
break;
|
||||
case 'i':
|
||||
cb.cb_print_vdev_init = B_TRUE;
|
||||
break;
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
|
||||
ZFS_LINUX_TEST_SRC([page_size], [
|
||||
#include <linux/mm.h>
|
||||
],[
|
||||
unsigned long s;
|
||||
s = page_size(NULL);
|
||||
])
|
||||
])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
|
||||
AC_MSG_CHECKING([whether page_size() is available])
|
||||
ZFS_LINUX_TEST_RESULT([page_size], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
|
@ -16,6 +16,9 @@ dnl #
|
|||
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
|
||||
dnl # generic_copy_file_range() added to support it
|
||||
dnl #
|
||||
dnl # 6.8: generic_copy_file_range() removed, replaced by
|
||||
dnl # splice_copy_file_range()
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
|
||||
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
|
||||
#include <linux/fs.h>
|
||||
|
@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
|
|||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
|
||||
ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
|
||||
#include <linux/splice.h>
|
||||
], [
|
||||
struct file *src_file __attribute__ ((unused)) = NULL;
|
||||
loff_t src_off __attribute__ ((unused)) = 0;
|
||||
struct file *dst_file __attribute__ ((unused)) = NULL;
|
||||
loff_t dst_off __attribute__ ((unused)) = 0;
|
||||
size_t len __attribute__ ((unused)) = 0;
|
||||
splice_copy_file_range(src_file, src_off, dst_file, dst_off,
|
||||
len);
|
||||
])
|
||||
])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
|
||||
AC_MSG_CHECKING([whether splice_copy_file_range() is available])
|
||||
ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
|
||||
[splice_copy_file_range() is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
|
||||
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
|
||||
#include <linux/fs.h>
|
||||
|
|
|
@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|||
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
|
||||
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
|
||||
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
|
||||
|
@ -166,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
|||
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
|
||||
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
|
||||
ZFS_AC_KERNEL_SRC_SYNC_BDEV
|
||||
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
||||
|
@ -266,6 +268,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|||
ZFS_AC_KERNEL_VFS_IOV_ITER
|
||||
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
|
||||
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
|
||||
|
@ -314,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
|||
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
|
||||
ZFS_AC_KERNEL_COPY_SPLICE_READ
|
||||
ZFS_AC_KERNEL_SYNC_BDEV
|
||||
ZFS_AC_KERNEL_MM_PAGE_SIZE
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
||||
|
|
|
@ -286,7 +286,6 @@ typedef struct zfid_long {
|
|||
|
||||
extern uint_t zfs_fsyncer_key;
|
||||
extern int zfs_super_owner;
|
||||
extern int zfs_bclone_enabled;
|
||||
|
||||
extern void zfs_init(void);
|
||||
extern void zfs_fini(void);
|
||||
|
|
|
@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
|
|||
%D%/kernel/linux/compiler_compat.h \
|
||||
%D%/kernel/linux/dcache_compat.h \
|
||||
%D%/kernel/linux/kmap_compat.h \
|
||||
%D%/kernel/linux/mm_compat.h \
|
||||
%D%/kernel/linux/mod_compat.h \
|
||||
%D%/kernel/linux/page_compat.h \
|
||||
%D%/kernel/linux/percpu_compat.h \
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* CDDL HEADER START
|
||||
*
|
||||
* The contents of this file are subject to the terms of the
|
||||
* Common Development and Distribution License (the "License").
|
||||
* You may not use this file except in compliance with the License.
|
||||
*
|
||||
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
* or https://opensource.org/licenses/CDDL-1.0.
|
||||
* See the License for the specific language governing permissions
|
||||
* and limitations under the License.
|
||||
*
|
||||
* When distributing Covered Code, include this CDDL HEADER in each
|
||||
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
* If applicable, add the following below this CDDL HEADER, with the
|
||||
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
*
|
||||
* CDDL HEADER END
|
||||
*/
|
||||
|
||||
/*
|
||||
* Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
#ifndef _ZFS_MM_COMPAT_H
|
||||
#define _ZFS_MM_COMPAT_H
|
||||
|
||||
#include <linux/mm.h>
|
||||
|
||||
/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
|
||||
#ifndef HAVE_MM_PAGE_SIZE
|
||||
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
|
||||
#endif
|
||||
|
||||
#endif /* _ZFS_MM_COMPAT_H */
|
|
@ -68,6 +68,7 @@ enum scope_prefix_types {
|
|||
zfs_trim,
|
||||
zfs_txg,
|
||||
zfs_vdev,
|
||||
zfs_vdev_disk,
|
||||
zfs_vdev_file,
|
||||
zfs_vdev_mirror,
|
||||
zfs_vnops,
|
||||
|
|
|
@ -45,8 +45,6 @@ extern "C" {
|
|||
typedef struct zfsvfs zfsvfs_t;
|
||||
struct znode;
|
||||
|
||||
extern int zfs_bclone_enabled;
|
||||
|
||||
/*
|
||||
* This structure emulates the vfs_t from other platforms. Its purpose
|
||||
* is to facilitate the handling of mount options and minimize structural
|
||||
|
|
|
@ -79,6 +79,9 @@ typedef struct abd {
|
|||
|
||||
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
|
||||
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
|
||||
#if defined(__linux__) && defined(_KERNEL)
|
||||
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
|
||||
#endif
|
||||
|
||||
extern int zfs_abd_scatter_enabled;
|
||||
|
||||
|
@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
|
|||
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
|
||||
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
|
||||
abd_iter_func2_t *, void *);
|
||||
#if defined(__linux__) && defined(_KERNEL)
|
||||
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
|
||||
void *);
|
||||
#endif
|
||||
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
|
||||
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
|
||||
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
|
||||
|
@ -213,6 +220,8 @@ void abd_fini(void);
|
|||
|
||||
/*
|
||||
* Linux ABD bio functions
|
||||
* Note: these are only needed to support vdev_classic. See comment in
|
||||
* vdev_disk.c.
|
||||
*/
|
||||
#if defined(__linux__) && defined(_KERNEL)
|
||||
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
/*
|
||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||||
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
#ifndef _ABD_IMPL_H
|
||||
|
@ -38,12 +39,30 @@ typedef enum abd_stats_op {
|
|||
ABDSTAT_DECR /* Decrease abdstat values */
|
||||
} abd_stats_op_t;
|
||||
|
||||
struct scatterlist; /* forward declaration */
|
||||
/* forward declarations */
|
||||
struct scatterlist;
|
||||
struct page;
|
||||
|
||||
struct abd_iter {
|
||||
/* public interface */
|
||||
void *iter_mapaddr; /* addr corresponding to iter_pos */
|
||||
size_t iter_mapsize; /* length of data valid at mapaddr */
|
||||
union {
|
||||
/* for abd_iter_map()/abd_iter_unmap() */
|
||||
struct {
|
||||
/* addr corresponding to iter_pos */
|
||||
void *iter_mapaddr;
|
||||
/* length of data valid at mapaddr */
|
||||
size_t iter_mapsize;
|
||||
};
|
||||
/* for abd_iter_page() */
|
||||
struct {
|
||||
/* current page */
|
||||
struct page *iter_page;
|
||||
/* offset of data in page */
|
||||
size_t iter_page_doff;
|
||||
/* size of data in page */
|
||||
size_t iter_page_dsize;
|
||||
};
|
||||
};
|
||||
|
||||
/* private */
|
||||
abd_t *iter_abd; /* ABD being iterated through */
|
||||
|
@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
|
|||
void abd_iter_advance(struct abd_iter *, size_t);
|
||||
void abd_iter_map(struct abd_iter *);
|
||||
void abd_iter_unmap(struct abd_iter *);
|
||||
void abd_iter_page(struct abd_iter *);
|
||||
|
||||
/*
|
||||
* Helper macros
|
||||
|
|
|
@ -24,8 +24,11 @@
|
|||
|
||||
#ifndef _SYS_FS_ZFS_VNOPS_H
|
||||
#define _SYS_FS_ZFS_VNOPS_H
|
||||
|
||||
#include <sys/zfs_vnops_os.h>
|
||||
|
||||
extern int zfs_bclone_enabled;
|
||||
|
||||
extern int zfs_fsync(znode_t *, int, cred_t *);
|
||||
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
|
||||
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
|
||||
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
|
||||
.\" Copyright (c) 2019 Datto Inc.
|
||||
.\" Copyright (c) 2023, 2024 Klara, Inc.
|
||||
.\" The contents of this file are subject to the terms of the Common Development
|
||||
.\" and Distribution License (the "License"). You may not use this file except
|
||||
.\" in compliance with the License. You can obtain a copy of the license at
|
||||
|
@ -15,7 +16,7 @@
|
|||
.\" own identifying information:
|
||||
.\" Portions Copyright [yyyy] [name of copyright owner]
|
||||
.\"
|
||||
.Dd July 21, 2023
|
||||
.Dd January 9, 2024
|
||||
.Dt ZFS 4
|
||||
.Os
|
||||
.
|
||||
|
@ -1142,6 +1143,15 @@ Enable the experimental block cloning feature.
|
|||
If this setting is 0, then even if feature@block_cloning is enabled,
|
||||
attempts to clone blocks will act as though the feature is disabled.
|
||||
.
|
||||
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
|
||||
written to disk.
|
||||
This allows the clone operation to reliably succeed when a file is
|
||||
modified and then immediately cloned.
|
||||
For small files this may be slower than making a copy of the file.
|
||||
Therefore, this setting defaults to 0 which causes a clone operation to
|
||||
immediately fail when encountering a dirty block.
|
||||
.
|
||||
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
|
||||
Select a BLAKE3 implementation.
|
||||
.Pp
|
||||
|
@ -1336,6 +1346,42 @@ _
|
|||
4 Driver No driver retries on driver errors.
|
||||
.TE
|
||||
.
|
||||
.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
|
||||
Maximum number of segments to add to a BIO (min 4).
|
||||
If this is higher than the maximum allowed by the device queue or the kernel
|
||||
itself, it will be clamped.
|
||||
Setting it to zero will cause the kernel's ideal size to be used.
|
||||
This parameter only applies on Linux.
|
||||
This parameter is ignored if
|
||||
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
|
||||
.
|
||||
.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
|
||||
Controls the method used to submit IO to the Linux block layer
|
||||
(default
|
||||
.Sy 1 "classic" Ns
|
||||
)
|
||||
.Pp
|
||||
If set to 1, the "classic" method is used.
|
||||
This is the method that has been in use since the earliest versions of
|
||||
ZFS-on-Linux.
|
||||
It has known issues with highly fragmented IO requests and is less efficient on
|
||||
many workloads, but it is well known and well understood.
|
||||
.Pp
|
||||
If set to 0, the "new" method is used.
|
||||
This method is available since 2.2.4 and should resolve all known issues and be
|
||||
far more efficient, but has not had as much testing.
|
||||
In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
|
||||
.Pp
|
||||
It is not recommended that you change it except on advice from the OpenZFS
|
||||
developers.
|
||||
If you do change it, please also open a bug report describing why you did so,
|
||||
including the workload involved and any error messages.
|
||||
.Pp
|
||||
This parameter and the "classic" submission method will be removed in a future
|
||||
release of OpenZFS once we have total confidence in the new method.
|
||||
.Pp
|
||||
This parameter only applies on Linux, and can only be set at module load time.
|
||||
.
|
||||
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
||||
Time before expiring
|
||||
.Pa .zfs/snapshot .
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
.\" Copyright 2018 Nexenta Systems, Inc.
|
||||
.\" Copyright 2019 Joyent, Inc.
|
||||
.\"
|
||||
.Dd March 16, 2022
|
||||
.Dd February 8, 2024
|
||||
.Dt ZFS-LIST 8
|
||||
.Os
|
||||
.
|
||||
|
@ -155,6 +155,15 @@ or
|
|||
For example, specifying
|
||||
.Fl t Sy snapshot
|
||||
displays only snapshots.
|
||||
.Sy fs ,
|
||||
.Sy snap ,
|
||||
or
|
||||
.Sy vol
|
||||
can be used as aliases for
|
||||
.Sy filesystem ,
|
||||
.Sy snapshot ,
|
||||
or
|
||||
.Sy volume .
|
||||
.El
|
||||
.
|
||||
.Sh EXAMPLES
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
.Sh SYNOPSIS
|
||||
.Nm zpool
|
||||
.Cm status
|
||||
.Op Fl DigLpPstvx
|
||||
.Op Fl DeigLpPstvx
|
||||
.Op Fl T Sy u Ns | Ns Sy d
|
||||
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
|
||||
.Oo Ar pool Oc Ns …
|
||||
|
@ -69,6 +69,8 @@ See the
|
|||
option of
|
||||
.Nm zpool Cm iostat
|
||||
for complete details.
|
||||
.It Fl e
|
||||
Only show unhealthy vdevs (not-ONLINE or with errors).
|
||||
.It Fl i
|
||||
Display vdev initialization status.
|
||||
.It Fl g
|
||||
|
|
|
@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) {
|
|||
|
||||
static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
|
||||
int nparams = clLvalue(ci->func)->p->numparams;
|
||||
if (n >= ci->u.l.base - ci->func - nparams)
|
||||
int nvararg = cast_int(ci->u.l.base - ci->func) - nparams;
|
||||
if (n <= -nvararg)
|
||||
return NULL; /* no such vararg */
|
||||
else {
|
||||
*pos = ci->func + nparams + n;
|
||||
*pos = ci->func + nparams - n;
|
||||
return "(*vararg)"; /* generic name for any vararg */
|
||||
}
|
||||
}
|
||||
|
@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n,
|
|||
StkId base;
|
||||
if (isLua(ci)) {
|
||||
if (n < 0) /* access to vararg values? */
|
||||
return findvararg(ci, -n, pos);
|
||||
return findvararg(ci, n, pos);
|
||||
else {
|
||||
base = ci->u.l.base;
|
||||
name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
|
||||
|
|
|
@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
|||
{
|
||||
ASSERT(!abd_is_gang(abd));
|
||||
abd_verify(abd);
|
||||
memset(aiter, 0, sizeof (struct abd_iter));
|
||||
aiter->iter_abd = abd;
|
||||
aiter->iter_pos = 0;
|
||||
aiter->iter_mapaddr = NULL;
|
||||
aiter->iter_mapsize = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -89,10 +89,6 @@ int zfs_debug_level;
|
|||
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
|
||||
"Debug level");
|
||||
|
||||
int zfs_bclone_enabled = 0;
|
||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
|
||||
&zfs_bclone_enabled, 0, "Enable block cloning");
|
||||
|
||||
struct zfs_jailparam {
|
||||
int mount_snapshot;
|
||||
};
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
/*
|
||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||||
* Copyright (c) 2019 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
@ -59,7 +60,9 @@
|
|||
#include <sys/zfs_znode.h>
|
||||
#ifdef _KERNEL
|
||||
#include <linux/kmap_compat.h>
|
||||
#include <linux/mm_compat.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/version.h>
|
||||
#endif
|
||||
|
||||
#ifdef _KERNEL
|
||||
|
@ -895,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
|||
{
|
||||
ASSERT(!abd_is_gang(abd));
|
||||
abd_verify(abd);
|
||||
memset(aiter, 0, sizeof (struct abd_iter));
|
||||
aiter->iter_abd = abd;
|
||||
aiter->iter_mapaddr = NULL;
|
||||
aiter->iter_mapsize = 0;
|
||||
aiter->iter_pos = 0;
|
||||
if (abd_is_linear(abd)) {
|
||||
aiter->iter_offset = 0;
|
||||
aiter->iter_sg = NULL;
|
||||
} else {
|
||||
if (!abd_is_linear(abd)) {
|
||||
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
|
||||
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
|
||||
}
|
||||
|
@ -915,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
|||
boolean_t
|
||||
abd_iter_at_end(struct abd_iter *aiter)
|
||||
{
|
||||
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
|
||||
return (aiter->iter_pos == aiter->iter_abd->abd_size);
|
||||
}
|
||||
|
||||
|
@ -926,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter)
|
|||
void
|
||||
abd_iter_advance(struct abd_iter *aiter, size_t amount)
|
||||
{
|
||||
/*
|
||||
* Ensure that last chunk is not in use. abd_iterate_*() must clear
|
||||
* this state (directly or via abd_iter_unmap()) before advancing.
|
||||
*/
|
||||
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
|
||||
ASSERT0(aiter->iter_mapsize);
|
||||
ASSERT3P(aiter->iter_page, ==, NULL);
|
||||
ASSERT0(aiter->iter_page_doff);
|
||||
ASSERT0(aiter->iter_page_dsize);
|
||||
|
||||
/* There's nothing left to advance to, so do nothing */
|
||||
if (abd_iter_at_end(aiter))
|
||||
|
@ -1009,6 +1015,106 @@ abd_cache_reap_now(void)
|
|||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
/*
|
||||
* Yield the next page struct and data offset and size within it, without
|
||||
* mapping it into the address space.
|
||||
*/
|
||||
void
|
||||
abd_iter_page(struct abd_iter *aiter)
|
||||
{
|
||||
if (abd_iter_at_end(aiter)) {
|
||||
aiter->iter_page = NULL;
|
||||
aiter->iter_page_doff = 0;
|
||||
aiter->iter_page_dsize = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
struct page *page;
|
||||
size_t doff, dsize;
|
||||
|
||||
if (abd_is_linear(aiter->iter_abd)) {
|
||||
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
|
||||
|
||||
/* memory address at iter_pos */
|
||||
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
|
||||
|
||||
/* struct page for address */
|
||||
page = is_vmalloc_addr(paddr) ?
|
||||
vmalloc_to_page(paddr) : virt_to_page(paddr);
|
||||
|
||||
/* offset of address within the page */
|
||||
doff = offset_in_page(paddr);
|
||||
|
||||
/* total data remaining in abd from this position */
|
||||
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
|
||||
} else {
|
||||
ASSERT(!abd_is_gang(aiter->iter_abd));
|
||||
|
||||
/* current scatter page */
|
||||
page = sg_page(aiter->iter_sg);
|
||||
|
||||
/* position within page */
|
||||
doff = aiter->iter_offset;
|
||||
|
||||
/* remaining data in scatterlist */
|
||||
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
|
||||
aiter->iter_abd->abd_size - aiter->iter_pos);
|
||||
}
|
||||
ASSERT(page);
|
||||
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
|
||||
if (PageTail(page)) {
|
||||
/*
|
||||
* This page is part of a "compound page", which is a group of
|
||||
* pages that can be referenced from a single struct page *.
|
||||
* Its organised as a "head" page, followed by a series of
|
||||
* "tail" pages.
|
||||
*
|
||||
* In OpenZFS, compound pages are allocated using the
|
||||
* __GFP_COMP flag, which we get from scatter ABDs and SPL
|
||||
* vmalloc slabs (ie >16K allocations). So a great many of the
|
||||
* IO buffers we get are going to be of this type.
|
||||
*
|
||||
* The tail pages are just regular PAGE_SIZE pages, and can be
|
||||
* safely used as-is. However, the head page has length
|
||||
* covering itself and all the tail pages. If this ABD chunk
|
||||
* spans multiple pages, then we can use the head page and a
|
||||
* >PAGE_SIZE length, which is far more efficient.
|
||||
*
|
||||
* To do this, we need to adjust the offset to be counted from
|
||||
* the head page. struct page for compound pages are stored
|
||||
* contiguously, so we can just adjust by a simple offset.
|
||||
*
|
||||
* Before kernel 4.5, compound page heads were refcounted
|
||||
* separately, such that moving back to the head page would
|
||||
* require us to take a reference to it and releasing it once
|
||||
* we're completely finished with it. In practice, that means
|
||||
* when our caller is done with the ABD, which we have no
|
||||
* insight into from here. Rather than contort this API to
|
||||
* track head page references on such ancient kernels, we just
|
||||
* compile this block out and use the tail pages directly. This
|
||||
* is slightly less efficient, but makes everything far
|
||||
* simpler.
|
||||
*/
|
||||
struct page *head = compound_head(page);
|
||||
doff += ((page - head) * PAGESIZE);
|
||||
page = head;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* final page and position within it */
|
||||
aiter->iter_page = page;
|
||||
aiter->iter_page_doff = doff;
|
||||
|
||||
/* amount of data in the chunk, up to the end of the page */
|
||||
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: ABD BIO functions only needed to support vdev_classic. See comments in
|
||||
* vdev_disk.c.
|
||||
*/
|
||||
|
||||
/*
|
||||
* bio_nr_pages for ABD.
|
||||
* @off is the offset in @abd
|
||||
|
@ -1163,4 +1269,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
|
|||
module_param(zfs_abd_scatter_max_order, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
|
||||
"Maximum order allocation used for a scatter ABD.");
|
||||
#endif
|
||||
|
||||
#endif /* _KERNEL */
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
||||
* LLNL-CODE-403049.
|
||||
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2023, 2024, Klara Inc.
|
||||
*/
|
||||
|
||||
#include <sys/zfs_context.h>
|
||||
|
@ -66,6 +67,13 @@ typedef struct vdev_disk {
|
|||
krwlock_t vd_lock;
|
||||
} vdev_disk_t;
|
||||
|
||||
/*
|
||||
* Maximum number of segments to add to a bio (min 4). If this is higher than
|
||||
* the maximum allowed by the device queue or the kernel itself, it will be
|
||||
* clamped. Setting it to zero will cause the kernel's ideal size to be used.
|
||||
*/
|
||||
uint_t zfs_vdev_disk_max_segs = 0;
|
||||
|
||||
/*
|
||||
* Unique identifier for the exclusive vdev holder.
|
||||
*/
|
||||
|
@ -83,17 +91,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
|
|||
*/
|
||||
#define EFI_MIN_RESV_SIZE (16 * 1024)
|
||||
|
||||
/*
|
||||
* Virtual device vector for disks.
|
||||
*/
|
||||
typedef struct dio_request {
|
||||
zio_t *dr_zio; /* Parent ZIO */
|
||||
atomic_t dr_ref; /* References */
|
||||
int dr_error; /* Bio error */
|
||||
int dr_bio_count; /* Count of bio's */
|
||||
struct bio *dr_bio[]; /* Attached bio's */
|
||||
} dio_request_t;
|
||||
|
||||
/*
|
||||
* BIO request failfast mask.
|
||||
*/
|
||||
|
@ -457,95 +454,15 @@ vdev_disk_close(vdev_t *v)
|
|||
if (v->vdev_reopening || vd == NULL)
|
||||
return;
|
||||
|
||||
if (vd->vd_bdh != NULL) {
|
||||
if (vd->vd_bdh != NULL)
|
||||
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
|
||||
zfs_vdev_holder);
|
||||
}
|
||||
|
||||
rw_destroy(&vd->vd_lock);
|
||||
kmem_free(vd, sizeof (vdev_disk_t));
|
||||
v->vdev_tsd = NULL;
|
||||
}
|
||||
|
||||
static dio_request_t *
|
||||
vdev_disk_dio_alloc(int bio_count)
|
||||
{
|
||||
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
atomic_set(&dr->dr_ref, 0);
|
||||
dr->dr_bio_count = bio_count;
|
||||
dr->dr_error = 0;
|
||||
|
||||
for (int i = 0; i < dr->dr_bio_count; i++)
|
||||
dr->dr_bio[i] = NULL;
|
||||
|
||||
return (dr);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_disk_dio_free(dio_request_t *dr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < dr->dr_bio_count; i++)
|
||||
if (dr->dr_bio[i])
|
||||
bio_put(dr->dr_bio[i]);
|
||||
|
||||
kmem_free(dr, sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * dr->dr_bio_count);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_disk_dio_get(dio_request_t *dr)
|
||||
{
|
||||
atomic_inc(&dr->dr_ref);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_disk_dio_put(dio_request_t *dr)
|
||||
{
|
||||
int rc = atomic_dec_return(&dr->dr_ref);
|
||||
|
||||
/*
|
||||
* Free the dio_request when the last reference is dropped and
|
||||
* ensure zio_interpret is called only once with the correct zio
|
||||
*/
|
||||
if (rc == 0) {
|
||||
zio_t *zio = dr->dr_zio;
|
||||
int error = dr->dr_error;
|
||||
|
||||
vdev_disk_dio_free(dr);
|
||||
|
||||
if (zio) {
|
||||
zio->io_error = error;
|
||||
ASSERT3S(zio->io_error, >=, 0);
|
||||
if (zio->io_error)
|
||||
vdev_disk_error(zio);
|
||||
|
||||
zio_delay_interrupt(zio);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
||||
{
|
||||
dio_request_t *dr = bio->bi_private;
|
||||
|
||||
if (dr->dr_error == 0) {
|
||||
#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
dr->dr_error = BIO_END_IO_ERROR(bio);
|
||||
#else
|
||||
if (error)
|
||||
dr->dr_error = -(error);
|
||||
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
dr->dr_error = EIO;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Drop reference acquired by __vdev_disk_physio */
|
||||
vdev_disk_dio_put(dr);
|
||||
}
|
||||
|
||||
static inline void
|
||||
vdev_submit_bio_impl(struct bio *bio)
|
||||
{
|
||||
|
@ -697,8 +614,462 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
|||
return (bio);
|
||||
}
|
||||
|
||||
static inline uint_t
|
||||
vdev_bio_max_segs(struct block_device *bdev)
|
||||
{
|
||||
/*
|
||||
* Smallest of the device max segs and the tuneable max segs. Minimum
|
||||
* 4, so there's room to finish split pages if they come up.
|
||||
*/
|
||||
const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
|
||||
const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
|
||||
MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
|
||||
const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
|
||||
|
||||
#ifdef HAVE_BIO_MAX_SEGS
|
||||
return (bio_max_segs(max_segs));
|
||||
#else
|
||||
return (MIN(max_segs, BIO_MAX_PAGES));
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint_t
|
||||
vdev_bio_max_bytes(struct block_device *bdev)
|
||||
{
|
||||
return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Virtual block IO object (VBIO)
|
||||
*
|
||||
* Linux block IO (BIO) objects have a limit on how many data segments (pages)
|
||||
* they can hold. Depending on how they're allocated and structured, a large
|
||||
* ZIO can require more than one BIO to be submitted to the kernel, which then
|
||||
* all have to complete before we can return the completed ZIO back to ZFS.
|
||||
*
|
||||
* A VBIO is a wrapper around multiple BIOs, carrying everything needed to
|
||||
* translate a ZIO down into the kernel block layer and back again.
|
||||
*
|
||||
* Note that these are only used for data ZIOs (read/write). Meta-operations
|
||||
* (flush/trim) don't need multiple BIOs and so can just make the call
|
||||
* directly.
|
||||
*/
|
||||
typedef struct {
|
||||
zio_t *vbio_zio; /* parent zio */
|
||||
|
||||
struct block_device *vbio_bdev; /* blockdev to submit bios to */
|
||||
|
||||
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
|
||||
|
||||
uint_t vbio_max_segs; /* max segs per bio */
|
||||
|
||||
uint_t vbio_max_bytes; /* max bytes per bio */
|
||||
uint_t vbio_lbs_mask; /* logical block size mask */
|
||||
|
||||
uint64_t vbio_offset; /* start offset of next bio */
|
||||
|
||||
struct bio *vbio_bio; /* pointer to the current bio */
|
||||
int vbio_flags; /* bio flags */
|
||||
} vbio_t;
|
||||
|
||||
static vbio_t *
|
||||
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
|
||||
{
|
||||
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
|
||||
|
||||
vbio->vbio_zio = zio;
|
||||
vbio->vbio_bdev = bdev;
|
||||
vbio->vbio_abd = NULL;
|
||||
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
|
||||
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
|
||||
vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
|
||||
vbio->vbio_offset = zio->io_offset;
|
||||
vbio->vbio_bio = NULL;
|
||||
vbio->vbio_flags = flags;
|
||||
|
||||
return (vbio);
|
||||
}
|
||||
|
||||
BIO_END_IO_PROTO(vbio_completion, bio, error);
|
||||
|
||||
static int
|
||||
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
||||
{
|
||||
struct bio *bio = vbio->vbio_bio;
|
||||
uint_t ssize;
|
||||
|
||||
while (size > 0) {
|
||||
if (bio == NULL) {
|
||||
/* New BIO, allocate and set up */
|
||||
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
|
||||
vbio->vbio_max_segs);
|
||||
VERIFY(bio);
|
||||
|
||||
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
|
||||
bio_set_op_attrs(bio,
|
||||
vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
||||
WRITE : READ, vbio->vbio_flags);
|
||||
|
||||
if (vbio->vbio_bio) {
|
||||
bio_chain(vbio->vbio_bio, bio);
|
||||
vdev_submit_bio(vbio->vbio_bio);
|
||||
}
|
||||
vbio->vbio_bio = bio;
|
||||
}
|
||||
|
||||
/*
|
||||
* Only load as much of the current page data as will fit in
|
||||
* the space left in the BIO, respecting lbs alignment. Older
|
||||
* kernels will error if we try to overfill the BIO, while
|
||||
* newer ones will accept it and split the BIO. This ensures
|
||||
* everything works on older kernels, and avoids an additional
|
||||
* overhead on the new.
|
||||
*/
|
||||
ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
|
||||
vbio->vbio_lbs_mask);
|
||||
if (ssize > 0 &&
|
||||
bio_add_page(bio, page, ssize, offset) == ssize) {
|
||||
/* Accepted, adjust and load any remaining. */
|
||||
size -= ssize;
|
||||
offset += ssize;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* No room, set up for a new BIO and loop */
|
||||
vbio->vbio_offset += BIO_BI_SIZE(bio);
|
||||
|
||||
/* Signal new BIO allocation wanted */
|
||||
bio = NULL;
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/* Iterator callback to submit ABD pages to the vbio. */
|
||||
static int
|
||||
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
{
|
||||
vbio_t *vbio = priv;
|
||||
return (vbio_add_page(vbio, page, len, off));
|
||||
}
|
||||
|
||||
/* Create some BIOs, fill them with data and submit them */
|
||||
static void
|
||||
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
||||
{
|
||||
ASSERT(vbio->vbio_bdev);
|
||||
|
||||
/*
|
||||
* We plug so we can submit the BIOs as we go and only unplug them when
|
||||
* they are fully created and submitted. This is important; if we don't
|
||||
* plug, then the kernel may start executing earlier BIOs while we're
|
||||
* still creating and executing later ones, and if the device goes
|
||||
* away while that's happening, older kernels can get confused and
|
||||
* trample memory.
|
||||
*/
|
||||
struct blk_plug plug;
|
||||
blk_start_plug(&plug);
|
||||
|
||||
(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
|
||||
ASSERT(vbio->vbio_bio);
|
||||
|
||||
vbio->vbio_bio->bi_end_io = vbio_completion;
|
||||
vbio->vbio_bio->bi_private = vbio;
|
||||
|
||||
vdev_submit_bio(vbio->vbio_bio);
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
vbio->vbio_bio = NULL;
|
||||
vbio->vbio_bdev = NULL;
|
||||
}
|
||||
|
||||
/* IO completion callback */
|
||||
BIO_END_IO_PROTO(vbio_completion, bio, error)
|
||||
{
|
||||
vbio_t *vbio = bio->bi_private;
|
||||
zio_t *zio = vbio->vbio_zio;
|
||||
|
||||
ASSERT(zio);
|
||||
|
||||
/* Capture and log any errors */
|
||||
#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
zio->io_error = BIO_END_IO_ERROR(bio);
|
||||
#else
|
||||
zio->io_error = 0;
|
||||
if (error)
|
||||
zio->io_error = -(error);
|
||||
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
zio->io_error = EIO;
|
||||
#endif
|
||||
ASSERT3U(zio->io_error, >=, 0);
|
||||
|
||||
if (zio->io_error)
|
||||
vdev_disk_error(zio);
|
||||
|
||||
/* Return the BIO to the kernel */
|
||||
bio_put(bio);
|
||||
|
||||
/*
|
||||
* If we copied the ABD before issuing it, clean up and return the copy
|
||||
* to the ADB, with changes if appropriate.
|
||||
*/
|
||||
if (vbio->vbio_abd != NULL) {
|
||||
void *buf = abd_to_buf(vbio->vbio_abd);
|
||||
abd_free(vbio->vbio_abd);
|
||||
vbio->vbio_abd = NULL;
|
||||
|
||||
if (zio->io_type == ZIO_TYPE_READ)
|
||||
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
||||
else
|
||||
abd_return_buf(zio->io_abd, buf, zio->io_size);
|
||||
}
|
||||
|
||||
/* Final cleanup */
|
||||
kmem_free(vbio, sizeof (vbio_t));
|
||||
|
||||
/* All done, submit for processing */
|
||||
zio_delay_interrupt(zio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterator callback to count ABD pages and check their size & alignment.
|
||||
*
|
||||
* On Linux, each BIO segment can take a page pointer, and an offset+length of
|
||||
* the data within that page. A page can be arbitrarily large ("compound"
|
||||
* pages) but we still have to ensure the data portion is correctly sized and
|
||||
* aligned to the logical block size, to ensure that if the kernel wants to
|
||||
* split the BIO, the two halves will still be properly aligned.
|
||||
*/
|
||||
typedef struct {
|
||||
uint_t bmask;
|
||||
uint_t npages;
|
||||
uint_t end;
|
||||
} vdev_disk_check_pages_t;
|
||||
|
||||
static int
|
||||
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||
{
|
||||
vdev_disk_check_pages_t *s = priv;
|
||||
|
||||
/*
|
||||
* If we didn't finish on a block size boundary last time, then there
|
||||
* would be a gap if we tried to use this ABD as-is, so abort.
|
||||
*/
|
||||
if (s->end != 0)
|
||||
return (1);
|
||||
|
||||
/*
|
||||
* Note if we're taking less than a full block, so we can check it
|
||||
* above on the next call.
|
||||
*/
|
||||
s->end = len & s->bmask;
|
||||
|
||||
/* All blocks after the first must start on a block size boundary. */
|
||||
if (s->npages != 0 && (off & s->bmask) != 0)
|
||||
return (1);
|
||||
|
||||
s->npages++;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we can submit the pages in this ABD to the kernel as-is. Returns
|
||||
* the number of pages, or 0 if it can't be submitted like this.
|
||||
*/
|
||||
static boolean_t
|
||||
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
|
||||
{
|
||||
vdev_disk_check_pages_t s = {
|
||||
.bmask = bdev_logical_block_size(bdev)-1,
|
||||
.npages = 0,
|
||||
.end = 0,
|
||||
};
|
||||
|
||||
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
|
||||
return (B_FALSE);
|
||||
|
||||
return (B_TRUE);
|
||||
}
|
||||
|
||||
static int
|
||||
vdev_disk_io_rw(zio_t *zio)
|
||||
{
|
||||
vdev_t *v = zio->io_vd;
|
||||
vdev_disk_t *vd = v->vdev_tsd;
|
||||
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
||||
int flags = 0;
|
||||
|
||||
/*
|
||||
* Accessing outside the block device is never allowed.
|
||||
*/
|
||||
if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
|
||||
vdev_dbgmsg(zio->io_vd,
|
||||
"Illegal access %llu size %llu, device size %llu",
|
||||
(u_longlong_t)zio->io_offset,
|
||||
(u_longlong_t)zio->io_size,
|
||||
(u_longlong_t)i_size_read(bdev->bd_inode));
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
|
||||
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
||||
v->vdev_failfast == B_TRUE) {
|
||||
bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
|
||||
zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check alignment of the incoming ABD. If any part of it would require
|
||||
* submitting a page that is not aligned to the logical block size,
|
||||
* then we take a copy into a linear buffer and submit that instead.
|
||||
* This should be impossible on a 512b LBS, and fairly rare on 4K,
|
||||
* usually requiring abnormally-small data blocks (eg gang blocks)
|
||||
* mixed into the same ABD as larger ones (eg aggregated).
|
||||
*/
|
||||
abd_t *abd = zio->io_abd;
|
||||
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
|
||||
void *buf;
|
||||
if (zio->io_type == ZIO_TYPE_READ)
|
||||
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
|
||||
else
|
||||
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
|
||||
|
||||
/*
|
||||
* Wrap the copy in an abd_t, so we can use the same iterators
|
||||
* to count and fill the vbio later.
|
||||
*/
|
||||
abd = abd_get_from_buf(buf, zio->io_size);
|
||||
|
||||
/*
|
||||
* False here would mean the borrowed copy has an invalid
|
||||
* alignment too, which would mean we've somehow been passed a
|
||||
* linear ABD with an interior page that has a non-zero offset
|
||||
* or a size not a multiple of PAGE_SIZE. This is not possible.
|
||||
* It would mean either zio_buf_alloc() or its underlying
|
||||
* allocators have done something extremely strange, or our
|
||||
* math in vdev_disk_check_pages() is wrong. In either case,
|
||||
* something in seriously wrong and its not safe to continue.
|
||||
*/
|
||||
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
|
||||
}
|
||||
|
||||
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
|
||||
vbio_t *vbio = vbio_alloc(zio, bdev, flags);
|
||||
if (abd != zio->io_abd)
|
||||
vbio->vbio_abd = abd;
|
||||
|
||||
/* Fill it with data pages and submit it to the kernel */
|
||||
vbio_submit(vbio, abd, zio->io_size);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/* ========== */
|
||||
|
||||
/*
|
||||
* This is the classic, battle-tested BIO submission code. Until we're totally
|
||||
* sure that the new code is safe and correct in all cases, this will remain
|
||||
* available.
|
||||
*
|
||||
* It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
|
||||
* enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
|
||||
*
|
||||
* These functions have been renamed to vdev_classic_* to make it clear what
|
||||
* they belong to, but their implementations are unchanged.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Virtual device vector for disks.
|
||||
*/
|
||||
typedef struct dio_request {
|
||||
zio_t *dr_zio; /* Parent ZIO */
|
||||
atomic_t dr_ref; /* References */
|
||||
int dr_error; /* Bio error */
|
||||
int dr_bio_count; /* Count of bio's */
|
||||
struct bio *dr_bio[]; /* Attached bio's */
|
||||
} dio_request_t;
|
||||
|
||||
static dio_request_t *
|
||||
vdev_classic_dio_alloc(int bio_count)
|
||||
{
|
||||
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
atomic_set(&dr->dr_ref, 0);
|
||||
dr->dr_bio_count = bio_count;
|
||||
dr->dr_error = 0;
|
||||
|
||||
for (int i = 0; i < dr->dr_bio_count; i++)
|
||||
dr->dr_bio[i] = NULL;
|
||||
|
||||
return (dr);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_classic_dio_free(dio_request_t *dr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < dr->dr_bio_count; i++)
|
||||
if (dr->dr_bio[i])
|
||||
bio_put(dr->dr_bio[i]);
|
||||
|
||||
kmem_free(dr, sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * dr->dr_bio_count);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_classic_dio_get(dio_request_t *dr)
|
||||
{
|
||||
atomic_inc(&dr->dr_ref);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_classic_dio_put(dio_request_t *dr)
|
||||
{
|
||||
int rc = atomic_dec_return(&dr->dr_ref);
|
||||
|
||||
/*
|
||||
* Free the dio_request when the last reference is dropped and
|
||||
* ensure zio_interpret is called only once with the correct zio
|
||||
*/
|
||||
if (rc == 0) {
|
||||
zio_t *zio = dr->dr_zio;
|
||||
int error = dr->dr_error;
|
||||
|
||||
vdev_classic_dio_free(dr);
|
||||
|
||||
if (zio) {
|
||||
zio->io_error = error;
|
||||
ASSERT3S(zio->io_error, >=, 0);
|
||||
if (zio->io_error)
|
||||
vdev_disk_error(zio);
|
||||
|
||||
zio_delay_interrupt(zio);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
|
||||
{
|
||||
dio_request_t *dr = bio->bi_private;
|
||||
|
||||
if (dr->dr_error == 0) {
|
||||
#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||
dr->dr_error = BIO_END_IO_ERROR(bio);
|
||||
#else
|
||||
if (error)
|
||||
dr->dr_error = -(error);
|
||||
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
dr->dr_error = EIO;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Drop reference acquired by vdev_classic_physio */
|
||||
vdev_classic_dio_put(dr);
|
||||
}
|
||||
|
||||
static inline unsigned int
|
||||
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||
{
|
||||
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
|
||||
bio_size, abd_offset);
|
||||
|
@ -711,9 +1082,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
|||
}
|
||||
|
||||
static int
|
||||
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
||||
size_t io_size, uint64_t io_offset, int rw, int flags)
|
||||
vdev_classic_physio(zio_t *zio)
|
||||
{
|
||||
vdev_t *v = zio->io_vd;
|
||||
vdev_disk_t *vd = v->vdev_tsd;
|
||||
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
||||
size_t io_size = zio->io_size;
|
||||
uint64_t io_offset = zio->io_offset;
|
||||
int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
|
||||
int flags = 0;
|
||||
|
||||
dio_request_t *dr;
|
||||
uint64_t abd_offset;
|
||||
uint64_t bio_offset;
|
||||
|
@ -736,7 +1114,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
|||
}
|
||||
|
||||
retry:
|
||||
dr = vdev_disk_dio_alloc(bio_count);
|
||||
dr = vdev_classic_dio_alloc(bio_count);
|
||||
|
||||
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
||||
zio->io_vd->vdev_failfast == B_TRUE) {
|
||||
|
@ -771,23 +1149,23 @@ retry:
|
|||
* this should be rare - see the comment above.
|
||||
*/
|
||||
if (dr->dr_bio_count == i) {
|
||||
vdev_disk_dio_free(dr);
|
||||
vdev_classic_dio_free(dr);
|
||||
bio_count *= 2;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
|
||||
nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
|
||||
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
|
||||
if (unlikely(dr->dr_bio[i] == NULL)) {
|
||||
vdev_disk_dio_free(dr);
|
||||
vdev_classic_dio_free(dr);
|
||||
return (SET_ERROR(ENOMEM));
|
||||
}
|
||||
|
||||
/* Matching put called by vdev_disk_physio_completion */
|
||||
vdev_disk_dio_get(dr);
|
||||
/* Matching put called by vdev_classic_physio_completion */
|
||||
vdev_classic_dio_get(dr);
|
||||
|
||||
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
|
||||
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
|
||||
dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
|
||||
dr->dr_bio[i]->bi_private = dr;
|
||||
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
|
||||
|
||||
|
@ -801,7 +1179,7 @@ retry:
|
|||
}
|
||||
|
||||
/* Extra reference to protect dio_request during vdev_submit_bio */
|
||||
vdev_disk_dio_get(dr);
|
||||
vdev_classic_dio_get(dr);
|
||||
|
||||
if (dr->dr_bio_count > 1)
|
||||
blk_start_plug(&plug);
|
||||
|
@ -815,11 +1193,13 @@ retry:
|
|||
if (dr->dr_bio_count > 1)
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
vdev_disk_dio_put(dr);
|
||||
vdev_classic_dio_put(dr);
|
||||
|
||||
return (error);
|
||||
}
|
||||
|
||||
/* ========== */
|
||||
|
||||
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
|
||||
{
|
||||
zio_t *zio = bio->bi_private;
|
||||
|
@ -928,12 +1308,14 @@ vdev_disk_io_trim(zio_t *zio)
|
|||
#endif
|
||||
}
|
||||
|
||||
int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
|
||||
|
||||
static void
|
||||
vdev_disk_io_start(zio_t *zio)
|
||||
{
|
||||
vdev_t *v = zio->io_vd;
|
||||
vdev_disk_t *vd = v->vdev_tsd;
|
||||
int rw, error;
|
||||
int error;
|
||||
|
||||
/*
|
||||
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
|
||||
|
@ -996,13 +1378,6 @@ vdev_disk_io_start(zio_t *zio)
|
|||
rw_exit(&vd->vd_lock);
|
||||
zio_execute(zio);
|
||||
return;
|
||||
case ZIO_TYPE_WRITE:
|
||||
rw = WRITE;
|
||||
break;
|
||||
|
||||
case ZIO_TYPE_READ:
|
||||
rw = READ;
|
||||
break;
|
||||
|
||||
case ZIO_TYPE_TRIM:
|
||||
zio->io_error = vdev_disk_io_trim(zio);
|
||||
|
@ -1015,23 +1390,34 @@ vdev_disk_io_start(zio_t *zio)
|
|||
#endif
|
||||
return;
|
||||
|
||||
case ZIO_TYPE_READ:
|
||||
case ZIO_TYPE_WRITE:
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
error = vdev_disk_io_rw_fn(zio);
|
||||
rw_exit(&vd->vd_lock);
|
||||
if (error) {
|
||||
zio->io_error = error;
|
||||
zio_interrupt(zio);
|
||||
}
|
||||
return;
|
||||
|
||||
default:
|
||||
/*
|
||||
* Getting here means our parent vdev has made a very strange
|
||||
* request of us, and shouldn't happen. Assert here to force a
|
||||
* crash in dev builds, but in production return the IO
|
||||
* unhandled. The pool will likely suspend anyway but that's
|
||||
* nicer than crashing the kernel.
|
||||
*/
|
||||
ASSERT3S(zio->io_type, ==, -1);
|
||||
|
||||
rw_exit(&vd->vd_lock);
|
||||
zio->io_error = SET_ERROR(ENOTSUP);
|
||||
zio_interrupt(zio);
|
||||
return;
|
||||
}
|
||||
|
||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||
error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
|
||||
zio->io_size, zio->io_offset, rw, 0);
|
||||
rw_exit(&vd->vd_lock);
|
||||
|
||||
if (error) {
|
||||
zio->io_error = error;
|
||||
zio_interrupt(zio);
|
||||
return;
|
||||
}
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -1080,8 +1466,49 @@ vdev_disk_rele(vdev_t *vd)
|
|||
/* XXX: Implement me as a vnode rele for the device */
|
||||
}
|
||||
|
||||
/*
|
||||
* BIO submission method. See comment above about vdev_classic.
|
||||
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
|
||||
*/
|
||||
static uint_t zfs_vdev_disk_classic = 1; /* default classic */
|
||||
|
||||
/* Set submission function from module parameter */
|
||||
static int
|
||||
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
|
||||
{
|
||||
int err = param_set_uint(buf, kp);
|
||||
if (err < 0)
|
||||
return (SET_ERROR(err));
|
||||
|
||||
vdev_disk_io_rw_fn =
|
||||
zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
|
||||
|
||||
printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
|
||||
zfs_vdev_disk_classic ? "classic" : "new");
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* At first use vdev use, set the submission function from the default value if
|
||||
* it hasn't been set already.
|
||||
*/
|
||||
static int
|
||||
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
||||
{
|
||||
(void) spa;
|
||||
(void) nv;
|
||||
(void) tsd;
|
||||
|
||||
if (vdev_disk_io_rw_fn == NULL)
|
||||
vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
|
||||
vdev_classic_physio : vdev_disk_io_rw;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
vdev_ops_t vdev_disk_ops = {
|
||||
.vdev_op_init = NULL,
|
||||
.vdev_op_init = vdev_disk_init,
|
||||
.vdev_op_fini = NULL,
|
||||
.vdev_op_open = vdev_disk_open,
|
||||
.vdev_op_close = vdev_disk_close,
|
||||
|
@ -1174,3 +1601,10 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
|
|||
|
||||
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
||||
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
|
||||
"Maximum number of data segments to add to an IO request (min 4)");
|
||||
|
||||
ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
|
||||
vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
|
||||
"Use classic BIO submission method");
|
||||
|
|
|
@ -3821,11 +3821,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
|||
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
||||
zfs_sa_upgrade_txholds(tx, zp);
|
||||
|
||||
err = dmu_tx_assign(tx, TXG_NOWAIT);
|
||||
err = dmu_tx_assign(tx, TXG_WAIT);
|
||||
if (err != 0) {
|
||||
if (err == ERESTART)
|
||||
dmu_tx_wait(tx);
|
||||
|
||||
dmu_tx_abort(tx);
|
||||
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
||||
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
|
||||
|
@ -4277,9 +4274,4 @@ EXPORT_SYMBOL(zfs_map);
|
|||
/* CSTYLED */
|
||||
module_param(zfs_delete_blocks, ulong, 0644);
|
||||
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
|
||||
|
||||
/* CSTYLED */
|
||||
module_param(zfs_bclone_enabled, uint, 0644);
|
||||
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
|
||||
|
||||
#endif
|
||||
|
|
|
@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
|
|||
{
|
||||
boolean_t *for_sync = data;
|
||||
fstrans_cookie_t cookie;
|
||||
int ret;
|
||||
|
||||
ASSERT(PageLocked(pp));
|
||||
ASSERT(!PageWriteback(pp));
|
||||
|
||||
cookie = spl_fstrans_mark();
|
||||
(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
||||
ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
||||
spl_fstrans_unmark(cookie);
|
||||
|
||||
return (0);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
#ifdef HAVE_WRITEPAGE_T_FOLIO
|
||||
static int
|
||||
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
|
||||
{
|
||||
(void) zpl_putpage(&pp->page, wbc, data);
|
||||
return (0);
|
||||
return (zpl_putpage(&pp->page, wbc, data));
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -26,13 +26,14 @@
|
|||
#include <linux/compat.h>
|
||||
#endif
|
||||
#include <linux/fs.h>
|
||||
#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
|
||||
#include <linux/splice.h>
|
||||
#endif
|
||||
#include <sys/file.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zfs_vnops.h>
|
||||
#include <sys/zfeature.h>
|
||||
|
||||
int zfs_bclone_enabled = 0;
|
||||
|
||||
/*
|
||||
* Clone part of a file via block cloning.
|
||||
*
|
||||
|
@ -40,7 +41,7 @@ int zfs_bclone_enabled = 0;
|
|||
* care of that depending on how it was called.
|
||||
*/
|
||||
static ssize_t
|
||||
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
|
||||
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
|
||||
struct file *dst_file, loff_t dst_off, size_t len)
|
||||
{
|
||||
struct inode *src_i = file_inode(src_file);
|
||||
|
@ -96,14 +97,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|||
{
|
||||
ssize_t ret;
|
||||
|
||||
/* Flags is reserved for future extensions and must be zero. */
|
||||
if (flags != 0)
|
||||
return (-EINVAL);
|
||||
|
||||
/* Try to do it via zfs_clone_range() */
|
||||
ret = __zpl_clone_file_range(src_file, src_off,
|
||||
/* Try to do it via zfs_clone_range() and allow shortening. */
|
||||
ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||
dst_file, dst_off, len);
|
||||
|
||||
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
|
||||
#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
|
||||
/*
|
||||
* Since Linux 5.3 the filesystem driver is responsible for executing
|
||||
* an appropriate fallback, and a generic fallback function is provided.
|
||||
|
@ -112,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|||
ret == -EAGAIN)
|
||||
ret = generic_copy_file_range(src_file, src_off, dst_file,
|
||||
dst_off, len, flags);
|
||||
#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
|
||||
/*
|
||||
* Since 6.8 the fallback function is called splice_copy_file_range
|
||||
* and has a slightly different signature.
|
||||
*/
|
||||
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
|
||||
ret == -EAGAIN)
|
||||
ret = splice_copy_file_range(src_file, src_off, dst_file,
|
||||
dst_off, len);
|
||||
#else
|
||||
/*
|
||||
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
|
||||
|
@ -119,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|||
*/
|
||||
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
|
||||
ret = -EOPNOTSUPP;
|
||||
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
|
||||
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
@ -137,6 +148,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
|||
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
|
||||
* range in both files and if they're the same, arrange for them to be backed
|
||||
* by the same storage.
|
||||
*
|
||||
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
|
||||
* if we want. It's designed for filesystems that may need to shorten the
|
||||
* length for alignment, EOF, or any other requirement. ZFS may shorten the
|
||||
* request when there is outstanding dirty data which hasn't been written.
|
||||
*/
|
||||
loff_t
|
||||
zpl_remap_file_range(struct file *src_file, loff_t src_off,
|
||||
|
@ -145,24 +161,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
|
|||
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
|
||||
return (-EINVAL);
|
||||
|
||||
/*
|
||||
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
|
||||
* range if we want. Its designed for filesystems that make data past
|
||||
* EOF available, and don't want it to be visible in both files. ZFS
|
||||
* doesn't do that, so we just turn the flag off.
|
||||
*/
|
||||
flags &= ~REMAP_FILE_CAN_SHORTEN;
|
||||
|
||||
/* No support for dedup yet */
|
||||
if (flags & REMAP_FILE_DEDUP)
|
||||
/* No support for dedup yet */
|
||||
return (-EOPNOTSUPP);
|
||||
|
||||
/* Zero length means to clone everything to the end of the file */
|
||||
if (len == 0)
|
||||
len = i_size_read(file_inode(src_file)) - src_off;
|
||||
|
||||
return (__zpl_clone_file_range(src_file, src_off,
|
||||
dst_file, dst_off, len));
|
||||
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||
dst_file, dst_off, len);
|
||||
|
||||
if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
|
||||
ret = -EINVAL;
|
||||
|
||||
return (ret);
|
||||
}
|
||||
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
|
||||
|
||||
|
@ -179,8 +192,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
|
|||
if (len == 0)
|
||||
len = i_size_read(file_inode(src_file)) - src_off;
|
||||
|
||||
return (__zpl_clone_file_range(src_file, src_off,
|
||||
dst_file, dst_off, len));
|
||||
/* The entire length must be cloned or this is an error. */
|
||||
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||
dst_file, dst_off, len);
|
||||
|
||||
if (ret >= 0 && ret != len)
|
||||
ret = -EINVAL;
|
||||
|
||||
return (ret);
|
||||
}
|
||||
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
|
||||
|
||||
|
@ -214,8 +233,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
|
|||
|
||||
size_t len = i_size_read(file_inode(src_file));
|
||||
|
||||
ssize_t ret =
|
||||
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
|
||||
ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
|
||||
|
||||
fput(src_file);
|
||||
|
||||
|
@ -253,7 +271,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
|
|||
if (len == 0)
|
||||
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
|
||||
|
||||
ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
|
||||
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
|
||||
dst_file, fcr.fcr_dest_offset, len);
|
||||
|
||||
fput(src_file);
|
||||
|
|
|
@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
|
|||
return (ret);
|
||||
}
|
||||
|
||||
#if defined(__linux__) && defined(_KERNEL)
|
||||
/*
 * Walk the byte range [off, off + size) of an ABD and invoke func once per
 * chunk, passing the iterator's current page (iter_page), the data offset
 * recorded for it (iter_page_doff), the chunk length, and the caller's
 * private pointer.
 *
 * Each chunk is limited to the smaller of the iterator's iter_page_dsize and
 * the number of bytes still to be visited.
 *
 * Returns 0 if size is 0 (func is never called) or if every call to func
 * returned 0; otherwise returns the first nonzero value returned by func,
 * at which point the walk stops.
 */
int
|
||||
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
|
||||
abd_iter_page_func_t *func, void *private)
|
||||
{
|
||||
struct abd_iter aiter;
|
||||
int ret = 0;
|
||||
|
||||
if (size == 0)
|
||||
return (0);
|
||||
|
||||
abd_verify(abd);
|
||||
ASSERT3U(off + size, <=, abd->abd_size);
|
||||
|
||||
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
|
||||
|
||||
while (size > 0) {
|
||||
/* For a gang ABD there must be a current child ABD to iterate. */
IMPLY(abd_is_gang(abd), c_abd != NULL);
|
||||
|
||||
/*
 * NOTE(review): presumably fills iter_page, iter_page_doff and
 * iter_page_dsize for the current position - confirm against
 * abd_iter_page().
 */
abd_iter_page(&aiter);
|
||||
|
||||
size_t len = MIN(aiter.iter_page_dsize, size);
|
||||
ASSERT3U(len, >, 0);
|
||||
|
||||
ret = func(aiter.iter_page, aiter.iter_page_doff,
|
||||
len, private);
|
||||
|
||||
/* Reset the per-page iterator fields once func has consumed them. */
aiter.iter_page = NULL;
|
||||
aiter.iter_page_doff = 0;
|
||||
aiter.iter_page_dsize = 0;
|
||||
|
||||
/* A nonzero return from func aborts the walk and is propagated. */
if (ret != 0)
|
||||
break;
|
||||
|
||||
size -= len;
|
||||
c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
|
||||
}
|
||||
|
||||
return (ret);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct buf_arg {
|
||||
void *arg_buf;
|
||||
};
|
||||
|
|
|
@ -1822,7 +1822,8 @@ spa_get_slop_space(spa_t *spa)
|
|||
* deduplicated data, so since it's not useful to reserve more
|
||||
* space with more deduplicated data, we subtract that out here.
|
||||
*/
|
||||
space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
|
||||
space =
|
||||
spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
|
||||
slop = MIN(space >> spa_slop_shift, spa_max_slop);
|
||||
|
||||
/*
|
||||
|
|
|
@ -58,6 +58,26 @@
|
|||
#include <sys/zfs_vfsops.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
|
||||
/*
|
||||
* Enable the experimental block cloning feature. If this setting is 0, then
|
||||
* even if feature@block_cloning is enabled, attempts to clone blocks will act
|
||||
* as though the feature is disabled.
|
||||
*/
|
||||
int zfs_bclone_enabled = 0;
|
||||
|
||||
/*
|
||||
* When set zfs_clone_range() waits for dirty data to be written to disk.
|
||||
* This allows the clone operation to reliably succeed when a file is modified
|
||||
* and then immediately cloned. For small files this may be slower than making
|
||||
* a copy of the file and is therefore not the default. However, in certain
|
||||
* scenarios this behavior may be desirable so a tunable is provided.
|
||||
*/
|
||||
static int zfs_bclone_wait_dirty = 0;
|
||||
|
||||
/*
|
||||
* Maximum bytes to read per chunk in zfs_read().
|
||||
*/
|
||||
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
|
||||
|
||||
static ulong_t zfs_fsync_sync_cnt = 4;
|
||||
|
||||
|
@ -110,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
|
|||
|
||||
/* Flush any mmap()'d data to disk */
|
||||
if (zn_has_cached_data(zp, 0, file_sz - 1))
|
||||
zn_flush_cached_data(zp, B_FALSE);
|
||||
zn_flush_cached_data(zp, B_TRUE);
|
||||
|
||||
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
|
||||
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
||||
|
@ -189,8 +209,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
|
|||
return (error);
|
||||
}
|
||||
|
||||
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
|
||||
|
||||
/*
|
||||
* Read bytes from specified file into supplied buffer.
|
||||
*
|
||||
|
@ -1055,6 +1073,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
|||
size_t maxblocks, nbps;
|
||||
uint_t inblksz;
|
||||
uint64_t clear_setid_bits_txg = 0;
|
||||
uint64_t last_synced_txg = 0;
|
||||
|
||||
inoff = *inoffp;
|
||||
outoff = *outoffp;
|
||||
|
@ -1174,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
|||
}
|
||||
}
|
||||
|
||||
/* Flush any mmap()'d data to disk */
|
||||
if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
|
||||
zn_flush_cached_data(inzp, B_TRUE);
|
||||
|
||||
/*
|
||||
* Maintain predictable lock order.
|
||||
*/
|
||||
|
@ -1293,15 +1316,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
|||
}
|
||||
|
||||
nbps = maxblocks;
|
||||
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
|
||||
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
|
||||
&nbps);
|
||||
if (error != 0) {
|
||||
/*
|
||||
* If we are trying to clone a block that was created
|
||||
* in the current transaction group, error will be
|
||||
* EAGAIN here, which we can just return to the caller
|
||||
* so it can fallback if it likes.
|
||||
* in the current transaction group, the error will be
|
||||
* EAGAIN here. Based on zfs_bclone_wait_dirty either
|
||||
* return a shortened range to the caller so it can
|
||||
* fallback, or wait for the next TXG and check again.
|
||||
*/
|
||||
if (error == EAGAIN && zfs_bclone_wait_dirty) {
|
||||
txg_wait_synced(dmu_objset_pool(inos),
|
||||
last_synced_txg + 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1523,3 +1554,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);
|
|||
|
||||
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
|
||||
"Bytes to read per chunk");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
|
||||
"Enable block cloning");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
|
||||
"Wait for dirty blocks when cloning");
|
||||
|
|
|
@ -535,7 +535,8 @@ tags = ['functional', 'cli_root', 'zpool_split']
|
|||
tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
|
||||
'zpool_status_003_pos', 'zpool_status_004_pos',
|
||||
'zpool_status_005_pos', 'zpool_status_006_pos',
|
||||
'zpool_status_007_pos', 'zpool_status_features_001_pos']
|
||||
'zpool_status_007_pos', 'zpool_status_008_pos',
|
||||
'zpool_status_features_001_pos']
|
||||
tags = ['functional', 'cli_root', 'zpool_status']
|
||||
|
||||
[tests/functional/cli_root/zpool_sync]
|
||||
|
@ -630,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
|
|||
tags = ['functional', 'compression']
|
||||
|
||||
[tests/functional/cp_files]
|
||||
tests = ['cp_files_001_pos', 'cp_stress']
|
||||
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
|
||||
tags = ['functional', 'cp_files']
|
||||
|
||||
[tests/functional/crtime]
|
||||
|
|
|
@ -138,7 +138,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+'
|
|||
# copy_file_range() is not supported by all kernels
|
||||
#
|
||||
cfr_reason = 'Kernel copy_file_range support required'
|
||||
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'
|
||||
|
||||
if sys.platform.startswith('freebsd'):
|
||||
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+'
|
||||
else:
|
||||
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'
|
||||
|
||||
#
|
||||
# These tests are known to fail, thus we use this list to prevent these
|
||||
|
@ -176,6 +180,7 @@ if sys.platform.startswith('freebsd'):
|
|||
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
|
||||
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
|
||||
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
|
||||
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
|
||||
'link_count/link_count_001': ['SKIP', na_reason],
|
||||
'casenorm/mixed_create_failure': ['FAIL', 13215],
|
||||
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
|
||||
|
@ -267,6 +272,22 @@ if sys.platform.startswith('freebsd'):
|
|||
'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
|
||||
'resilver/resilver_restart_001': ['FAIL', known_reason],
|
||||
'snapshot/snapshot_002_pos': ['FAIL', '14831'],
|
||||
'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_crossfs_corner_cases_limited':
|
||||
['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason],
|
||||
'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason],
|
||||
'block_cloning/block_cloning_cross_enc_dataset':
|
||||
['SKIP', cfr_cross_reason],
|
||||
'block_cloning/block_cloning_copyfilerange_cross_dataset':
|
||||
['SKIP', cfr_cross_reason]
|
||||
})
|
||||
elif sys.platform.startswith('linux'):
|
||||
maybe.update({
|
||||
|
@ -312,6 +333,7 @@ elif sys.platform.startswith('linux'):
|
|||
['SKIP', cfr_reason],
|
||||
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
|
||||
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
|
||||
'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
|
||||
'fault/auto_online_002_pos': ['FAIL', 11889],
|
||||
'fault/auto_replace_001_pos': ['FAIL', 14851],
|
||||
'fault/auto_spare_002_pos': ['FAIL', 11889],
|
||||
|
|
|
@ -61,13 +61,8 @@ function compare_version_gte
|
|||
[ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ]
|
||||
}
|
||||
|
||||
# Linux kernel version comparison function
|
||||
#
|
||||
# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version
|
||||
#
|
||||
# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ]
|
||||
#
|
||||
function linux_version
|
||||
# Helper function used by linux_version() and freebsd_version()
|
||||
function kernel_version
|
||||
{
|
||||
typeset ver="$1"
|
||||
|
||||
|
@ -83,6 +78,24 @@ function linux_version
|
|||
echo $((version * 100000 + major * 1000 + minor))
|
||||
}
|
||||
|
||||
# Linux kernel version comparison function
|
||||
#
|
||||
# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version
|
||||
#
|
||||
# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ]
|
||||
# Thin wrapper: forwards the optional version string in $1 unchanged to
# kernel_version, which produces the comparable number.
function linux_version {
|
||||
kernel_version "$1"
|
||||
}
|
||||
|
||||
# FreeBSD version comparison function
|
||||
#
|
||||
# $1 FreeBSD version ("13.2", "14.0") or blank for installed FreeBSD version
|
||||
#
|
||||
# Used for comparison: if [ $(freebsd_version) -ge $(freebsd_version "13.2") ]
|
||||
# Thin wrapper: forwards the optional version string in $1 unchanged to
# kernel_version, which produces the comparable number.
function freebsd_version {
|
||||
kernel_version "$1"
|
||||
}
|
||||
|
||||
# Determine if this is a Linux test system
|
||||
#
|
||||
# Return 0 if platform Linux, 1 if otherwise
|
||||
|
|
|
@ -90,7 +90,8 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
|
|||
VOL_MODE vol.mode zvol_volmode
|
||||
VOL_RECURSIVE vol.recursive UNSUPPORTED
|
||||
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
|
||||
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
|
||||
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
|
||||
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
|
||||
XATTR_COMPAT xattr_compat zfs_xattr_compat
|
||||
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
|
||||
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
|
||||
|
|
|
@ -1238,6 +1238,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
|||
functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
|
||||
functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
|
||||
functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
|
||||
functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
|
||||
functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
|
||||
functional/cli_root/zpool_sync/cleanup.ksh \
|
||||
functional/cli_root/zpool_sync/setup.ksh \
|
||||
|
@ -1393,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
|||
functional/compression/setup.ksh \
|
||||
functional/cp_files/cleanup.ksh \
|
||||
functional/cp_files/cp_files_001_pos.ksh \
|
||||
functional/cp_files/cp_files_002_pos.ksh \
|
||||
functional/cp_files/cp_stress.ksh \
|
||||
functional/cp_files/setup.ksh \
|
||||
functional/crtime/cleanup.ksh \
|
||||
|
|
|
@ -42,6 +42,12 @@ function verify_crossfs_block_cloning
|
|||
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
||||
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
||||
fi
|
||||
|
||||
# Cross dataset block cloning only supported on FreeBSD 14+
|
||||
# https://github.com/freebsd/freebsd-src/commit/969071be938c
|
||||
if is_freebsd && [ $(freebsd_version) -lt $(freebsd_version 14.0) ] ; then
|
||||
log_unsupported "Cloning across datasets not supported in $(uname -r)"
|
||||
fi
|
||||
}
|
||||
|
||||
# Unused.
|
||||
|
|
|
@ -26,12 +26,11 @@
|
|||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
||||
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
||||
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
||||
fi
|
||||
verify_crossfs_block_cloning
|
||||
|
||||
claim="The copy_file_range syscall can clone across datasets."
|
||||
|
||||
|
|
|
@ -26,12 +26,11 @@
|
|||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
||||
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
||||
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
||||
fi
|
||||
verify_crossfs_block_cloning
|
||||
|
||||
claim="Block cloning across encrypted datasets."
|
||||
|
||||
|
|
|
@ -51,7 +51,7 @@ else
|
|||
fi
|
||||
|
||||
set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \
|
||||
"-vx $testpool"
|
||||
"-vx $testpool" "-e $testpool" "-es $testpool"
|
||||
|
||||
log_assert "Executing 'zpool status' with correct options succeeds"
|
||||
|
||||
|
@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do
|
|||
(( i = i + 1 ))
|
||||
done
|
||||
|
||||
cleanup
|
||||
|
||||
log_pass "'zpool status' with correct options succeeded"
|
||||
|
|
|
@ -37,6 +37,7 @@
|
|||
# 3. Read the file
|
||||
# 4. Take a snapshot and make a clone
|
||||
# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v'
|
||||
# and 'zpool status -ev'
|
||||
|
||||
function cleanup
|
||||
{
|
||||
|
@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2
|
|||
log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
|
||||
log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
|
||||
log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
|
||||
log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'"
|
||||
log_mustnot eval "zpool status -v | grep '$TESTFS1'"
|
||||
|
||||
log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone"
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
#!/bin/ksh -p
|
||||
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# This file and its contents are supplied under the terms of the
|
||||
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||
# You may only use this file in accordance with the terms of version
|
||||
# 1.0 of the CDDL.
|
||||
#
|
||||
# A full copy of the text of the CDDL should have accompanied this
|
||||
# source. A copy of the CDDL is also available via the Internet at
|
||||
# http://www.illumos.org/license/CDDL.
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify 'zpool status -e' only shows unhealthy devices.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create zpool
|
||||
# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
|
||||
# 3. Verify vdevs are reported correctly with -e and -s
|
||||
# 4. Verify parents are reported as DEGRADED
|
||||
# 5. Verify healthy children are not reported
|
||||
#
|
||||
|
||||
# Test exit handler: restores ZIO_SLOW_IO_MS to the value saved in
# OLD_SLOW_IO, cancels all zinject handlers, destroys TESTPOOL2 if it still
# exists, and removes the file vdevs listed in all_vdevs.
# NOTE(review): OLD_SLOW_IO and all_vdevs are assigned by the script after
# log_onexit registers this handler - confirm an early exit cannot reach
# here with either of them unset.
function cleanup
|
||||
{
|
||||
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
|
||||
zinject -c all
|
||||
poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
|
||||
log_must rm -f $all_vdevs
|
||||
}
|
||||
|
||||
log_assert "Verify 'zpool status -e'"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
all_vdevs=$(echo $TESTDIR/vdev{1..6})
|
||||
log_must mkdir -p $TESTDIR
|
||||
log_must truncate -s $MINVDEVSIZE $all_vdevs
|
||||
|
||||
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
|
||||
|
||||
for raid_type in "draid2:3d:6c:1s" "raidz2"; do
|
||||
|
||||
log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs
|
||||
|
||||
# Check DEGRADED vdevs are shown.
|
||||
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
|
||||
log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
|
||||
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"
|
||||
|
||||
# Check FAULTED vdevs are shown.
|
||||
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
|
||||
log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
|
||||
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"
|
||||
|
||||
# Check no ONLINE vdevs are shown
|
||||
log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"
|
||||
|
||||
# Check no ONLINE slow vdevs are shown. Then mark IOs greater than
|
||||
# 10ms slow, delay IOs 20ms to vdev6, check slow IOs.
|
||||
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
|
||||
log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"
|
||||
|
||||
log_must set_tunable64 ZIO_SLOW_IO_MS 10
|
||||
log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
|
||||
log_must mkfile 1048576 /$TESTPOOL2/testfile
|
||||
sync_pool $TESTPOOL2
|
||||
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
|
||||
|
||||
# Check vdev6 slow IOs are only shown when requested with -s.
|
||||
log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
|
||||
log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
|
||||
|
||||
# Pool level and top-vdev level status must be DEGRADED.
|
||||
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED"
|
||||
log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED"
|
||||
|
||||
# Check that healthy vdevs[1-3] aren't shown with -e.
|
||||
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE"
|
||||
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE"
|
||||
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE"
|
||||
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE"
|
||||
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE"
|
||||
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE"
|
||||
|
||||
log_must zinject -c all
|
||||
log_must zpool status -es $TESTPOOL2
|
||||
|
||||
zpool destroy $TESTPOOL2
|
||||
done
|
||||
|
||||
log_pass "Verify zpool status -e shows only unhealthy vdevs"
|
|
@ -32,3 +32,7 @@
|
|||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
default_cleanup
|
||||
|
||||
if tunable_exists BCLONE_ENABLED ; then
|
||||
log_must restore_tunable BCLONE_ENABLED
|
||||
fi
|
||||
|
|
|
@ -0,0 +1,161 @@
|
|||
#! /bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# Verify all cp --reflink modes work with modified file.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Verify "cp --reflink=never|auto|always" behaves as expected.
|
||||
# Two different modes of operation are tested.
|
||||
#
|
||||
# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL
|
||||
# when there are dirty blocks which cannot be immediately cloned.
|
||||
# This is the default behavior.
|
||||
#
|
||||
# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for
|
||||
# dirty blocks to be written to disk allowing the clone to succeed.
|
||||
# The downside to this is it may be slow which depending on the
|
||||
# situation may defeat the point of making a clone.
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
verify_block_cloning
|
||||
|
||||
if ! is_linux; then
|
||||
log_unsupported "cp --reflink is a GNU coreutils option"
|
||||
fi
|
||||
|
||||
# Test exit handler: destroys the $TESTPOOL/cp-reflink dataset if it exists
# and resets the BCLONE_WAIT_DIRTY tunable to 0, its default.
function cleanup
|
||||
{
|
||||
# The dataset name must be spelled $TESTPOOL here: "$$TESTPOOL" expands
# "$$" to the shell's process ID followed by the literal text "TESTPOOL",
# so the wrong name was passed and the dataset was never destroyed.
datasetexists $TESTPOOL/cp-reflink && \
|
||||
destroy_dataset $TESTPOOL/cp-reflink -f
|
||||
log_must set_tunable32 BCLONE_WAIT_DIRTY 0
|
||||
}
|
||||
|
||||
# Compare the digests that sha256digest reports for two files and fail the
# test if they differ.
#
# $1 source file
# $2 destination file
#
# On a mismatch the contents of $CP_TESTDIR are listed for diagnostics
# before log_fail is called with both digests.
# NOTE(review): src_cksum and dst_cksum are assigned without typeset, so
# they are presumably visible outside this function - confirm that is
# intended.
function verify_copy
|
||||
{
|
||||
src_cksum=$(sha256digest $1)
|
||||
dst_cksum=$(sha256digest $2)
|
||||
|
||||
if [[ "$src_cksum" != "$dst_cksum" ]]; then
|
||||
log_must ls -l $CP_TESTDIR
|
||||
log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
log_assert "Verify all cp --reflink modes work with modified file"
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
SRC_FILE=src.data
|
||||
DST_FILE=dst.data
|
||||
SRC_SIZE=$(($RANDOM % 2048))
|
||||
|
||||
# A smaller recordsize is used merely to speed up the test.
|
||||
RECORDSIZE=4096
|
||||
|
||||
log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
|
||||
CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)
|
||||
|
||||
log_must cd $CP_TESTDIR
|
||||
|
||||
# Never wait on dirty blocks (zfs_bclone_wait_dirty=0)
|
||||
log_must set_tunable32 BCLONE_WAIT_DIRTY 0
|
||||
|
||||
for mode in "never" "auto" "always"; do
|
||||
log_note "Checking 'cp --reflink=$mode'"
|
||||
|
||||
# Create a new file and immediately copy it.
|
||||
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
|
||||
|
||||
if [[ "$mode" == "always" ]]; then
|
||||
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
log_must ls -l $CP_TESTDIR
|
||||
else
|
||||
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
verify_copy $SRC_FILE $DST_FILE
|
||||
fi
|
||||
log_must rm -f $DST_FILE
|
||||
|
||||
# Append to an existing file and immediately copy it.
|
||||
sync_pool $TESTPOOL
|
||||
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
|
||||
count=1 conv=notrunc
|
||||
if [[ "$mode" == "always" ]]; then
|
||||
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
log_must ls -l $CP_TESTDIR
|
||||
else
|
||||
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
verify_copy $SRC_FILE $DST_FILE
|
||||
fi
|
||||
log_must rm -f $DST_FILE
|
||||
|
||||
# Overwrite a random range of an existing file and immediately copy it.
|
||||
sync_pool $TESTPOOL
|
||||
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
|
||||
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
|
||||
if [[ "$mode" == "always" ]]; then
|
||||
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
log_must ls -l $CP_TESTDIR
|
||||
else
|
||||
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
verify_copy $SRC_FILE $DST_FILE
|
||||
fi
|
||||
log_must rm -f $SRC_FILE $DST_FILE
|
||||
done
|
||||
|
||||
# Wait on dirty blocks (zfs_bclone_wait_dirty=1)
|
||||
log_must set_tunable32 BCLONE_WAIT_DIRTY 1
|
||||
|
||||
for mode in "never" "auto" "always"; do
|
||||
log_note "Checking 'cp --reflink=$mode'"
|
||||
|
||||
# Create a new file and immediately copy it.
|
||||
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
|
||||
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
verify_copy $SRC_FILE $DST_FILE
|
||||
log_must rm -f $DST_FILE
|
||||
|
||||
# Append to an existing file and immediately copy it.
|
||||
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
|
||||
count=1 conv=notrunc
|
||||
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
verify_copy $SRC_FILE $DST_FILE
|
||||
log_must rm -f $DST_FILE
|
||||
|
||||
# Overwrite a random range of an existing file and immediately copy it.
|
||||
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
|
||||
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
|
||||
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||
verify_copy $SRC_FILE $DST_FILE
|
||||
log_must rm -f $SRC_FILE $DST_FILE
|
||||
done
|
||||
|
||||
log_pass
|
|
@ -32,4 +32,10 @@
|
|||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
DISK=${DISKS%% *}
|
||||
|
||||
if tunable_exists BCLONE_ENABLED ; then
|
||||
log_must save_tunable BCLONE_ENABLED
|
||||
log_must set_tunable32 BCLONE_ENABLED 1
|
||||
fi
|
||||
|
||||
default_setup $DISK
|
||||
|
|
Loading…
Reference in New Issue