Merge pull request #215 from truenas/truenas/zfs-2.2.4-staging
Sync with upstream zfs-2.2.4-staging
commit e9b06fd7ae
@@ -793,18 +793,27 @@ def section_dmu(kstats_dict):
    zfetch_stats = isolate_section('zfetchstats', kstats_dict)

    zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
    zfetch_access_total = int(zfetch_stats['hits']) +\
        int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\
        int(zfetch_stats['past']) + int(zfetch_stats['misses'])

    prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
    prt_i2('Stream hits:',
           f_perc(zfetch_stats['hits'], zfetch_access_total),
           f_hits(zfetch_stats['hits']))
    future = int(zfetch_stats['future']) + int(zfetch_stats['stride'])
    prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total),
           f_hits(future))
    prt_i2('Hits behind stream:',
           f_perc(zfetch_stats['past'], zfetch_access_total),
           f_hits(zfetch_stats['past']))
    prt_i2('Stream misses:',
           f_perc(zfetch_stats['misses'], zfetch_access_total),
           f_hits(zfetch_stats['misses']))
    prt_i2('Streams limit reached:',
           f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
           f_hits(zfetch_stats['max_streams']))
    prt_i1('Stream strides:', f_hits(zfetch_stats['stride']))
    prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
    print()
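The rewritten section is a formatting pass over the zfetchstats kstat; on Linux the raw counters it aggregates, and the finished report, can both be checked from a shell (these are the usual OpenZFS-on-Linux paths and flags):

    cat /proc/spl/kstat/zfs/zfetchstats    # raw hits/future/stride/past/misses counters
    arc_summary -s dmu                     # renders the DMU prefetch section shown above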
@@ -8041,6 +8041,17 @@ dump_mos_leaks(spa_t *spa)
		}
	}

	if (spa->spa_brt != NULL) {
		brt_t *brt = spa->spa_brt;
		for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
			if (brtvd != NULL && brtvd->bv_initiated) {
				mos_obj_refd(brtvd->bv_mos_brtvdev);
				mos_obj_refd(brtvd->bv_mos_entries);
			}
		}
	}

	/*
	 * Visit all allocated objects and make sure they are referenced.
	 */
@@ -5,7 +5,7 @@
#
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
# as they flip between FAULTED and ONLINE. If
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
# FAULTED, then power down the slot via sysfs:
#
#     /sys/class/enclosure/<enclosure>/<slot>/power_status

@@ -19,7 +19,7 @@
# Exit codes:
#   0: slot successfully powered off
#   1: enclosure not available
#   2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
#   2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled
#   3: vdev was not FAULTED
#   4: The enclosure sysfs path passed from ZFS does not exist
#   5: Enclosure slot didn't actually turn off after we told it to

@@ -32,7 +32,7 @@ if [ ! -d /sys/class/enclosure ] ; then
    exit 1
fi

if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then
    exit 2
fi
@@ -205,6 +205,10 @@ zed_notify()
    [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
    [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))

    zed_notify_ntfy "${subject}" "${pathname}"; rv=$?
    [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
    [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))

    [ "${num_success}" -gt 0 ] && return 0
    [ "${num_failure}" -gt 0 ] && return 1
    return 2
@@ -527,6 +531,100 @@ zed_notify_pushover()
}


# zed_notify_ntfy (subject, pathname)
#
# Send a notification via Ntfy.sh <https://ntfy.sh/>.
# The ntfy topic (ZED_NTFY_TOPIC) identifies the topic that the notification
# will be sent to Ntfy.sh server. The ntfy url (ZED_NTFY_URL) defines the
# self-hosted or provided hosted ntfy service location. The ntfy access token
# <https://docs.ntfy.sh/publish/#access-tokens> (ZED_NTFY_ACCESS_TOKEN) represents an
# access token that could be used if a topic is read/write protected. If a
# topic can be written to publicly, a ZED_NTFY_ACCESS_TOKEN is not required.
#
# Requires curl and sed executables to be installed in the standard PATH.
#
# References
#   https://docs.ntfy.sh
#
# Arguments
#   subject: notification subject
#   pathname: pathname containing the notification message (OPTIONAL)
#
# Globals
#   ZED_NTFY_TOPIC
#   ZED_NTFY_ACCESS_TOKEN (OPTIONAL)
#   ZED_NTFY_URL
#
# Return
#   0: notification sent
#   1: notification failed
#   2: not configured
#
zed_notify_ntfy()
{
    local subject="$1"
    local pathname="${2:-"/dev/null"}"
    local msg_body
    local msg_out
    local msg_err

    [ -n "${ZED_NTFY_TOPIC}" ] || return 2
    local url="${ZED_NTFY_URL:-"https://ntfy.sh"}/${ZED_NTFY_TOPIC}"

    if [ ! -r "${pathname}" ]; then
        zed_log_err "ntfy cannot read \"${pathname}\""
        return 1
    fi

    zed_check_cmd "curl" "sed" || return 1

    # Read the message body in.
    #
    msg_body="$(cat "${pathname}")"

    if [ -z "${msg_body}" ]
    then
        msg_body=$subject
        subject=""
    fi

    # Send the POST request and check for errors.
    #
    if [ -n "${ZED_NTFY_ACCESS_TOKEN}" ]; then
        msg_out="$( \
            curl \
            -u ":${ZED_NTFY_ACCESS_TOKEN}" \
            -H "Title: ${subject}" \
            -d "${msg_body}" \
            -H "Priority: high" \
            "${url}" \
            2>/dev/null \
            )"; rv=$?
    else
        msg_out="$( \
            curl \
            -H "Title: ${subject}" \
            -d "${msg_body}" \
            -H "Priority: high" \
            "${url}" \
            2>/dev/null \
            )"; rv=$?
    fi
    if [ "${rv}" -ne 0 ]; then
        zed_log_err "curl exit=${rv}"
        return 1
    fi
    msg_err="$(echo "${msg_out}" \
        | sed -n -e 's/.*"errors" *:.*\[\(.*\)\].*/\1/p')"
    if [ -n "${msg_err}" ]; then
        zed_log_err "ntfy \"${msg_err}\""
        return 1
    fi
    return 0
}


# zed_rate_limit (tag, [interval])
#
# Check whether an event of a given type [tag] has already occurred within the
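Since zed_notify_ntfy() is ultimately just an authenticated HTTP POST, a topic can be smoke-tested with the same request shape before enabling it in zed.rc (a sketch; the topic name and token below are placeholders, and the -u line can be dropped for a public topic):

    curl -u ":tk_EXAMPLETOKEN" \
        -H "Title: ZED test" -H "Priority: high" \
        -d "test message from ZED" \
        "https://ntfy.sh/my-zed-alerts"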
@@ -146,4 +146,26 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
# help silence misbehaving drives. This assumes your drive enclosure fully
# supports slot power control via sysfs.
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1

##
# Ntfy topic
# This defines which topic will receive the ntfy notification.
# <https://docs.ntfy.sh/publish/>
# Disabled by default; uncomment to enable.
#ZED_NTFY_TOPIC=""

##
# Ntfy access token (optional for public topics)
# This defines an access token which can be used
# to allow you to authenticate when sending to topics
# <https://docs.ntfy.sh/publish/#access-tokens>
# Disabled by default; uncomment to enable.
#ZED_NTFY_ACCESS_TOKEN=""

##
# Ntfy Service URL
# This defines which service the ntfy call will be directed toward
# <https://docs.ntfy.sh/install/>
# https://ntfy.sh by default; uncomment to enable an alternative service url.
#ZED_NTFY_URL="https://ntfy.sh"
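When the power-off feature trips, the slot can be inspected and powered back up through the same sysfs file the script writes to; the enclosure and slot names here are placeholders for whatever your SES enclosure exposes:

    cat /sys/class/enclosure/0:0:0:0/Slot01/power_status     # expect "off" after a fault
    echo on > /sys/class/enclosure/0:0:0:0/Slot01/power_status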
@@ -3672,15 +3672,25 @@ zfs_do_list(int argc, char **argv)

			for (char *tok; (tok = strsep(&optarg, ",")); ) {
				static const char *const type_subopts[] = {
				    "filesystem", "volume",
				    "snapshot", "snap",
				    "filesystem",
				    "fs",
				    "volume",
				    "vol",
				    "snapshot",
				    "snap",
				    "bookmark",
				    "all" };
				    "all"
				};
				static const int type_types[] = {
				    ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
				    ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT,
				    ZFS_TYPE_FILESYSTEM,
				    ZFS_TYPE_FILESYSTEM,
				    ZFS_TYPE_VOLUME,
				    ZFS_TYPE_VOLUME,
				    ZFS_TYPE_SNAPSHOT,
				    ZFS_TYPE_SNAPSHOT,
				    ZFS_TYPE_BOOKMARK,
				    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK };
				    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK
				};

				for (c = 0; c < ARRAY_SIZE(type_subopts); ++c)
					if (strcmp(tok, type_subopts[c]) == 0) {
@@ -2161,6 +2161,7 @@ typedef struct status_cbdata {
	boolean_t	cb_explain;
	boolean_t	cb_first;
	boolean_t	cb_dedup_stats;
	boolean_t	cb_print_unhealthy;
	boolean_t	cb_print_status;
	boolean_t	cb_print_slow_ios;
	boolean_t	cb_print_vdev_init;

@@ -2357,6 +2358,35 @@ health_str_to_color(const char *health)
	return (NULL);
}

/*
 * Called for each leaf vdev. Returns 0 if the vdev is healthy.
 * A vdev is unhealthy if any of the following are true:
 * 1) there are read, write, or checksum errors,
 * 2) its state is not ONLINE, or
 * 3) slow IO reporting was requested (-s) and there are slow IOs.
 */
static int
vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data)
{
	status_cbdata_t *cb = data;
	vdev_stat_t *vs;
	uint_t vsc;
	(void) hdl_data;

	if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &vsc) != 0)
		return (1);

	if (vs->vs_checksum_errors || vs->vs_read_errors ||
	    vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY)
		return (1);

	if (cb->cb_print_slow_ios && vs->vs_slow_ios)
		return (1);

	return (0);
}

/*
 * Print out configuration state as requested by status_callback.
 */

@@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
	const char *state;
	const char *type;
	const char *path = NULL;
	const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
	const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL,
	    *scolor = NULL;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)

@@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
		state = gettext("AVAIL");
	}

	/*
	 * If '-e' is specified then top-level vdevs and their children
	 * can be pruned if all of their leaves are healthy.
	 */
	if (cb->cb_print_unhealthy && depth > 0 &&
	    for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) {
		return;
	}

	printf_color(health_str_to_color(state),
	    "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
	    name, state);
@@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
		if (vs->vs_checksum_errors)
			ccolor = ANSI_RED;

		if (vs->vs_slow_ios)
			scolor = ANSI_BLUE;

		if (cb->cb_literal) {
			fputc(' ', stdout);
			printf_color(rcolor, "%5llu",

@@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
		}

		if (cb->cb_literal)
			printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
			printf_color(scolor, " %5llu",
			    (u_longlong_t)vs->vs_slow_ios);
		else
			printf(" %5s", rbuf);
			printf_color(scolor, " %5s", rbuf);
	}
	if (cb->cb_print_power) {
		if (children == 0) {
@@ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data)
			(void) printf(gettext(
			    "errors: No known data errors\n"));
		} else if (!cbp->cb_verbose) {
			color_start(ANSI_RED);
			(void) printf(gettext("errors: %llu data "
			    "errors, use '-v' for a list\n"),
			    (u_longlong_t)nerr);
			color_end();
		} else {
			print_error_log(zhp);
		}

@@ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data)
 *	[pool] [interval [count]]
 *
 *	-c CMD	For each vdev, run command CMD
 *	-e	Display only unhealthy vdevs
 *	-i	Display vdev initialization status.
 *	-g	Display guid for individual vdev name.
 *	-L	Follow links when resolving vdev path name.

@@ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv)
	};

	/* check options */
	while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options,
	while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
	    NULL)) != -1) {
		switch (c) {
		case 'c':

@@ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv)
			}
			cmd = optarg;
			break;
		case 'e':
			cb.cb_print_unhealthy = B_TRUE;
			break;
		case 'i':
			cb.cb_print_vdev_init = B_TRUE;
			break;
@@ -0,0 +1,17 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
	ZFS_LINUX_TEST_SRC([page_size], [
		#include <linux/mm.h>
	],[
		unsigned long s;
		s = page_size(NULL);
	])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
	AC_MSG_CHECKING([whether page_size() is available])
	ZFS_LINUX_TEST_RESULT([page_size], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
	],[
		AC_MSG_RESULT(no)
	])
])
@@ -16,6 +16,9 @@ dnl #
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
dnl #      generic_copy_file_range() added to support it
dnl #
dnl # 6.8: generic_copy_file_range() removed, replaced by
dnl #      splice_copy_file_range()
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
	ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
		#include <linux/fs.h>

@@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
	])
])

AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
	ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
		#include <linux/splice.h>
	], [
		struct file *src_file __attribute__ ((unused)) = NULL;
		loff_t src_off __attribute__ ((unused)) = 0;
		struct file *dst_file __attribute__ ((unused)) = NULL;
		loff_t dst_off __attribute__ ((unused)) = 0;
		size_t len __attribute__ ((unused)) = 0;
		splice_copy_file_range(src_file, src_off, dst_file, dst_off,
		    len);
	])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
	AC_MSG_CHECKING([whether splice_copy_file_range() is available])
	ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
		    [splice_copy_file_range() is available])
	],[
		AC_MSG_RESULT(no)
	])
])

AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
	ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
		#include <linux/fs.h>
@@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
	ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
	ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
	ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
	ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
	ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE

@@ -166,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
	ZFS_AC_KERNEL_SRC_SYNC_BDEV
	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
	case "$host_cpu" in
	powerpc*)
		ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE

@@ -266,6 +268,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
	ZFS_AC_KERNEL_VFS_IOV_ITER
	ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
	ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
	ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
	ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
	ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE

@@ -314,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
	ZFS_AC_KERNEL_COPY_SPLICE_READ
	ZFS_AC_KERNEL_SYNC_BDEV
	ZFS_AC_KERNEL_MM_PAGE_SIZE
	case "$host_cpu" in
	powerpc*)
		ZFS_AC_KERNEL_CPU_HAS_FEATURE
@@ -286,7 +286,6 @@ typedef struct zfid_long {

extern uint_t zfs_fsyncer_key;
extern int zfs_super_owner;
extern int zfs_bclone_enabled;

extern void zfs_init(void);
extern void zfs_fini(void);

@@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
	%D%/kernel/linux/compiler_compat.h \
	%D%/kernel/linux/dcache_compat.h \
	%D%/kernel/linux/kmap_compat.h \
	%D%/kernel/linux/mm_compat.h \
	%D%/kernel/linux/mod_compat.h \
	%D%/kernel/linux/page_compat.h \
	%D%/kernel/linux/percpu_compat.h \
@@ -0,0 +1,36 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#ifndef _ZFS_MM_COMPAT_H
#define _ZFS_MM_COMPAT_H

#include <linux/mm.h>

/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
#ifndef HAVE_MM_PAGE_SIZE
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
#endif

#endif /* _ZFS_MM_COMPAT_H */
@@ -68,6 +68,7 @@ enum scope_prefix_types {
	zfs_trim,
	zfs_txg,
	zfs_vdev,
	zfs_vdev_disk,
	zfs_vdev_file,
	zfs_vdev_mirror,
	zfs_vnops,

@@ -45,8 +45,6 @@ extern "C" {
typedef struct zfsvfs zfsvfs_t;
struct znode;

extern int zfs_bclone_enabled;

/*
 * This structure emulates the vfs_t from other platforms. Its purpose
 * is to facilitate the handling of mount options and minimize structural
@@ -79,6 +79,9 @@ typedef struct abd {

typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif

extern int zfs_abd_scatter_enabled;

@@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
    abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
    void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);

@@ -213,6 +220,8 @@ void abd_fini(void);

/*
 * Linux ABD bio functions
 * Note: these are only needed to support vdev_classic. See comment in
 * vdev_disk.c.
 */
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
@@ -21,6 +21,7 @@
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#ifndef _ABD_IMPL_H

@@ -38,12 +39,30 @@ typedef enum abd_stats_op {
	ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;

struct scatterlist; /* forward declaration */
/* forward declarations */
struct scatterlist;
struct page;

struct abd_iter {
	/* public interface */
	void	*iter_mapaddr;	/* addr corresponding to iter_pos */
	size_t	iter_mapsize;	/* length of data valid at mapaddr */
	union {
		/* for abd_iter_map()/abd_iter_unmap() */
		struct {
			/* addr corresponding to iter_pos */
			void	*iter_mapaddr;
			/* length of data valid at mapaddr */
			size_t	iter_mapsize;
		};
		/* for abd_iter_page() */
		struct {
			/* current page */
			struct page	*iter_page;
			/* offset of data in page */
			size_t	iter_page_doff;
			/* size of data in page */
			size_t	iter_page_dsize;
		};
	};

	/* private */
	abd_t	*iter_abd;	/* ABD being iterated through */

@@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
void abd_iter_page(struct abd_iter *);

/*
 * Helper macros
@@ -45,18 +45,24 @@ typedef struct zfetch {
	int		zf_numstreams;	/* number of zstream_t's */
} zfetch_t;

typedef struct zsrange {
	uint16_t	start;
	uint16_t	end;
} zsrange_t;

#define	ZFETCH_RANGES	9	/* Fits zstream_t into 128 bytes */

typedef struct zstream {
	list_node_t	zs_node;	/* link for zf_stream */
	uint64_t	zs_blkid;	/* expect next access at this blkid */
	uint_t		zs_atime;	/* time last prefetch issued */
	zsrange_t	zs_ranges[ZFETCH_RANGES]; /* ranges from future */
	unsigned int	zs_pf_dist;	/* data prefetch distance in bytes */
	unsigned int	zs_ipf_dist;	/* L1 prefetch distance in bytes */
	uint64_t	zs_pf_start;	/* first data block to prefetch */
	uint64_t	zs_pf_end;	/* data block to prefetch up to */
	uint64_t	zs_ipf_start;	/* first data block to prefetch L1 */
	uint64_t	zs_ipf_end;	/* data block to prefetch L1 up to */

	list_node_t	zs_node;	/* link for zf_stream */
	hrtime_t	zs_atime;	/* time last prefetch issued */
	zfetch_t	*zs_fetch;	/* parent fetch */
	boolean_t	zs_missed;	/* stream saw cache misses */
	boolean_t	zs_more;	/* need more distant prefetch */
	zfs_refcount_t	zs_callers;	/* number of pending callers */

@@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
    boolean_t);
void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
    boolean_t);
@@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);

multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
void multilist_sublist_lock(multilist_sublist_t *);
multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
void multilist_sublist_unlock(multilist_sublist_t *);

void multilist_sublist_insert_head(multilist_sublist_t *, void *);
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
void multilist_sublist_remove(multilist_sublist_t *, void *);
int multilist_sublist_is_empty(multilist_sublist_t *);
@@ -24,8 +24,11 @@

#ifndef _SYS_FS_ZFS_VNOPS_H
#define _SYS_FS_ZFS_VNOPS_H

#include <sys/zfs_vnops_os.h>

extern int zfs_bclone_enabled;

extern int zfs_fsync(znode_t *, int, cred_t *);
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
@@ -2,6 +2,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License").  You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at

@@ -15,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd July 21, 2023
.Dd January 9, 2024
.Dt ZFS 4
.Os
.

@@ -530,6 +531,10 @@ However, this is limited by
Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
.
.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
Log2 fraction of holes in speculative prefetch stream allowed for it to
proceed.
.
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
Min bytes to prefetch per stream.
Prefetch distance starts from the demand access size and quickly grows to

@@ -544,6 +549,13 @@ Max bytes to prefetch per stream.
.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
Max bytes to prefetch indirects for per stream.
.
.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
Requests within this byte distance from the current prefetch stream position
are considered parts of the stream, reordered due to parallel processing.
Such requests do not advance the stream position immediately unless
.Sy zfetch_hole_shift
fill threshold is reached, but saved to fill holes in the stream later.
.
.It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
Max number of streams per zfetch (prefetch streams per file).
.

@@ -1142,6 +1154,15 @@ Enable the experimental block cloning feature.
If this setting is 0, then even if feature@block_cloning is enabled,
attempts to clone blocks will act as though the feature is disabled.
.
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
written to disk.
This allows the clone operation to reliably succeed when a file is
modified and then immediately cloned.
For small files this may be slower than making a copy of the file.
Therefore, this setting defaults to 0 which causes a clone operation to
immediately fail when encountering a dirty block.
.
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
Select a BLAKE3 implementation.
.Pp
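On Linux these zfetch knobs are exposed as zfs module parameters, so the new reorder window can be read back (and, assuming it is runtime-writable like the other zfetch tunables, adjusted) without reloading the module; the value shown is the documented default, not a recommendation:

    cat /sys/module/zfs/parameters/zfetch_max_reorder
    echo 16777216 > /sys/module/zfs/parameters/zfetch_max_reorder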
@@ -1336,6 +1357,42 @@ _
4	Driver	No driver retries on driver errors.
.TE
.
.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
Maximum number of segments to add to a BIO (min 4).
If this is higher than the maximum allowed by the device queue or the kernel
itself, it will be clamped.
Setting it to zero will cause the kernel's ideal size to be used.
This parameter only applies on Linux.
This parameter is ignored if
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
.
.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
Controls the method used to submit IO to the Linux block layer
(default
.Sy 1 "classic" Ns
)
.Pp
If set to 1, the "classic" method is used.
This is the method that has been in use since the earliest versions of
ZFS-on-Linux.
It has known issues with highly fragmented IO requests and is less efficient on
many workloads, but it is well known and well understood.
.Pp
If set to 0, the "new" method is used.
This method is available since 2.2.4 and should resolve all known issues and be
far more efficient, but has not had as much testing.
In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
.Pp
It is not recommended that you change it except on advice from the OpenZFS
developers.
If you do change it, please also open a bug report describing why you did so,
including the workload involved and any error messages.
.Pp
This parameter and the "classic" submission method will be removed in a future
release of OpenZFS once we have total confidence in the new method.
.Pp
This parameter only applies on Linux, and can only be set at module load time.
.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
.Pa .zfs/snapshot .
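Because zfs_vdev_disk_classic is load-time only, opting a 2.2.x system in to the new submission method means setting it in the module options and rebooting (or reloading the module); the file name below is conventional, not mandated:

    # /etc/modprobe.d/zfs.conf
    options zfs zfs_vdev_disk_classic=0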
@@ -29,7 +29,7 @@
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\"
.Dd March 16, 2022
.Dd February 8, 2024
.Dt ZFS-LIST 8
.Os
.

@@ -155,6 +155,15 @@ or
For example, specifying
.Fl t Sy snapshot
displays only snapshots.
.Sy fs ,
.Sy snap ,
or
.Sy vol
can be used as aliases for
.Sy filesystem ,
.Sy snapshot ,
or
.Sy volume .
.El
.
.Sh EXAMPLES
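With the aliases in place, the short and long type names are interchangeable; for example (the pool name is illustrative):

    zfs list -t snap,vol tank    # same result as -t snapshot,volume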
@@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm status
.Op Fl DigLpPstvx
.Op Fl DeigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
.Oo Ar pool Oc Ns …

@@ -69,6 +69,8 @@ See the
option of
.Nm zpool Cm iostat
for complete details.
.It Fl e
Only show unhealthy vdevs (not-ONLINE or with errors).
.It Fl i
Display vdev initialization status.
.It Fl g
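The new flag composes with the existing ones; combining it with slow-IO reporting, for instance, narrows a large pool's status output to just the problem vdevs (pool name illustrative):

    zpool status -es tank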
@@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) {

static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
  int nparams = clLvalue(ci->func)->p->numparams;
  if (n >= ci->u.l.base - ci->func - nparams)
  int nvararg = cast_int(ci->u.l.base - ci->func) - nparams;
  if (n <= -nvararg)
    return NULL;  /* no such vararg */
  else {
    *pos = ci->func + nparams + n;
    *pos = ci->func + nparams - n;
    return "(*vararg)";  /* generic name for any vararg */
  }
}

@@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n,
  StkId base;
  if (isLua(ci)) {
    if (n < 0)  /* access to vararg values? */
      return findvararg(ci, -n, pos);
      return findvararg(ci, n, pos);
    else {
      base = ci->u.l.base;
      name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
@@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
@@ -89,10 +89,6 @@ int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
	"Debug level");

int zfs_bclone_enabled = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
	&zfs_bclone_enabled, 0, "Enable block cloning");

struct zfs_jailparam {
	int mount_snapshot;
};
@@ -21,6 +21,7 @@
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*

@@ -59,7 +60,9 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif

#ifdef _KERNEL

@@ -895,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
	aiter->iter_pos = 0;
	if (abd_is_linear(abd)) {
		aiter->iter_offset = 0;
		aiter->iter_sg = NULL;
	} else {
	if (!abd_is_linear(abd)) {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}

@@ -915,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

@@ -926,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	/*
	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
	 * this state (directly or abd_iter_unmap()) before advancing.
	 */
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);
	ASSERT3P(aiter->iter_page, ==, NULL);
	ASSERT0(aiter->iter_page_doff);
	ASSERT0(aiter->iter_page_dsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
@@ -1009,6 +1015,106 @@ abd_cache_reap_now(void)
}

#if defined(_KERNEL)
/*
 * Yield the next page struct and data offset and size within it, without
 * mapping it into the address space.
 */
void
abd_iter_page(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter)) {
		aiter->iter_page = NULL;
		aiter->iter_page_doff = 0;
		aiter->iter_page_dsize = 0;
		return;
	}

	struct page *page;
	size_t doff, dsize;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

		/* memory address at iter_pos */
		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;

		/* struct page for address */
		page = is_vmalloc_addr(paddr) ?
		    vmalloc_to_page(paddr) : virt_to_page(paddr);

		/* offset of address within the page */
		doff = offset_in_page(paddr);

		/* total data remaining in abd from this position */
		dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
	} else {
		ASSERT(!abd_is_gang(aiter->iter_abd));

		/* current scatter page */
		page = sg_page(aiter->iter_sg);

		/* position within page */
		doff = aiter->iter_offset;

		/* remaining data in scatterlist */
		dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);
	}
	ASSERT(page);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
	if (PageTail(page)) {
		/*
		 * This page is part of a "compound page", which is a group of
		 * pages that can be referenced from a single struct page *.
		 * It's organised as a "head" page, followed by a series of
		 * "tail" pages.
		 *
		 * In OpenZFS, compound pages are allocated using the
		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
		 * vmalloc slabs (ie >16K allocations). So a great many of the
		 * IO buffers we get are going to be of this type.
		 *
		 * The tail pages are just regular PAGE_SIZE pages, and can be
		 * safely used as-is. However, the head page has length
		 * covering itself and all the tail pages. If this ABD chunk
		 * spans multiple pages, then we can use the head page and a
		 * >PAGE_SIZE length, which is far more efficient.
		 *
		 * To do this, we need to adjust the offset to be counted from
		 * the head page. struct page for compound pages are stored
		 * contiguously, so we can just adjust by a simple offset.
		 *
		 * Before kernel 4.5, compound page heads were refcounted
		 * separately, such that moving back to the head page would
		 * require us to take a reference to it and releasing it once
		 * we're completely finished with it. In practice, that means
		 * when our caller is done with the ABD, which we have no
		 * insight into from here. Rather than contort this API to
		 * track head page references on such ancient kernels, we just
		 * compile this block out and use the tail pages directly. This
		 * is slightly less efficient, but makes everything far
		 * simpler.
		 */
		struct page *head = compound_head(page);
		doff += ((page - head) * PAGESIZE);
		page = head;
	}
#endif

	/* final page and position within it */
	aiter->iter_page = page;
	aiter->iter_page_doff = doff;

	/* amount of data in the chunk, up to the end of the page */
	aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}

/*
 * Note: ABD BIO functions only needed to support vdev_classic. See comments in
 * vdev_disk.c.
 */

/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd

@@ -1163,4 +1269,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");
#endif

#endif /* _KERNEL */
@@ -24,6 +24,7 @@
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

#include <sys/zfs_context.h>

@@ -66,6 +67,13 @@ typedef struct vdev_disk {
	krwlock_t vd_lock;
} vdev_disk_t;

/*
 * Maximum number of segments to add to a bio (min 4). If this is higher than
 * the maximum allowed by the device queue or the kernel itself, it will be
 * clamped. Setting it to zero will cause the kernel's ideal size to be used.
 */
uint_t zfs_vdev_disk_max_segs = 0;

/*
 * Unique identifier for the exclusive vdev holder.
 */

@@ -83,17 +91,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
 */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)

/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t		*dr_zio;	/* Parent ZIO */
	atomic_t	dr_ref;		/* References */
	int		dr_error;	/* Bio error */
	int		dr_bio_count;	/* Count of bio's */
	struct bio	*dr_bio[];	/* Attached bio's */
} dio_request_t;

/*
 * BIO request failfast mask.
 */

@@ -457,95 +454,15 @@ vdev_disk_close(vdev_t *v)
	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdh != NULL) {
	if (vd->vd_bdh != NULL)
		vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
		    zfs_vdev_holder);
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}

static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);
	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (int i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}

static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static void
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	vdev_disk_dio_put(dr);
}

static inline void
vdev_submit_bio_impl(struct bio *bio)
{
@@ -697,8 +614,457 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
	return (bio);
}

static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
	/*
	 * Smallest of the device max segs and the tuneable max segs. Minimum
	 * 4, so there's room to finish split pages if they come up.
	 */
	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);

#ifdef HAVE_BIO_MAX_SEGS
	return (bio_max_segs(max_segs));
#else
	return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}

static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}

/*
 * Virtual block IO object (VBIO)
 *
 * Linux block IO (BIO) objects have a limit on how many data segments (pages)
 * they can hold. Depending on how they're allocated and structured, a large
 * ZIO can require more than one BIO to be submitted to the kernel, which then
 * all have to complete before we can return the completed ZIO back to ZFS.
 *
 * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
 * translate a ZIO down into the kernel block layer and back again.
 *
 * Note that these are only used for data ZIOs (read/write). Meta-operations
 * (flush/trim) don't need multiple BIOs and so can just make the call
 * directly.
 */
typedef struct {
	zio_t		*vbio_zio;	/* parent zio */

	struct block_device *vbio_bdev;	/* blockdev to submit bios to */

	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */

	uint_t		vbio_max_segs;	/* max segs per bio */

	uint_t		vbio_max_bytes;	/* max bytes per bio */
	uint_t		vbio_lbs_mask;	/* logical block size mask */

	uint64_t	vbio_offset;	/* start offset of next bio */

	struct bio	*vbio_bio;	/* pointer to the current bio */
	int		vbio_flags;	/* bio flags */
} vbio_t;

static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
	vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);

	vbio->vbio_zio = zio;
	vbio->vbio_bdev = bdev;
	vbio->vbio_abd = NULL;
	vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
	vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
	vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
	vbio->vbio_offset = zio->io_offset;
	vbio->vbio_bio = NULL;
	vbio->vbio_flags = flags;

	return (vbio);
}
BIO_END_IO_PROTO(vbio_completion, bio, error);

static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
	struct bio *bio = vbio->vbio_bio;
	uint_t ssize;

	while (size > 0) {
		if (bio == NULL) {
			/* New BIO, allocate and set up */
			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
			    vbio->vbio_max_segs);
			VERIFY(bio);

			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
			bio_set_op_attrs(bio,
			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
			    WRITE : READ, vbio->vbio_flags);

			if (vbio->vbio_bio) {
				bio_chain(vbio->vbio_bio, bio);
				vdev_submit_bio(vbio->vbio_bio);
			}
			vbio->vbio_bio = bio;
		}

		/*
		 * Only load as much of the current page data as will fit in
		 * the space left in the BIO, respecting lbs alignment. Older
		 * kernels will error if we try to overfill the BIO, while
		 * newer ones will accept it and split the BIO. This ensures
		 * everything works on older kernels, and avoids an additional
		 * overhead on the new.
		 */
		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
		    vbio->vbio_lbs_mask);
		if (ssize > 0 &&
		    bio_add_page(bio, page, ssize, offset) == ssize) {
			/* Accepted, adjust and load any remaining. */
			size -= ssize;
			offset += ssize;
			continue;
		}

		/* No room, set up for a new BIO and loop */
		vbio->vbio_offset += BIO_BI_SIZE(bio);

		/* Signal new BIO allocation wanted */
		bio = NULL;
	}

	return (0);
}

/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
	vbio_t *vbio = priv;
	return (vbio_add_page(vbio, page, len, off));
}

/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
	/*
	 * We plug so we can submit the BIOs as we go and only unplug them when
	 * they are fully created and submitted. This is important; if we don't
	 * plug, then the kernel may start executing earlier BIOs while we're
	 * still creating and executing later ones, and if the device goes
	 * away while that's happening, older kernels can get confused and
	 * trample memory.
	 */
	struct blk_plug plug;
	blk_start_plug(&plug);

	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
	ASSERT(vbio->vbio_bio);

	vbio->vbio_bio->bi_end_io = vbio_completion;
	vbio->vbio_bio->bi_private = vbio;

	vdev_submit_bio(vbio->vbio_bio);

	blk_finish_plug(&plug);
}
/* IO completion callback */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
	vbio_t *vbio = bio->bi_private;
	zio_t *zio = vbio->vbio_zio;

	ASSERT(zio);

	/* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = 0;
	if (error)
		zio->io_error = -(error);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		zio->io_error = EIO;
#endif
	ASSERT3U(zio->io_error, >=, 0);

	if (zio->io_error)
		vdev_disk_error(zio);

	/* Return the BIO to the kernel */
	bio_put(bio);

	/*
	 * If we copied the ABD before issuing it, clean up and return the copy
	 * to the ABD, with changes if appropriate.
	 */
	if (vbio->vbio_abd != NULL) {
		void *buf = abd_to_buf(vbio->vbio_abd);
		abd_free(vbio->vbio_abd);
		vbio->vbio_abd = NULL;

		if (zio->io_type == ZIO_TYPE_READ)
			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
		else
			abd_return_buf(zio->io_abd, buf, zio->io_size);
	}

	/* Final cleanup */
	kmem_free(vbio, sizeof (vbio_t));

	/* All done, submit for processing */
	zio_delay_interrupt(zio);
}
/*
 * Iterator callback to count ABD pages and check their size & alignment.
 *
 * On Linux, each BIO segment can take a page pointer, and an offset+length of
 * the data within that page. A page can be arbitrarily large ("compound"
 * pages) but we still have to ensure the data portion is correctly sized and
 * aligned to the logical block size, to ensure that if the kernel wants to
 * split the BIO, the two halves will still be properly aligned.
 */
typedef struct {
	uint_t	bmask;
	uint_t	npages;
	uint_t	end;
} vdev_disk_check_pages_t;

static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
	vdev_disk_check_pages_t *s = priv;

	/*
	 * If we didn't finish on a block size boundary last time, then there
	 * would be a gap if we tried to use this ABD as-is, so abort.
	 */
	if (s->end != 0)
		return (1);

	/*
	 * Note if we're taking less than a full block, so we can check it
	 * above on the next call.
	 */
	s->end = len & s->bmask;

	/* All blocks after the first must start on a block size boundary. */
	if (s->npages != 0 && (off & s->bmask) != 0)
		return (1);

	s->npages++;
	return (0);
}

/*
 * Check if we can submit the pages in this ABD to the kernel as-is. Returns
 * the number of pages, or 0 if it can't be submitted like this.
 */
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
	vdev_disk_check_pages_t s = {
		.bmask = bdev_logical_block_size(bdev)-1,
		.npages = 0,
		.end = 0,
	};

	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
		return (B_FALSE);

	return (B_TRUE);
}
static int
vdev_disk_io_rw(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
	int flags = 0;

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    (u_longlong_t)zio->io_offset,
		    (u_longlong_t)zio->io_size,
		    (u_longlong_t)i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    v->vdev_failfast == B_TRUE) {
		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
	}

	/*
	 * Check alignment of the incoming ABD. If any part of it would require
	 * submitting a page that is not aligned to the logical block size,
	 * then we take a copy into a linear buffer and submit that instead.
	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
	 * usually requiring abnormally-small data blocks (eg gang blocks)
	 * mixed into the same ABD as larger ones (eg aggregated).
	 */
	abd_t *abd = zio->io_abd;
	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
		void *buf;
		if (zio->io_type == ZIO_TYPE_READ)
			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
		else
			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);

		/*
		 * Wrap the copy in an abd_t, so we can use the same iterators
		 * to count and fill the vbio later.
		 */
		abd = abd_get_from_buf(buf, zio->io_size);

		/*
		 * False here would mean the borrowed copy has an invalid
		 * alignment too, which would mean we've somehow been passed a
		 * linear ABD with an interior page that has a non-zero offset
		 * or a size not a multiple of PAGE_SIZE. This is not possible.
		 * It would mean either zio_buf_alloc() or its underlying
		 * allocators have done something extremely strange, or our
		 * math in vdev_disk_check_pages() is wrong. In either case,
		 * something is seriously wrong and it's not safe to continue.
		 */
		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
	}

	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
	if (abd != zio->io_abd)
		vbio->vbio_abd = abd;

	/* Fill it with data pages and submit it to the kernel */
	vbio_submit(vbio, abd, zio->io_size);
	return (0);
}

/* ========== */
/*
|
||||
* This is the classic, battle-tested BIO submission code. Until we're totally
|
||||
* sure that the new code is safe and correct in all cases, this will remain
|
||||
* available.
|
||||
*
|
||||
* It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
|
||||
* enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
|
||||
*
|
||||
* These functions have been renamed to vdev_classic_* to make it clear what
|
||||
* they belong to, but their implementations are unchanged.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Virtual device vector for disks.
|
||||
*/
|
||||
typedef struct dio_request {
|
||||
zio_t *dr_zio; /* Parent ZIO */
|
||||
atomic_t dr_ref; /* References */
|
||||
int dr_error; /* Bio error */
|
||||
int dr_bio_count; /* Count of bio's */
|
||||
struct bio *dr_bio[]; /* Attached bio's */
|
||||
} dio_request_t;
|
||||
|
||||
static dio_request_t *
|
||||
vdev_classic_dio_alloc(int bio_count)
|
||||
{
|
||||
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
atomic_set(&dr->dr_ref, 0);
|
||||
dr->dr_bio_count = bio_count;
|
||||
dr->dr_error = 0;
|
||||
|
||||
for (int i = 0; i < dr->dr_bio_count; i++)
|
||||
dr->dr_bio[i] = NULL;
|
||||
|
||||
return (dr);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_classic_dio_free(dio_request_t *dr)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < dr->dr_bio_count; i++)
|
||||
if (dr->dr_bio[i])
|
||||
bio_put(dr->dr_bio[i]);
|
||||
|
||||
kmem_free(dr, sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * dr->dr_bio_count);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_classic_dio_get(dio_request_t *dr)
|
||||
{
|
||||
atomic_inc(&dr->dr_ref);
|
||||
}
|
||||
|
||||
static void
|
||||
vdev_classic_dio_put(dio_request_t *dr)
|
||||
{
|
||||
int rc = atomic_dec_return(&dr->dr_ref);
|
||||
|
||||
/*
|
||||
* Free the dio_request when the last reference is dropped and
|
||||
* ensure zio_interpret is called only once with the correct zio
|
||||
*/
|
||||
if (rc == 0) {
|
||||
zio_t *zio = dr->dr_zio;
|
||||
int error = dr->dr_error;
|
||||
|
||||
vdev_classic_dio_free(dr);
|
||||
|
||||
if (zio) {
|
||||
zio->io_error = error;
|
||||
ASSERT3S(zio->io_error, >=, 0);
|
||||
if (zio->io_error)
|
||||
vdev_disk_error(zio);
|
||||
|
||||
zio_delay_interrupt(zio);
|
||||
}
|
||||
}
|
||||
}
|
||||
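The get/put pattern above takes one reference per attached bio plus an extra "submitter" reference, so a fast completion can never free the request while submission is still walking it. A self-contained illustration, using C11 atomics as a stand-in for the kernel's atomic_t and simulating completions synchronously:

/*
 * Sketch of the reference-counted request lifecycle (assumed shape, not
 * the real dio_request API).
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    atomic_int ref;
    int nunits;
} req_t;

static void
req_put(req_t *r)
{
    /* Free only when the last reference (unit or submitter) drops. */
    if (atomic_fetch_sub(&r->ref, 1) == 1) {
        printf("last put: freeing request\n");
        free(r);
    }
}

int
main(void)
{
    req_t *r = malloc(sizeof (*r));
    atomic_init(&r->ref, 0);
    r->nunits = 3;

    atomic_fetch_add(&r->ref, 1);          /* submitter reference */
    for (int i = 0; i < r->nunits; i++)
        atomic_fetch_add(&r->ref, 1);      /* one per submitted unit */

    for (int i = 0; i < r->nunits; i++)
        req_put(r);                        /* simulated completions */

    req_put(r);                            /* submitter's put frees it */
    return (0);
}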
BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by vdev_classic_physio */
	vdev_classic_dio_put(dr);
}

static inline unsigned int
-vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
+vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
	    bio_size, abd_offset);

@@ -711,9 +1077,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
}

static int
-__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
-    size_t io_size, uint64_t io_offset, int rw, int flags)
+vdev_classic_physio(zio_t *zio)
{
+	vdev_t *v = zio->io_vd;
+	vdev_disk_t *vd = v->vdev_tsd;
+	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+	size_t io_size = zio->io_size;
+	uint64_t io_offset = zio->io_offset;
+	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
+	int flags = 0;
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;

@@ -736,7 +1109,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
	}

retry:
-	dr = vdev_disk_dio_alloc(bio_count);
+	dr = vdev_classic_dio_alloc(bio_count);

	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
	    zio->io_vd->vdev_failfast == B_TRUE) {

@@ -771,23 +1144,23 @@ retry:
	 * this should be rare - see the comment above.
	 */
	if (dr->dr_bio_count == i) {
-		vdev_disk_dio_free(dr);
+		vdev_classic_dio_free(dr);
		bio_count *= 2;
		goto retry;
	}

-	nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
+	nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
	dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
	if (unlikely(dr->dr_bio[i] == NULL)) {
-		vdev_disk_dio_free(dr);
+		vdev_classic_dio_free(dr);
		return (SET_ERROR(ENOMEM));
	}

-	/* Matching put called by vdev_disk_physio_completion */
-	vdev_disk_dio_get(dr);
+	/* Matching put called by vdev_classic_physio_completion */
+	vdev_classic_dio_get(dr);

	BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
-	dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+	dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
	dr->dr_bio[i]->bi_private = dr;
	bio_set_op_attrs(dr->dr_bio[i], rw, flags);

@@ -801,7 +1174,7 @@ retry:
	}

	/* Extra reference to protect dio_request during vdev_submit_bio */
-	vdev_disk_dio_get(dr);
+	vdev_classic_dio_get(dr);

	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);

@@ -815,11 +1188,13 @@ retry:
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);

-	vdev_disk_dio_put(dr);
+	vdev_classic_dio_put(dr);

	return (error);
}

/* ========== */

BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;

@@ -928,12 +1303,14 @@ vdev_disk_io_trim(zio_t *zio)
#endif
}

+int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
+
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
-	int rw, error;
+	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.

@@ -996,13 +1373,6 @@ vdev_disk_io_start(zio_t *zio)
		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
-	case ZIO_TYPE_WRITE:
-		rw = WRITE;
-		break;
-
-	case ZIO_TYPE_READ:
-		rw = READ;
-		break;

	case ZIO_TYPE_TRIM:
		zio->io_error = vdev_disk_io_trim(zio);

@@ -1015,23 +1385,34 @@ vdev_disk_io_start(zio_t *zio)
#endif
		return;

+	case ZIO_TYPE_READ:
+	case ZIO_TYPE_WRITE:
+		zio->io_target_timestamp = zio_handle_io_delay(zio);
+		error = vdev_disk_io_rw_fn(zio);
+		rw_exit(&vd->vd_lock);
+		if (error) {
+			zio->io_error = error;
+			zio_interrupt(zio);
+		}
+		return;
+
	default:
+		/*
+		 * Getting here means our parent vdev has made a very strange
+		 * request of us, and shouldn't happen. Assert here to force a
+		 * crash in dev builds, but in production return the IO
+		 * unhandled. The pool will likely suspend anyway but that's
+		 * nicer than crashing the kernel.
+		 */
+		ASSERT3S(zio->io_type, ==, -1);
+
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}

-	zio->io_target_timestamp = zio_handle_io_delay(zio);
-	error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
-	    zio->io_size, zio->io_offset, rw, 0);
-	rw_exit(&vd->vd_lock);
-
-	if (error) {
-		zio->io_error = error;
-		zio_interrupt(zio);
-		return;
-	}
+	__builtin_unreachable();
}

static void

@@ -1080,8 +1461,49 @@ vdev_disk_rele(vdev_t *vd)
	/* XXX: Implement me as a vnode rele for the device */
}

+/*
+ * BIO submission method. See comment above about vdev_classic.
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic
+ */
+static uint_t zfs_vdev_disk_classic = 1;	/* default classic */
+
+/* Set submission function from module parameter */
+static int
+vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
+{
+	int err = param_set_uint(buf, kp);
+	if (err < 0)
+		return (SET_ERROR(err));
+
+	vdev_disk_io_rw_fn =
+	    zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
+
+	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
+	    zfs_vdev_disk_classic ? "classic" : "new");
+
+	return (0);
+}
+
+/*
+ * At first vdev use, set the submission function from the default value if
+ * it hasn't been set already.
+ */
+static int
+vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+	(void) spa;
+	(void) nv;
+	(void) tsd;
+
+	if (vdev_disk_io_rw_fn == NULL)
+		vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
+		    vdev_classic_physio : vdev_disk_io_rw;
+
+	return (0);
+}
+
vdev_ops_t vdev_disk_ops = {
-	.vdev_op_init = NULL,
+	.vdev_op_init = vdev_disk_init,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,

@@ -1174,3 +1596,10 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");

+ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
+	"Maximum number of data segments to add to an IO request (min 4)");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
+	vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
+	"Use classic BIO submission method");
@@ -3821,11 +3821,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);

-	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
-		if (err == ERESTART)
-			dmu_tx_wait(tx);
-
		dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
		filemap_dirty_folio(page_mapping(pp), page_folio(pp));

@@ -4277,9 +4274,4 @@ EXPORT_SYMBOL(zfs_map);
/* CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-
-/* CSTYLED */
-module_param(zfs_bclone_enabled, uint, 0644);
-MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");

#endif
@@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;
+	int ret;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
-	(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
+	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

-	return (0);
+	return (ret);
}

#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
-	(void) zpl_putpage(&pp->page, wbc, data);
-	return (0);
+	return (zpl_putpage(&pp->page, wbc, data));
}
#endif
@@ -26,13 +26,14 @@
#include <linux/compat.h>
#endif
#include <linux/fs.h>
+#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
+#include <linux/splice.h>
+#endif
#include <sys/file.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
#include <sys/zfeature.h>

-int zfs_bclone_enabled = 0;
-
/*
 * Clone part of a file via block cloning.
 *

@@ -40,7 +41,7 @@ int zfs_bclone_enabled = 0;
 * care of that depending on how it was called.
 */
static ssize_t
-__zpl_clone_file_range(struct file *src_file, loff_t src_off,
+zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
    struct file *dst_file, loff_t dst_off, size_t len)
{
	struct inode *src_i = file_inode(src_file);

@@ -96,14 +97,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
{
	ssize_t ret;

+	/* Flags is reserved for future extensions and must be zero. */
	if (flags != 0)
		return (-EINVAL);

-	/* Try to do it via zfs_clone_range() */
-	ret = __zpl_clone_file_range(src_file, src_off,
+	/* Try to do it via zfs_clone_range() and allow shortening. */
+	ret = zpl_clone_file_range_impl(src_file, src_off,
	    dst_file, dst_off, len);

-#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
+#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
	/*
	 * Since Linux 5.3 the filesystem driver is responsible for executing
	 * an appropriate fallback, and a generic fallback function is provided.

@@ -112,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
	    ret == -EAGAIN)
		ret = generic_copy_file_range(src_file, src_off, dst_file,
		    dst_off, len, flags);
+#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
+	/*
+	 * Since 6.8 the fallback function is called splice_copy_file_range
+	 * and has a slightly different signature.
+	 */
+	if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
+	    ret == -EAGAIN)
+		ret = splice_copy_file_range(src_file, src_off, dst_file,
+		    dst_off, len);
#else
	/*
	 * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal

@@ -119,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
	 */
	if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
		ret = -EOPNOTSUPP;
-#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
+#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */

	return (ret);
}

@@ -137,6 +148,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
 * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
 * range in both files and if they're the same, arrange for them to be backed
 * by the same storage.
+ *
+ * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
+ * if we want. It's designed for filesystems that may need to shorten the
+ * length for alignment, EOF, or any other requirement. ZFS may shorten the
+ * request when there is outstanding dirty data which hasn't been written.
 */
loff_t
zpl_remap_file_range(struct file *src_file, loff_t src_off,

@@ -145,24 +161,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
	if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
		return (-EINVAL);

-	/*
-	 * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
-	 * range if we want. Its designed for filesystems that make data past
-	 * EOF available, and don't want it to be visible in both files. ZFS
-	 * doesn't do that, so we just turn the flag off.
-	 */
-	flags &= ~REMAP_FILE_CAN_SHORTEN;
-
	/* No support for dedup yet */
	if (flags & REMAP_FILE_DEDUP)
		return (-EOPNOTSUPP);

	/* Zero length means to clone everything to the end of the file */
	if (len == 0)
		len = i_size_read(file_inode(src_file)) - src_off;

-	return (__zpl_clone_file_range(src_file, src_off,
-	    dst_file, dst_off, len));
+	ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+	    dst_file, dst_off, len);
+
+	if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
+		ret = -EINVAL;
+
+	return (ret);
}
#endif /* HAVE_VFS_REMAP_FILE_RANGE */

@@ -179,8 +192,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
	if (len == 0)
		len = i_size_read(file_inode(src_file)) - src_off;

-	return (__zpl_clone_file_range(src_file, src_off,
-	    dst_file, dst_off, len));
+	/* The entire length must be cloned or this is an error. */
+	ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
+	    dst_file, dst_off, len);
+
+	if (ret >= 0 && ret != len)
+		ret = -EINVAL;
+
+	return (ret);
}
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */

@@ -214,8 +233,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)

	size_t len = i_size_read(file_inode(src_file));

-	ssize_t ret =
-	    __zpl_clone_file_range(src_file, 0, dst_file, 0, len);
+	ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);

	fput(src_file);

@@ -253,7 +271,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
	if (len == 0)
		len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;

-	ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
+	ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
	    dst_file, fcr.fcr_dest_offset, len);

	fput(src_file);
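For context, the ioctl paths above implement the standard Linux FICLONERANGE interface. A small user-space program that exercises it (this is a usage illustration, not part of the patch; struct file_clone_range and FICLONERANGE come from linux/fs.h):

/*
 * Clone src into dst via FICLONERANGE. src_length == 0 means "to EOF",
 * matching the kernel-side len == 0 handling above.
 */
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>

int
main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
        return (1);
    }
    int src = open(argv[1], O_RDONLY);
    int dst = open(argv[2], O_WRONLY | O_CREAT, 0644);
    if (src < 0 || dst < 0) {
        perror("open");
        return (1);
    }
    struct file_clone_range fcr = {
        .src_fd = src,
        .src_offset = 0,
        .src_length = 0,    /* clone everything to end of source */
        .dest_offset = 0,
    };
    if (ioctl(dst, FICLONERANGE, &fcr) < 0) {
        perror("FICLONERANGE");
        return (1);
    }
    return (0);
}

Note that on ZFS this only succeeds when block cloning is enabled (zfs_bclone_enabled) and the pool feature is active; otherwise the clone request falls back or fails as described in the copy_file_range fallback logic above.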
@@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
	return (ret);
}

+#if defined(__linux__) && defined(_KERNEL)
+int
+abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
+    abd_iter_page_func_t *func, void *private)
+{
+	struct abd_iter aiter;
+	int ret = 0;
+
+	if (size == 0)
+		return (0);
+
+	abd_verify(abd);
+	ASSERT3U(off + size, <=, abd->abd_size);
+
+	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
+
+	while (size > 0) {
+		IMPLY(abd_is_gang(abd), c_abd != NULL);
+
+		abd_iter_page(&aiter);
+
+		size_t len = MIN(aiter.iter_page_dsize, size);
+		ASSERT3U(len, >, 0);
+
+		ret = func(aiter.iter_page, aiter.iter_page_doff,
+		    len, private);
+
+		aiter.iter_page = NULL;
+		aiter.iter_page_doff = 0;
+		aiter.iter_page_dsize = 0;
+
+		if (ret != 0)
+			break;
+
+		size -= len;
+		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
+	}
+
+	return (ret);
+}
+#endif
+
struct buf_arg {
	void *arg_buf;
};
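The new abd_iterate_page_func() walks a buffer one page at a time, handing each (page, offset, length) chunk to a callback and stopping early on a nonzero return. A minimal user-space model of that iteration pattern, with a tiny fake "page" size so the output stays short (the helper names here are illustrative, not the ABD API):

/*
 * Page-at-a-time callback iteration over a flat buffer.
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 16  /* tiny stand-in for the real page size */

typedef int (iter_page_func_t)(const char *page, size_t off, size_t len,
    void *priv);

static int
iterate_pages(const char *buf, size_t off, size_t size,
    iter_page_func_t *func, void *priv)
{
    int ret = 0;
    while (size > 0 && ret == 0) {
        size_t doff = off % PAGE_SZ;    /* offset within this page */
        size_t len = PAGE_SZ - doff;    /* rest of this page */
        if (len > size)
            len = size;
        ret = func(buf + (off - doff), doff, len, priv);
        off += len;
        size -= len;
    }
    return (ret);
}

static int
print_chunk(const char *page, size_t off, size_t len, void *priv)
{
    (void) priv;
    printf("page@%p off=%zu len=%zu\n", (const void *)page, off, len);
    return (0);     /* a nonzero return would abort the walk */
}

int
main(void)
{
    char buf[64];
    memset(buf, 0, sizeof (buf));
    /* Start mid-"page" so the first chunk comes out shortened. */
    return (iterate_pages(buf, 5, 40, print_chunk, NULL));
}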
module/zfs/arc.c

@@ -3883,7 +3883,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,

	ASSERT3P(marker, !=, NULL);

-	mls = multilist_sublist_lock(ml, idx);
+	mls = multilist_sublist_lock_idx(ml, idx);

	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
	    hdr = multilist_sublist_prev(mls, marker)) {

@@ -3995,6 +3995,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
	return (bytes_evicted);
}

+static arc_buf_hdr_t *
+arc_state_alloc_marker(void)
+{
+	arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+	/*
+	 * A b_spa of 0 is used to indicate that this header is
+	 * a marker. This fact is used in arc_evict_state_impl().
+	 */
+	marker->b_spa = 0;
+
+	return (marker);
+}
+
+static void
+arc_state_free_marker(arc_buf_hdr_t *marker)
+{
+	kmem_cache_free(hdr_full_cache, marker);
+}
+
/*
 * Allocate an array of buffer headers used as placeholders during arc state
 * eviction.

@@ -4005,16 +4025,8 @@ arc_state_alloc_markers(int count)
	arc_buf_hdr_t **markers;

	markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
-	for (int i = 0; i < count; i++) {
-		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
-		/*
-		 * A b_spa of 0 is used to indicate that this header is
-		 * a marker. This fact is used in arc_evict_state_impl().
-		 */
-		markers[i]->b_spa = 0;
-
-	}
+	for (int i = 0; i < count; i++)
+		markers[i] = arc_state_alloc_marker();
	return (markers);
}

@@ -4022,7 +4034,7 @@ static void
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
{
	for (int i = 0; i < count; i++)
-		kmem_cache_free(hdr_full_cache, markers[i]);
+		arc_state_free_marker(markers[i]);
	kmem_free(markers, sizeof (*markers) * count);
}

@@ -4066,7 +4078,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls;

-		mls = multilist_sublist_lock(ml, i);
+		mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_insert_tail(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

@@ -4131,7 +4143,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
	}

	for (int i = 0; i < num_sublists; i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_remove(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

@@ -8639,7 +8651,7 @@ l2arc_sublist_lock(int list_num)
	 * sublists being selected.
	 */
	idx = multilist_get_random_index(ml);
-	return (multilist_sublist_lock(ml, idx));
+	return (multilist_sublist_lock_idx(ml, idx));
}

/*

@@ -9051,9 +9063,9 @@ l2arc_blk_fetch_done(zio_t *zio)
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
-	arc_buf_hdr_t *hdr, *hdr_prev, *head;
-	uint64_t write_asize, write_psize, write_lsize, headroom;
-	boolean_t full;
+	arc_buf_hdr_t *hdr, *head, *marker;
+	uint64_t write_asize, write_psize, headroom;
+	boolean_t full, from_head = !arc_warm;
	l2arc_write_callback_t *cb = NULL;
	zio_t *pio, *wzio;
	uint64_t guid = spa_load_guid(spa);

@@ -9062,10 +9074,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
	ASSERT3P(dev->l2ad_vdev, !=, NULL);

	pio = NULL;
-	write_lsize = write_asize = write_psize = 0;
+	write_asize = write_psize = 0;
	full = B_FALSE;
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+	marker = arc_state_alloc_marker();

	/*
	 * Copy buffers for L2ARC writing.

@@ -9080,40 +9093,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
			continue;
		}

-		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
		uint64_t passed_sz = 0;
-
-		VERIFY3P(mls, !=, NULL);
-
-		/*
-		 * L2ARC fast warmup.
-		 *
-		 * Until the ARC is warm and starts to evict, read from the
-		 * head of the ARC lists rather than the tail.
-		 */
-		if (arc_warm == B_FALSE)
-			hdr = multilist_sublist_head(mls);
-		else
-			hdr = multilist_sublist_tail(mls);

		headroom = target_sz * l2arc_headroom;
		if (zfs_compressed_arc_enabled)
			headroom = (headroom * l2arc_headroom_boost) / 100;

-		for (; hdr; hdr = hdr_prev) {
+		/*
+		 * Until the ARC is warm and starts to evict, read from the
+		 * head of the ARC lists rather than the tail.
+		 */
+		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+		ASSERT3P(mls, !=, NULL);
+		if (from_head)
+			hdr = multilist_sublist_head(mls);
+		else
+			hdr = multilist_sublist_tail(mls);
+
+		while (hdr != NULL) {
			kmutex_t *hash_lock;
			abd_t *to_write = NULL;

-			if (arc_warm == B_FALSE)
-				hdr_prev = multilist_sublist_next(mls, hdr);
-			else
-				hdr_prev = multilist_sublist_prev(mls, hdr);
-
			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
-				/*
-				 * Skip this buffer rather than waiting.
-				 */
+skip:
+				/* Skip this buffer rather than waiting. */
+				if (from_head)
+					hdr = multilist_sublist_next(mls, hdr);
+				else
+					hdr = multilist_sublist_prev(mls, hdr);
				continue;
			}

@@ -9128,11 +9135,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
-				continue;
+				goto skip;
			}

			ASSERT(HDR_HAS_L1HDR(hdr));
-
			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
			ASSERT3U(arc_hdr_size(hdr), >, 0);
			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||

@@ -9154,12 +9160,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
			}

			/*
-			 * We rely on the L1 portion of the header below, so
-			 * it's invalid for this header to have been evicted out
-			 * of the ghost cache, prior to being written out. The
-			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+			 * We should not sleep with sublist lock held or it
+			 * may block ARC eviction. Insert a marker to save
+			 * the position and drop the lock.
			 */
-			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+			if (from_head) {
+				multilist_sublist_insert_after(mls, hdr,
+				    marker);
+			} else {
+				multilist_sublist_insert_before(mls, hdr,
+				    marker);
+			}
+			multilist_sublist_unlock(mls);
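The marker technique introduced above deserves a moment: rather than holding the sublist lock across slow work (compression, I/O setup), the writer parks a dummy node at its position, drops the lock, and later resumes from the marker. Concurrent removals of real headers cannot strand the iterator because the marker stays in the list. A compact user-space sketch of the same idea (plain pthread mutex and hand-rolled list, not the multilist API):

/*
 * Iterate a doubly-linked list, dropping the lock between elements by
 * leaving a marker node behind.
 */
#include <pthread.h>
#include <stdio.h>

typedef struct node {
    struct node *next, *prev;
    int val;                /* -1 identifies a marker node */
} node_t;

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void
insert_after(node_t *pos, node_t *n)
{
    n->prev = pos;
    n->next = pos->next;
    if (pos->next != NULL)
        pos->next->prev = n;
    pos->next = n;
}

static void
remove_node(node_t *n)
{
    if (n->prev != NULL)
        n->prev->next = n->next;
    if (n->next != NULL)
        n->next->prev = n->prev;
}

int
main(void)
{
    node_t nodes[4] = {
        { &nodes[1], NULL, 0 }, { &nodes[2], &nodes[0], 1 },
        { &nodes[3], &nodes[1], 2 }, { NULL, &nodes[2], 3 },
    };
    node_t marker = { NULL, NULL, -1 };

    pthread_mutex_lock(&lock);
    node_t *cur = &nodes[0];
    while (cur != NULL) {
        printf("visiting %d\n", cur->val);
        insert_after(cur, &marker);     /* save position */
        pthread_mutex_unlock(&lock);
        /* ... slow work on cur happens here with the lock dropped ... */
        pthread_mutex_lock(&lock);
        cur = marker.next;              /* resume from the marker */
        remove_node(&marker);
    }
    pthread_mutex_unlock(&lock);
    return (0);
}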
			/*
			 * If this header has b_rabd, we can use this since it

@@ -9190,32 +9202,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
				    &to_write);
				if (ret != 0) {
					arc_hdr_clear_flags(hdr,
-					    ARC_FLAG_L2_WRITING);
+					    ARC_FLAG_L2CACHE);
					mutex_exit(hash_lock);
-					continue;
+					goto next;
				}

				l2arc_free_abd_on_write(to_write, asize, type);
			}

+			hdr->b_l2hdr.b_dev = dev;
+			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+			hdr->b_l2hdr.b_hits = 0;
+			hdr->b_l2hdr.b_arcs_state =
+			    hdr->b_l1hdr.b_state->arcs_state;
+			mutex_enter(&dev->l2ad_mtx);
			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
-				mutex_enter(&dev->l2ad_mtx);
				list_insert_head(&dev->l2ad_buflist, head);
-				mutex_exit(&dev->l2ad_mtx);
+			}
+			list_insert_head(&dev->l2ad_buflist, hdr);
+			mutex_exit(&dev->l2ad_mtx);
+			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+			    ARC_FLAG_L2_WRITING);

+			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
+			l2arc_hdr_arcstats_increment(hdr);
+
+			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
+			mutex_exit(hash_lock);
+
+			if (pio == NULL) {
				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				/*
				 * Create a list to save allocated abd buffers
				 * for l2arc_log_blk_commit().
				 */
				list_create(&cb->l2wcb_abd_list,
				    sizeof (l2arc_lb_abd_buf_t),
				    offsetof(l2arc_lb_abd_buf_t, node));

@@ -9223,54 +9248,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
				    ZIO_FLAG_CANFAIL);
			}

-			hdr->b_l2hdr.b_dev = dev;
-			hdr->b_l2hdr.b_hits = 0;
-
-			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
-			hdr->b_l2hdr.b_arcs_state =
-			    hdr->b_l1hdr.b_state->arcs_state;
-			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
-			mutex_enter(&dev->l2ad_mtx);
-			list_insert_head(&dev->l2ad_buflist, hdr);
-			mutex_exit(&dev->l2ad_mtx);
-
-			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
-			    arc_hdr_size(hdr), hdr);
-
			wzio = zio_write_phys(pio, dev->l2ad_vdev,
-			    hdr->b_l2hdr.b_daddr, asize, to_write,
+			    dev->l2ad_hand, asize, to_write,
			    ZIO_CHECKSUM_OFF, NULL, hdr,
			    ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

-			write_lsize += HDR_GET_LSIZE(hdr);
			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
-			zio_nowait(wzio);

			write_psize += psize;
			write_asize += asize;
			dev->l2ad_hand += asize;
-			l2arc_hdr_arcstats_increment(hdr);
			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);

-			mutex_exit(hash_lock);
-
-			/*
-			 * Append buf info to current log and commit if full.
-			 * arcstat_l2_{size,asize} kstats are updated
-			 * internally.
-			 */
-			if (l2arc_log_blk_insert(dev, hdr)) {
-				/*
-				 * l2ad_hand will be adjusted in
-				 * l2arc_log_blk_commit().
-				 */
+			if (commit) {
+				/* l2ad_hand will be adjusted inside. */
				write_asize +=
				    l2arc_log_blk_commit(dev, pio, cb);
			}

+			zio_nowait(wzio);
+
+next:
+			multilist_sublist_lock(mls);
+			if (from_head)
+				hdr = multilist_sublist_next(mls, marker);
+			else
+				hdr = multilist_sublist_prev(mls, marker);
+			multilist_sublist_remove(mls, marker);
		}

		multilist_sublist_unlock(mls);

@@ -9279,9 +9284,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
			break;
	}

+	arc_state_free_marker(marker);
+
	/* No buffers selected for writing? */
	if (pio == NULL) {
-		ASSERT0(write_lsize);
+		ASSERT0(write_psize);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);

@@ -10609,7 +10616,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
	L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
	L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
	L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
-	L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+	L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);

	dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
	    HDR_GET_PSIZE(hdr));
@@ -754,7 +754,7 @@ static void
dbuf_evict_one(void)
{
	int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
-	multilist_sublist_t *mls = multilist_sublist_lock(
+	multilist_sublist_t *mls = multilist_sublist_lock_idx(
	    &dbuf_caches[DB_DBUF_CACHE].cache, idx);

	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

@@ -1542,17 +1542,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 * returning.
 */
static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
    db_lock_type_t dblt, const void *tag)
{
-	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_FLAG_NOWAIT;
	int err, zio_flags;
	blkptr_t bp, *bpp;

-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);

@@ -1627,8 +1624,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
	if (err != 0)
		goto early_unlock;

-	DB_DNODE_EXIT(db);
-
	db->db_state = DB_READ;
	DTRACE_SET_STATE(db, "read issued");
	mutex_exit(&db->db_mtx);

@@ -1653,12 +1648,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
	 * parent's rwlock, which would be a lock ordering violation.
	 */
	dmu_buf_unlock_parent(db, dblt, tag);
-	(void) arc_read(zio, db->db_objset->os_spa, bpp,
+	return (arc_read(zio, db->db_objset->os_spa, bpp,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
-	    &aflags, &zb);
-	return (err);
+	    &aflags, &zb));

early_unlock:
-	DB_DNODE_EXIT(db);
	mutex_exit(&db->db_mtx);
	dmu_buf_unlock_parent(db, dblt, tag);
	return (err);

@@ -1743,7 +1737,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
}

int
-dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
{
	int err = 0;
	boolean_t prefetch;

@@ -1759,7 +1753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
	dn = DB_DNODE(db);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
+	    (flags & DB_RF_NOPREFETCH) == 0;

	mutex_enter(&db->db_mtx);
	if (flags & DB_RF_PARTIAL_FIRST)

@@ -1806,13 +1800,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)

		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);

-		if (zio == NULL && (db->db_state == DB_NOFILL ||
+		if (pio == NULL && (db->db_state == DB_NOFILL ||
		    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
			spa_t *spa = dn->dn_objset->os_spa;
-			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+			pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
			need_wait = B_TRUE;
		}
-		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+		err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
		/*
		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
		 * for us

@@ -1833,9 +1827,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
		 */
		if (need_wait) {
			if (err == 0)
-				err = zio_wait(zio);
+				err = zio_wait(pio);
			else
-				VERIFY0(zio_wait(zio));
+				(void) zio_wait(pio);
+			pio = NULL;
		}
	} else {
		/*

@@ -1862,7 +1857,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
			ASSERT(db->db_state == DB_READ ||
			    (flags & DB_RF_HAVESTRUCT) == 0);
			DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-			    db, zio_t *, zio);
+			    db, zio_t *, pio);
			cv_wait(&db->db_changed, &db->db_mtx);
		}
		if (db->db_state == DB_UNCACHED)

@@ -1871,6 +1866,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
		}
	}

+	if (pio && err != 0) {
+		zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+		zio->io_error = err;
+		zio_nowait(zio);
+	}
+
	return (err);
}
@@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
		if (db == NULL) {
-			if (zs)
-				dmu_zfetch_run(zs, missed, B_TRUE);
+			if (zs) {
+				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
+				    B_TRUE);
+			}
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			if (read)

@@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
	zfs_racct_write(length, nblks);

	if (zs)
-		dmu_zfetch_run(zs, missed, B_TRUE);
+		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
	rw_exit(&dn->dn_struct_rwlock);

	if (read) {
@@ -1633,7 +1633,7 @@ sync_dnodes_task(void *arg)
	sync_dnodes_arg_t *sda = arg;

	multilist_sublist_t *ms =
-	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);

	dmu_objset_sync_dnodes(ms, sda->sda_tx);

@@ -1987,8 +1987,8 @@ userquota_updates_task(void *arg)
	dnode_t *dn;
	userquota_cache_t cache = { { 0 } };

-	multilist_sublist_t *list =
-	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+	multilist_sublist_t *list = multilist_sublist_lock_idx(
+	    &os->os_synced_dnodes, uua->uua_sublist_idx);

	ASSERT(multilist_sublist_head(list) == NULL ||
	    dmu_objset_userused_enabled(os));

@@ -2070,8 +2070,8 @@ dnode_rele_task(void *arg)
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;

-	multilist_sublist_t *list =
-	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+	multilist_sublist_t *list = multilist_sublist_lock_idx(
+	    &os->os_synced_dnodes, uua->uua_sublist_idx);

	dnode_t *dn;
	while ((dn = multilist_sublist_head(list)) != NULL) {

@@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
		dmu_buf_rele(db, FTAG);
		dnode_rele(dn, FTAG);
	}

+	/*
+	 * If the receive fails, we want the resume stream to start with the
+	 * same record that we last successfully received. There is no way to
+	 * request resume from the object record, but we can benefit from the
+	 * fact that sender always sends object record before anything else,
+	 * after which it will "resend" data at offset 0 and resume normally.
+	 */
+	save_resume_state(rwa, drro->drr_object, 0, tx);
+
	dmu_tx_commit(tx);

	return (0);
@@ -65,9 +65,16 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024;
#endif
/* max bytes to prefetch indirects for per stream (default 64MB) */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
+/* max request reorder distance within a stream (default 16MB) */
+unsigned int zfetch_max_reorder = 16 * 1024 * 1024;
+/* Max log2 fraction of holes in a stream */
+unsigned int zfetch_hole_shift = 2;

typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
+	kstat_named_t zfetchstat_future;
+	kstat_named_t zfetchstat_stride;
+	kstat_named_t zfetchstat_past;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_max_streams;
	kstat_named_t zfetchstat_io_issued;

@@ -76,6 +83,9 @@ typedef struct zfetch_stats {

static zfetch_stats_t zfetch_stats = {
	{ "hits",		KSTAT_DATA_UINT64 },
+	{ "future",		KSTAT_DATA_UINT64 },
+	{ "stride",		KSTAT_DATA_UINT64 },
+	{ "past",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 },
	{ "max_streams",	KSTAT_DATA_UINT64 },
	{ "io_issued",		KSTAT_DATA_UINT64 },

@@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = {

struct {
	wmsum_t zfetchstat_hits;
+	wmsum_t zfetchstat_future;
+	wmsum_t zfetchstat_stride;
+	wmsum_t zfetchstat_past;
	wmsum_t zfetchstat_misses;
	wmsum_t zfetchstat_max_streams;
	wmsum_t zfetchstat_io_issued;

@@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
		return (EACCES);
	zs->zfetchstat_hits.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_hits);
+	zs->zfetchstat_future.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_future);
+	zs->zfetchstat_stride.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_stride);
+	zs->zfetchstat_past.value.ui64 =
+	    wmsum_value(&zfetch_sums.zfetchstat_past);
	zs->zfetchstat_misses.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_misses);
	zs->zfetchstat_max_streams.value.ui64 =

@@ -122,6 +141,9 @@ void
zfetch_init(void)
{
	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
+	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);

@@ -147,6 +169,9 @@ zfetch_fini(void)
}

	wmsum_fini(&zfetch_sums.zfetchstat_hits);
+	wmsum_fini(&zfetch_sums.zfetchstat_future);
+	wmsum_fini(&zfetch_sums.zfetchstat_stride);
+	wmsum_fini(&zfetch_sums.zfetchstat_past);
	wmsum_fini(&zfetch_sums.zfetchstat_misses);
	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);

@@ -222,22 +247,22 @@ static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs, *zs_next, *zs_old = NULL;
-	hrtime_t now = gethrtime(), t;
+	uint_t now = gethrestime_sec(), t;

	ASSERT(MUTEX_HELD(&zf->zf_lock));

	/*
	 * Delete too old streams, reusing the first found one.
	 */
-	t = now - SEC2NSEC(zfetch_max_sec_reap);
+	t = now - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		/*
		 * Skip if still active. 1 -- zf_stream reference.
		 */
-		if (zfs_refcount_count(&zs->zs_refs) != 1)
+		if ((int)(zs->zs_atime - t) >= 0)
			continue;
-		if (zs->zs_atime > t)
+		if (zfs_refcount_count(&zs->zs_refs) != 1)
			continue;
		if (zs_old)
			dmu_zfetch_stream_remove(zf, zs);

@@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
	}
	if (zs_old) {
		zs = zs_old;
+		list_remove(&zf->zf_stream, zs);
		goto reuse;
	}
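Note the idiom introduced above: stream access times moved from a 64-bit hrtime_t to a 32-bit seconds counter, and comparisons were rewritten from "zs_atime > t" to "(int)(zs_atime - t) >= 0". The unsigned subtraction followed by a signed cast orders two timestamps correctly even if the 32-bit counter wraps between them, as long as they lie within about 68 years of each other. A standalone demonstration of why the naive form fails:

/*
 * Wrap-safe time comparison with a 32-bit seconds counter.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t now = 5;           /* counter has just wrapped past zero */
    uint32_t atime = now - 10;  /* access 10 seconds ago: 0xFFFFFFFB */
    uint32_t t = now - 3;       /* cutoff: "within the last 3 seconds" */

    /* Naive comparison thinks the old access is newer than the cutoff. */
    printf("naive:     %s\n", atime >= t ? "fresh" : "stale"); /* fresh */

    /* The wrap-safe form gets it right. */
    printf("wrap-safe: %s\n",
        (int32_t)(atime - t) >= 0 ? "fresh" : "stale");        /* stale */
    return (0);
}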
@@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
	 * for all the streams to be non-overlapping.
	 */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
-	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
	    zfetch_max_distance));
	if (zf->zf_numstreams >= max_streams) {
-		t = now - SEC2NSEC(zfetch_min_sec_reap);
+		t = now - zfetch_min_sec_reap;
		for (zs = list_head(&zf->zf_stream); zs != NULL;
		    zs = list_next(&zf->zf_stream, zs)) {
+			if ((int)(zs->zs_atime - t) >= 0)
+				continue;
			if (zfs_refcount_count(&zs->zs_refs) != 1)
				continue;
-			if (zs->zs_atime > t)
-				continue;
-			if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
+			if (zs_old == NULL ||
+			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
				zs_old = zs;
		}
		if (zs_old) {
			zs = zs_old;
+			list_remove(&zf->zf_stream, zs);
			goto reuse;
		}
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);

@@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
	}

	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zs->zs_fetch = zf;
	zfs_refcount_create(&zs->zs_callers);
	zfs_refcount_create(&zs->zs_refs);
	/* One reference for zf_stream. */
	zfs_refcount_add(&zs->zs_refs, NULL);
	zf->zf_numstreams++;
-	list_insert_head(&zf->zf_stream, zs);

reuse:
+	list_insert_head(&zf->zf_stream, zs);
	zs->zs_blkid = blkid;
+	/* Allow immediate stream reuse until first hit. */
+	zs->zs_atime = now - zfetch_min_sec_reap;
+	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
	zs->zs_pf_dist = 0;
+	zs->zs_ipf_dist = 0;
	zs->zs_pf_start = blkid;
	zs->zs_pf_end = blkid;
-	zs->zs_ipf_dist = 0;
	zs->zs_ipf_start = blkid;
	zs->zs_ipf_end = blkid;
-	/* Allow immediate stream reuse until first hit. */
-	zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
	zs->zs_missed = B_FALSE;
	zs->zs_more = B_FALSE;
}
@@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
		aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}

+/*
+ * Process stream hit access for nblks blocks starting at zs_blkid. Return
+ * number of blocks to proceed for after aggregation with future ranges.
+ */
+static uint64_t
+dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
+{
+	uint_t i, j;
+
+	/* Optimize sequential accesses (no future ranges). */
+	if (zs->zs_ranges[0].start == 0)
+		goto done;
+
+	/* Look for intersections with further ranges. */
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > nblks)
+			break;
+		if (r->end >= nblks) {
+			nblks = r->end;
+			i++;
+			break;
+		}
+	}
+
+	/* Delete all found intersecting ranges, updates remaining. */
+	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
+		if (zs->zs_ranges[i].start == 0)
+			break;
+		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
+		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
+		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
+		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
+	}
+	if (j < ZFETCH_RANGES) {
+		zs->zs_ranges[j].start = 0;
+		zs->zs_ranges[j].end = 0;
+	}
+
+done:
+	zs->zs_blkid += nblks;
+	return (nblks);
+}
+
+/*
+ * Process future stream access for nblks blocks starting at blkid. Return
+ * number of blocks to proceed for if future ranges reach fill threshold.
+ */
+static uint64_t
+dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
+{
+	ASSERT3U(blkid, >, zs->zs_blkid);
+	blkid -= zs->zs_blkid;
+	ASSERT3U(blkid + nblks, <=, UINT16_MAX);
+
+	/* Search for first and last intersection or insert point. */
+	uint_t f = ZFETCH_RANGES, l = 0, i;
+	for (i = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0 || r->start > blkid + nblks)
+			break;
+		if (r->end < blkid)
+			continue;
+		if (f > i)
+			f = i;
+		if (l < i)
+			l = i;
+	}
+	if (f <= l) {
+		/* Got some intersecting range, expand it if needed. */
+		if (zs->zs_ranges[f].start > blkid)
+			zs->zs_ranges[f].start = blkid;
+		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
+		if (f < l) {
+			/* Got more than one intersection, remove others. */
+			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
+				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
+				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
+			}
+			zs->zs_ranges[ZFETCH_RANGES - 1].start = 0;
+			zs->zs_ranges[ZFETCH_RANGES - 1].end = 0;
+		}
+	} else if (i < ZFETCH_RANGES) {
+		/* Got no intersecting ranges, insert new one. */
+		for (l = ZFETCH_RANGES - 1; l > i; l--) {
+			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
+			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
+		}
+		zs->zs_ranges[i].start = blkid;
+		zs->zs_ranges[i].end = blkid + nblks;
+	} else {
+		/* No space left to insert. Drop the range. */
+		return (0);
+	}
+
+	/* Check if with the new access addition we reached fill threshold. */
+	if (zfetch_hole_shift >= 16)
+		return (0);
+	uint_t hole = 0;
+	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
+		zsrange_t *r = &zs->zs_ranges[i];
+		if (r->start == 0)
+			break;
+		hole += r->start - f;
+		f = r->end;
+		if (hole <= r->end >> zfetch_hole_shift)
+			l = r->end;
+	}
+	if (l > 0)
+		return (dmu_zfetch_hit(zs, l));
+
+	return (0);
+}
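The zs_ranges bookkeeping above can be hard to picture, so here is a reduced user-space model: a small, sorted, fixed-size array of block ranges in which start == 0 marks a free slot (in the real code offsets are relative to the stream head and strictly positive, so 0 is never a valid start). This sketch only shows insert-or-merge of a single overlapping entry; the multi-range merge, hole accounting, and hit aggregation in the real functions are omitted.

/*
 * Simplified insert-and-merge over a fixed array of [start,end) ranges.
 */
#include <stdint.h>
#include <stdio.h>

#define NRANGES 4

typedef struct { uint16_t start, end; } range_t;

static void
insert_range(range_t *r, uint16_t start, uint16_t end)
{
    for (int i = 0; i < NRANGES; i++) {
        if (r[i].start == 0) {      /* free slot: append here */
            r[i].start = start;
            r[i].end = end;
            return;
        }
        if (start <= r[i].end && end >= r[i].start) {
            /* Overlap or adjacency: grow the existing range. */
            if (start < r[i].start)
                r[i].start = start;
            if (end > r[i].end)
                r[i].end = end;
            return;
        }
    }
    /* No slot and no overlap: drop the range, as the real code does. */
}

int
main(void)
{
    range_t r[NRANGES] = { 0 };
    insert_range(r, 10, 12);
    insert_range(r, 20, 24);
    insert_range(r, 12, 16);    /* merges with [10,12) into [10,16) */
    for (int i = 0; i < NRANGES && r[i].start != 0; i++)
        printf("[%u,%u)\n", r[i].start, r[i].end);
    return (0);
}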
/*
 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
 * associates dnode access specified with blkid and nblks arguments with

@@ -365,53 +507,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
	mutex_enter(&zf->zf_lock);

	/*
-	 * Find matching prefetch stream. Depending on whether the accesses
+	 * Find perfect prefetch stream. Depending on whether the accesses
	 * are block-aligned, first block of the new access may either follow
	 * the last block of the previous access, or be equal to it.
	 */
+	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
+	uint64_t end_blkid = blkid + nblks;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
-			break;
+			goto hit;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
-			break;
+			goto hit;
		}
	}

	/*
-	 * If the file is ending, remove the matching stream if found.
-	 * If not found then it is too late to create a new one now.
+	 * Find close enough prefetch stream. Access crossing stream position
+	 * is a hit in its new part. Access ahead of stream position considered
+	 * a hit for metadata prefetch, since we do not care about fill percent,
+	 * or stored for future otherwise. Access behind stream position is
+	 * silently ignored, since we already skipped it reaching fill percent.
	 */
-	uint64_t end_of_access_blkid = blkid + nblks;
-	if (end_of_access_blkid >= maxblkid) {
-		if (zs != NULL)
-			dmu_zfetch_stream_remove(zf, zs);
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
-	}
+	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
+	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid > zs->zs_blkid) {
+			if (end_blkid <= zs->zs_blkid + max_reorder) {
+				if (!fetch_data) {
+					nblks = dmu_zfetch_hit(zs,
+					    end_blkid - zs->zs_blkid);
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+					goto future;
+				}
+				nblks = dmu_zfetch_future(zs, blkid, nblks);
+				if (nblks > 0)
+					ZFETCHSTAT_BUMP(zfetchstat_stride);
+				else
+					ZFETCHSTAT_BUMP(zfetchstat_future);
+				goto future;
+			}
+		} else if (end_blkid >= zs->zs_blkid) {
+			nblks -= zs->zs_blkid - blkid;
+			blkid += zs->zs_blkid - blkid;
+			goto hit;
+		} else if (end_blkid + max_reorder > zs->zs_blkid &&
+		    (int)(zs->zs_atime - t) >= 0) {
+			ZFETCHSTAT_BUMP(zfetchstat_past);
+			zs->zs_atime = gethrestime_sec();
+			goto out;
+		}
+	}

-	/* Exit if we already prefetched this block before. */
-	if (nblks == 0) {
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		return (NULL);
-	}
-
-	if (zs == NULL) {
-		/*
-		 * This access is not part of any existing stream. Create
-		 * a new stream for it.
-		 */
-		dmu_zfetch_stream_create(zf, end_of_access_blkid);
-		mutex_exit(&zf->zf_lock);
-		if (!have_lock)
-			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
-		return (NULL);
-	}
+	/*
+	 * This access is not part of any existing stream. Create a new
+	 * stream for it unless we are at the end of file.
+	 */
+	if (end_blkid < maxblkid)
+		dmu_zfetch_stream_create(zf, end_blkid);
+	mutex_exit(&zf->zf_lock);
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+	ZFETCHSTAT_BUMP(zfetchstat_misses);
+	return (NULL);
+
+hit:
+	nblks = dmu_zfetch_hit(zs, nblks);
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+future:
+	zs->zs_atime = gethrestime_sec();
+
+	/* Exit if we already prefetched for this position before. */
+	if (nblks == 0)
+		goto out;
+
+	/* If the file is ending, remove the stream. */
+	end_blkid = zs->zs_blkid;
+	if (end_blkid >= maxblkid) {
+		dmu_zfetch_stream_remove(zf, zs);
+out:
+		mutex_exit(&zf->zf_lock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return (NULL);
+	}

@@ -427,7 +608,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
	 * than ~6% of ARC held by active prefetches. It should help with
	 * getting out of RAM on some badly mispredicted read patterns.
	 */
-	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
	unsigned int nbytes = nblks << dbs;
	unsigned int pf_nblks;
	if (fetch_data) {

@@ -447,10 +627,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
	} else {
		pf_nblks = 0;
	}
-	if (zs->zs_pf_start < end_of_access_blkid)
-		zs->zs_pf_start = end_of_access_blkid;
-	if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
-		zs->zs_pf_end = end_of_access_blkid + pf_nblks;
+	if (zs->zs_pf_start < end_blkid)
+		zs->zs_pf_start = end_blkid;
+	if (zs->zs_pf_end < end_blkid + pf_nblks)
+		zs->zs_pf_end = end_blkid + pf_nblks;

	/*
	 * Do the same for indirects, starting where we will stop reading

@@ -468,9 +648,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

-	zs->zs_blkid = end_of_access_blkid;
-	/* Protect the stream from reclamation. */
-	zs->zs_atime = gethrtime();
	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);

@@ -478,15 +655,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
-
-	ZFETCHSTAT_BUMP(zfetchstat_hits);
	return (zs);
}

void
-dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
+dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
+    boolean_t have_lock)
{
-	zfetch_t *zf = zs->zs_fetch;
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

@@ -562,7 +737,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
-		dmu_zfetch_run(zs, missed, have_lock);
+		dmu_zfetch_run(zf, zs, missed, have_lock);
}

ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,

@@ -585,3 +760,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
+	"Max request reorder distance within a stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
+	"Max log2 fraction of holes in a stream");
@@ -641,7 +641,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
{
	multilist_t *ml = &mc->mc_metaslab_txg_list;
	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
-		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
		metaslab_t *msp = multilist_sublist_head(mls);
		multilist_sublist_unlock(mls);
		while (msp != NULL) {

@@ -658,7 +658,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
			i--;
			break;
		}
-		mls = multilist_sublist_lock(ml, i);
+		mls = multilist_sublist_lock_idx(ml, i);
		metaslab_t *next_msp = multilist_sublist_next(mls, msp);
		multilist_sublist_unlock(mls);
		if (txg >

@@ -2190,12 +2190,12 @@ metaslab_potentially_evict(metaslab_class_t *mc)
		unsigned int idx = multilist_get_random_index(
		    &mc->mc_metaslab_txg_list);
		multilist_sublist_t *mls =
-		    multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
+		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
		metaslab_t *msp = multilist_sublist_head(mls);
		multilist_sublist_unlock(mls);
		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
		    inuse * size) {
-			VERIFY3P(mls, ==, multilist_sublist_lock(
+			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
			    &mc->mc_metaslab_txg_list, idx));
			ASSERT3U(idx, ==,
			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
	return (random_in_range(ml->ml_num_sublists));
}

void
multilist_sublist_lock(multilist_sublist_t *mls)
{
	mutex_enter(&mls->mls_lock);
}

/* Lock and return the sublist specified at the given index */
multilist_sublist_t *
multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
{
	multilist_sublist_t *mls;

@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
multilist_sublist_t *
multilist_sublist_lock_obj(multilist_t *ml, void *obj)
{
	return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
	return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
}

void

@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
	list_insert_tail(&mls->mls_list, obj);
}

/* please see comment above multilist_sublist_insert_head */
void
multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
{
	ASSERT(MUTEX_HELD(&mls->mls_lock));
	list_insert_after(&mls->mls_list, prev, obj);
}

/* please see comment above multilist_sublist_insert_head */
void
multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
{
	ASSERT(MUTEX_HELD(&mls->mls_lock));
	list_insert_before(&mls->mls_list, next, obj);
}

/*
 * Move the object one element forward in the list.
 *
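The rename frees the name `multilist_sublist_lock()` for the new by-pointer overload added above, while `multilist_sublist_lock_idx()` locks by index. A minimal userspace model of that API split, using pthreads; the names and struct layout here are illustrative stand-ins, not ZFS code:

/*
 * Userspace model of the multilist locking API after this change:
 * lock-by-index vs. lock-a-known-sublist. Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

#define	NUM_SUBLISTS	4

typedef struct sublist {
	pthread_mutex_t sl_lock;
	int sl_len;			/* stand-in for the list payload */
} sublist_t;

typedef struct mlist {
	sublist_t ml_sublists[NUM_SUBLISTS];
} mlist_t;

/* Lock a sublist we already hold a pointer to (new overload). */
static void
sublist_lock(sublist_t *sl)
{
	pthread_mutex_lock(&sl->sl_lock);
}

/* Lock and return the sublist at the given index (the _idx variant). */
static sublist_t *
sublist_lock_idx(mlist_t *ml, unsigned int idx)
{
	sublist_t *sl = &ml->ml_sublists[idx % NUM_SUBLISTS];
	pthread_mutex_lock(&sl->sl_lock);
	return (sl);
}

static void
sublist_unlock(sublist_t *sl)
{
	pthread_mutex_unlock(&sl->sl_lock);
}

int
main(void)
{
	mlist_t ml;
	for (int i = 0; i < NUM_SUBLISTS; i++) {
		pthread_mutex_init(&ml.ml_sublists[i].sl_lock, NULL);
		ml.ml_sublists[i].sl_len = 0;
	}

	sublist_t *sl = sublist_lock_idx(&ml, 2);	/* by index */
	sl->sl_len++;
	sublist_unlock(sl);

	sublist_lock(sl);				/* by pointer */
	printf("sublist len = %d\n", sl->sl_len);
	sublist_unlock(sl);
	return (0);
}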
@ -1822,7 +1822,8 @@ spa_get_slop_space(spa_t *spa)
	 * deduplicated data, so since it's not useful to reserve more
	 * space with more deduplicated data, we subtract that out here.
	 */
	space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
	space =
	    spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
	slop = MIN(space >> spa_slop_shift, spa_max_slop);

	/*
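The reserve math is unchanged; only its input shrinks, since space "saved" by the BRT (like dedup before it) should not inflate the slop reserve. A runnable model of the calculation with illustrative numbers; the shift and cap values are stand-ins for `spa_slop_shift` and `spa_max_slop`, not values taken from this diff:

/*
 * Runnable model of the updated slop computation: deduplicated and
 * block-cloned (BRT) space no longer inflates the reserve.
 */
#include <stdint.h>
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t dspace = 100ULL << 30;		/* 100 GiB pool dspace */
	uint64_t dedup_dspace = 10ULL << 30;	/* dedup-inflated space */
	uint64_t brt_dspace = 5ULL << 30;	/* clone-inflated space */
	int slop_shift = 5;			/* reserve ~1/32 of space */
	uint64_t max_slop = 128ULL << 30;	/* cap on the reserve */

	uint64_t space = dspace - dedup_dspace - brt_dspace;
	uint64_t slop = MIN(space >> slop_shift, max_slop);

	printf("slop reserve: %llu bytes (~%.2f GiB)\n",
	    (unsigned long long)slop, slop / (double)(1ULL << 30));
	return (0);
}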
@ -58,6 +58,26 @@
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>

/*
 * Enable the experimental block cloning feature. If this setting is 0, then
 * even if feature@block_cloning is enabled, attempts to clone blocks will act
 * as though the feature is disabled.
 */
int zfs_bclone_enabled = 0;

/*
 * When set zfs_clone_range() waits for dirty data to be written to disk.
 * This allows the clone operation to reliably succeed when a file is modified
 * and then immediately cloned. For small files this may be slower than making
 * a copy of the file and is therefore not the default. However, in certain
 * scenarios this behavior may be desirable so a tunable is provided.
 */
static int zfs_bclone_wait_dirty = 0;

/*
 * Maximum bytes to read per chunk in zfs_read().
 */
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;

static ulong_t zfs_fsync_sync_cnt = 4;
@ -110,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp, 0, file_sz - 1))
		zn_flush_cached_data(zp, B_FALSE);
		zn_flush_cached_data(zp, B_TRUE);

	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);

@ -189,8 +209,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
	return (error);
}

static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
@ -1055,6 +1073,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;
	uint64_t last_synced_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;

@ -1174,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
		}
	}

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
		zn_flush_cached_data(inzp, B_TRUE);

	/*
	 * Maintain predictable lock order.
	 */
@ -1293,15 +1316,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
		}

		nbps = maxblocks;
		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
		    &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, error will be
			 * EAGAIN here, which we can just return to the caller
			 * so it can fallback if it likes.
			 * in the current transaction group, the error will be
			 * EAGAIN here. Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fallback, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_synced(dmu_objset_pool(inos),
				    last_synced_txg + 1);
				continue;
			}

			break;
		}
@ -1523,3 +1554,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);

ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
	"Enable block cloning");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
	"Wait for dirty blocks when cloning");
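With `zfs_bclone_wait_dirty=0` (the default registered above), cloning freshly written data can fail and callers are expected to fall back to an ordinary copy. A hedged Linux-only sketch of that caller-side pattern using the generic FICLONE ioctl; this is illustrative userspace code, not part of this commit:

/*
 * Try to clone a file with FICLONE and fall back to a byte copy if
 * the filesystem declines (e.g. dirty blocks on ZFS with
 * zfs_bclone_wait_dirty=0, or an unsupported filesystem).
 */
#include <fcntl.h>
#include <linux/fs.h>		/* FICLONE */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}
	int src = open(argv[1], O_RDONLY);
	int dst = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return (1);
	}

	if (ioctl(dst, FICLONE, src) == 0) {
		printf("cloned\n");
	} else {
		/* EINVAL/EOPNOTSUPP/EXDEV etc.: fall back to copying. */
		perror("FICLONE (falling back to copy)");
		char buf[1 << 16];
		ssize_t n;
		while ((n = read(src, buf, sizeof (buf))) > 0) {
			if (write(dst, buf, n) != n) {
				perror("write");
				return (1);
			}
		}
	}
	close(src);
	close(dst);
	return (0);
}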
@ -535,7 +535,8 @@ tags = ['functional', 'cli_root', 'zpool_split']
tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
    'zpool_status_003_pos', 'zpool_status_004_pos',
    'zpool_status_005_pos', 'zpool_status_006_pos',
    'zpool_status_007_pos', 'zpool_status_features_001_pos']
    'zpool_status_007_pos', 'zpool_status_008_pos',
    'zpool_status_features_001_pos']
tags = ['functional', 'cli_root', 'zpool_status']

[tests/functional/cli_root/zpool_sync]

@ -630,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
tags = ['functional', 'compression']

[tests/functional/cp_files]
tests = ['cp_files_001_pos', 'cp_stress']
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
tags = ['functional', 'cp_files']

[tests/functional/crtime]
@ -138,7 +138,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+'
# copy_file_range() is not supported by all kernels
#
cfr_reason = 'Kernel copy_file_range support required'
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'

if sys.platform.startswith('freebsd'):
    cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+'
else:
    cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'

#
# These tests are known to fail, thus we use this list to prevent these

@ -176,6 +180,7 @@ if sys.platform.startswith('freebsd'):
    'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
    'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
    'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
    'cp_files/cp_files_002_pos': ['SKIP', na_reason],
    'link_count/link_count_001': ['SKIP', na_reason],
    'casenorm/mixed_create_failure': ['FAIL', 13215],
    'mmap/mmap_sync_001_pos': ['SKIP', na_reason],

@ -267,6 +272,22 @@ if sys.platform.startswith('freebsd'):
    'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
    'resilver/resilver_restart_001': ['FAIL', known_reason],
    'snapshot/snapshot_002_pos': ['FAIL', '14831'],
    'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
    'bclone/bclone_crossfs_corner_cases_limited':
        ['SKIP', cfr_cross_reason],
    'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason],
    'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason],
    'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason],
    'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason],
    'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason],
    'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason],
    'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason],
    'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason],
    'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason],
    'block_cloning/block_cloning_cross_enc_dataset':
        ['SKIP', cfr_cross_reason],
    'block_cloning/block_cloning_copyfilerange_cross_dataset':
        ['SKIP', cfr_cross_reason]
})
elif sys.platform.startswith('linux'):
    maybe.update({

@ -312,6 +333,7 @@ elif sys.platform.startswith('linux'):
        ['SKIP', cfr_reason],
    'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
    'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
    'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
    'fault/auto_online_002_pos': ['FAIL', 11889],
    'fault/auto_replace_001_pos': ['FAIL', 14851],
    'fault/auto_spare_002_pos': ['FAIL', 11889],
@ -61,13 +61,8 @@ function compare_version_gte
	[ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ]
}

# Linux kernel version comparison function
#
# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version
#
# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ]
#
function linux_version
# Helper function used by linux_version() and freebsd_version()
function kernel_version
{
	typeset ver="$1"

@ -83,6 +78,24 @@ function linux_version
	echo $((version * 100000 + major * 1000 + minor))
}

# Linux kernel version comparison function
#
# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version
#
# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ]
function linux_version {
	kernel_version "$1"
}

# FreeBSD version comparison function
#
# $1 FreeBSD version ("13.2", "14.0") or blank for installed FreeBSD version
#
# Used for comparison: if [ $(freebsd_version) -ge $(freebsd_version "13.2") ]
function freebsd_version {
	kernel_version "$1"
}

# Determine if this is a Linux test system
#
# Return 0 if platform Linux, 1 if otherwise
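`kernel_version` packs a dotted version string into one comparable integer: "X.Y.Z" becomes X*100000 + Y*1000 + Z, so "5.3" encodes to 503000 and versions compare with plain integer operators. A small C model of the same encoding:

/*
 * C model of the kernel_version encoding used by the test suite:
 * "X.Y.Z" -> X*100000 + Y*1000 + Z, so versions compare as integers.
 */
#include <stdio.h>

static long
encode_version(const char *s)
{
	int x = 0, y = 0, z = 0;	/* missing components default to 0 */
	sscanf(s, "%d.%d.%d", &x, &y, &z);
	return (x * 100000L + y * 1000L + z);
}

int
main(void)
{
	printf("5.3    -> %ld\n", encode_version("5.3"));	/* 503000 */
	printf("4.10   -> %ld\n", encode_version("4.10"));	/* 410000 */
	printf("2.6.32 -> %ld\n", encode_version("2.6.32"));	/* 206032 */
	return (0);
}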
@ -90,7 +90,8 @@ VOL_INHIBIT_DEV       UNSUPPORTED        zvol_inhibit_dev
VOL_MODE              vol.mode           zvol_volmode
VOL_RECURSIVE         vol.recursive      UNSUPPORTED
VOL_USE_BLK_MQ        UNSUPPORTED        zvol_use_blk_mq
BCLONE_ENABLED        zfs_bclone_enabled zfs_bclone_enabled
BCLONE_ENABLED        bclone_enabled     zfs_bclone_enabled
BCLONE_WAIT_DIRTY     bclone_wait_dirty  zfs_bclone_wait_dirty
XATTR_COMPAT          xattr_compat       zfs_xattr_compat
ZEVENT_LEN_MAX        zevent.len_max     zfs_zevent_len_max
ZEVENT_RETAIN_MAX     zevent.retain_max  zfs_zevent_retain_max
@ -1238,6 +1238,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
	functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
	functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
	functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
	functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
	functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
	functional/cli_root/zpool_sync/cleanup.ksh \
	functional/cli_root/zpool_sync/setup.ksh \

@ -1393,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
	functional/compression/setup.ksh \
	functional/cp_files/cleanup.ksh \
	functional/cp_files/cp_files_001_pos.ksh \
	functional/cp_files/cp_files_002_pos.ksh \
	functional/cp_files/cp_stress.ksh \
	functional/cp_files/setup.ksh \
	functional/crtime/cleanup.ksh \
@ -42,6 +42,12 @@ function verify_crossfs_block_cloning
	if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
		log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
	fi

	# Cross dataset block cloning only supported on FreeBSD 14+
	# https://github.com/freebsd/freebsd-src/commit/969071be938c
	if is_freebsd && [ $(freebsd_version) -lt $(freebsd_version 14.0) ] ; then
		log_unsupported "Cloning across datasets not supported in $(uname -r)"
	fi
}

# Unused.
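The gating above reflects when each kernel learned cross-filesystem copy_file_range(2): Linux 5.3 and FreeBSD 14. A hedged userspace probe for the capability; older kernels are expected to reject the cross-filesystem case with EXDEV (illustrative code, not from this commit):

/*
 * Capability probe: attempt a cross-filesystem copy_file_range(2).
 * Pass two paths on different filesystems; on kernels without
 * cross-fs support the call fails with EXDEV.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}
	int src = open(argv[1], O_RDONLY);
	int dst = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return (1);
	}
	ssize_t n = copy_file_range(src, NULL, dst, NULL, 1 << 20, 0);
	if (n < 0 && errno == EXDEV)
		printf("cross-fs copy_file_range unsupported (EXDEV)\n");
	else if (n < 0)
		perror("copy_file_range");
	else
		printf("copied %zd bytes\n", n);
	close(src);
	close(dst);
	return (0);
}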
@ -26,12 +26,11 @@

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib

verify_runnable "global"

if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
	log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
fi
verify_crossfs_block_cloning

claim="The copy_file_range syscall can clone across datasets."

@ -26,12 +26,11 @@

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib

verify_runnable "global"

if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
	log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
fi
verify_crossfs_block_cloning

claim="Block cloning across encrypted datasets."
@ -51,7 +51,7 @@ else
fi

set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \
    "-vx $testpool"
    "-vx $testpool" "-e $testpool" "-es $testpool"

log_assert "Executing 'zpool status' with correct options succeeds"

@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do
	(( i = i + 1 ))
done

cleanup

log_pass "'zpool status' with correct options succeeded"
@ -37,6 +37,7 @@
# 3. Read the file
# 4. Take a snapshot and make a clone
# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v'
#    and 'zpool status -ev'

function cleanup
{

@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2
log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'"
log_mustnot eval "zpool status -v | grep '$TESTFS1'"

log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone"
@ -0,0 +1,104 @@
#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib

#
# DESCRIPTION:
#	Verify 'zpool status -e' only shows unhealthy devices.
#
# STRATEGY:
#	1. Create zpool
#	2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
#	3. Verify vdevs are reported correctly with -e and -s
#	4. Verify parents are reported as DEGRADED
#	5. Verify healthy children are not reported
#

function cleanup
{
	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
	zinject -c all
	poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
	log_must rm -f $all_vdevs
}

log_assert "Verify 'zpool status -e'"

log_onexit cleanup

all_vdevs=$(echo $TESTDIR/vdev{1..6})
log_must mkdir -p $TESTDIR
log_must truncate -s $MINVDEVSIZE $all_vdevs

OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)

for raid_type in "draid2:3d:6c:1s" "raidz2"; do

	log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs

	# Check DEGRADED vdevs are shown.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
	log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"

	# Check FAULTED vdevs are shown.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
	log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"

	# Check no ONLINE vdevs are shown
	log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"

	# Check no ONLINE slow vdevs are shown. Then mark IOs greater than
	# 10ms slow, delay IOs 20ms to vdev6, check slow IOs.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"

	log_must set_tunable64 ZIO_SLOW_IO_MS 10
	log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
	log_must mkfile 1048576 /$TESTPOOL2/testfile
	sync_pool $TESTPOOL2
	log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO

	# Check vdev6 slow IOs are only shown when requested with -s.
	log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
	log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"

	# Pool level and top-vdev level status must be DEGRADED.
	log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED"
	log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED"

	# Check that healthy vdevs[1-3] aren't shown with -e.
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE"
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE"
	log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE"
	log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE"

	log_must zinject -c all
	log_must zpool status -es $TESTPOOL2

	zpool destroy $TESTPOOL2
done

log_pass "Verify zpool status -e shows only unhealthy vdevs"
@ -32,3 +32,7 @@
. $STF_SUITE/include/libtest.shlib

default_cleanup

if tunable_exists BCLONE_ENABLED ; then
	log_must restore_tunable BCLONE_ENABLED
fi
@ -0,0 +1,161 @@
#! /bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib

#
# DESCRIPTION:
#	Verify all cp --reflink modes work with modified file.
#
# STRATEGY:
#	1. Verify "cp --reflink=never|auto|always" behaves as expected.
#	   Two different modes of operation are tested.
#
#	   a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with
#	      EINVAL when there are dirty blocks which cannot be immediately
#	      cloned. This is the default behavior.
#
#	   b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for
#	      dirty blocks to be written to disk allowing the clone to succeed.
#	      The downside to this is it may be slow which depending on the
#	      situation may defeat the point of making a clone.
#

verify_runnable "global"
verify_block_cloning

if ! is_linux; then
	log_unsupported "cp --reflink is a GNU coreutils option"
fi

function cleanup
{
	datasetexists $TESTPOOL/cp-reflink && \
	    destroy_dataset $TESTPOOL/cp-reflink -f
	log_must set_tunable32 BCLONE_WAIT_DIRTY 0
}

function verify_copy
{
	src_cksum=$(sha256digest $1)
	dst_cksum=$(sha256digest $2)

	if [[ "$src_cksum" != "$dst_cksum" ]]; then
		log_must ls -l $CP_TESTDIR
		log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
	fi
}

log_assert "Verify all cp --reflink modes work with modified file"

log_onexit cleanup

SRC_FILE=src.data
DST_FILE=dst.data
SRC_SIZE=$(($RANDOM % 2048))

# A smaller recordsize is used merely to speed up the test.
RECORDSIZE=4096

log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)

log_must cd $CP_TESTDIR

# Never wait on dirty blocks (zfs_bclone_wait_dirty=0)
log_must set_tunable32 BCLONE_WAIT_DIRTY 0

for mode in "never" "auto" "always"; do
	log_note "Checking 'cp --reflink=$mode'"

	# Create a new file and immediately copy it.
	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE

	if [[ "$mode" == "always" ]]; then
		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
		log_must ls -l $CP_TESTDIR
	else
		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
		verify_copy $SRC_FILE $DST_FILE
	fi
	log_must rm -f $DST_FILE

	# Append to an existing file and immediately copy it.
	sync_pool $TESTPOOL
	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
	    count=1 conv=notrunc
	if [[ "$mode" == "always" ]]; then
		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
		log_must ls -l $CP_TESTDIR
	else
		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
		verify_copy $SRC_FILE $DST_FILE
	fi
	log_must rm -f $DST_FILE

	# Overwrite a random range of an existing file and immediately copy it.
	sync_pool $TESTPOOL
	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
	    seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
	if [[ "$mode" == "always" ]]; then
		log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
		log_must ls -l $CP_TESTDIR
	else
		log_must cp --reflink=$mode $SRC_FILE $DST_FILE
		verify_copy $SRC_FILE $DST_FILE
	fi
	log_must rm -f $SRC_FILE $DST_FILE
done

# Wait on dirty blocks (zfs_bclone_wait_dirty=1)
log_must set_tunable32 BCLONE_WAIT_DIRTY 1

for mode in "never" "auto" "always"; do
	log_note "Checking 'cp --reflink=$mode'"

	# Create a new file and immediately copy it.
	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
	verify_copy $SRC_FILE $DST_FILE
	log_must rm -f $DST_FILE

	# Append to an existing file and immediately copy it.
	log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
	    count=1 conv=notrunc
	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
	verify_copy $SRC_FILE $DST_FILE
	log_must rm -f $DST_FILE

	# Overwrite a random range of an existing file and immediately copy it.
	log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
	    seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
	log_must cp --reflink=$mode $SRC_FILE $DST_FILE
	verify_copy $SRC_FILE $DST_FILE
	log_must rm -f $SRC_FILE $DST_FILE
done

log_pass
@ -32,4 +32,10 @@
. $STF_SUITE/include/libtest.shlib

DISK=${DISKS%% *}

if tunable_exists BCLONE_ENABLED ; then
	log_must save_tunable BCLONE_ENABLED
	log_must set_tunable32 BCLONE_ENABLED 1
fi

default_setup $DISK