Merge pull request #215 from truenas/truenas/zfs-2.2.4-staging
Sync with upstream zfs-2.2-4-staging
This commit is contained in:
commit
e9b06fd7ae
|
@ -793,18 +793,27 @@ def section_dmu(kstats_dict):
|
||||||
|
|
||||||
zfetch_stats = isolate_section('zfetchstats', kstats_dict)
|
zfetch_stats = isolate_section('zfetchstats', kstats_dict)
|
||||||
|
|
||||||
zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
|
zfetch_access_total = int(zfetch_stats['hits']) +\
|
||||||
|
int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\
|
||||||
|
int(zfetch_stats['past']) + int(zfetch_stats['misses'])
|
||||||
|
|
||||||
prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
|
prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total))
|
||||||
prt_i2('Stream hits:',
|
prt_i2('Stream hits:',
|
||||||
f_perc(zfetch_stats['hits'], zfetch_access_total),
|
f_perc(zfetch_stats['hits'], zfetch_access_total),
|
||||||
f_hits(zfetch_stats['hits']))
|
f_hits(zfetch_stats['hits']))
|
||||||
|
future = int(zfetch_stats['future']) + int(zfetch_stats['stride'])
|
||||||
|
prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total),
|
||||||
|
f_hits(future))
|
||||||
|
prt_i2('Hits behind stream:',
|
||||||
|
f_perc(zfetch_stats['past'], zfetch_access_total),
|
||||||
|
f_hits(zfetch_stats['past']))
|
||||||
prt_i2('Stream misses:',
|
prt_i2('Stream misses:',
|
||||||
f_perc(zfetch_stats['misses'], zfetch_access_total),
|
f_perc(zfetch_stats['misses'], zfetch_access_total),
|
||||||
f_hits(zfetch_stats['misses']))
|
f_hits(zfetch_stats['misses']))
|
||||||
prt_i2('Streams limit reached:',
|
prt_i2('Streams limit reached:',
|
||||||
f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
|
f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']),
|
||||||
f_hits(zfetch_stats['max_streams']))
|
f_hits(zfetch_stats['max_streams']))
|
||||||
|
prt_i1('Stream strides:', f_hits(zfetch_stats['stride']))
|
||||||
prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
|
prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued']))
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
|
@ -8041,6 +8041,17 @@ dump_mos_leaks(spa_t *spa)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (spa->spa_brt != NULL) {
|
||||||
|
brt_t *brt = spa->spa_brt;
|
||||||
|
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
|
||||||
|
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
|
||||||
|
if (brtvd != NULL && brtvd->bv_initiated) {
|
||||||
|
mos_obj_refd(brtvd->bv_mos_brtvdev);
|
||||||
|
mos_obj_refd(brtvd->bv_mos_entries);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Visit all allocated objects and make sure they are referenced.
|
* Visit all allocated objects and make sure they are referenced.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
#
|
#
|
||||||
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
|
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
|
||||||
# as they flip between FAULTED and ONLINE. If
|
# as they flip between FAULTED and ONLINE. If
|
||||||
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
|
# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
|
||||||
# FAULTED, then power down the slot via sysfs:
|
# FAULTED, then power down the slot via sysfs:
|
||||||
#
|
#
|
||||||
# /sys/class/enclosure/<enclosure>/<slot>/power_status
|
# /sys/class/enclosure/<enclosure>/<slot>/power_status
|
||||||
|
@ -19,7 +19,7 @@
|
||||||
# Exit codes:
|
# Exit codes:
|
||||||
# 0: slot successfully powered off
|
# 0: slot successfully powered off
|
||||||
# 1: enclosure not available
|
# 1: enclosure not available
|
||||||
# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
|
# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled
|
||||||
# 3: vdev was not FAULTED
|
# 3: vdev was not FAULTED
|
||||||
# 4: The enclosure sysfs path passed from ZFS does not exist
|
# 4: The enclosure sysfs path passed from ZFS does not exist
|
||||||
# 5: Enclosure slot didn't actually turn off after we told it to
|
# 5: Enclosure slot didn't actually turn off after we told it to
|
||||||
|
@ -32,7 +32,7 @@ if [ ! -d /sys/class/enclosure ] ; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
|
if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then
|
||||||
exit 2
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
|
@ -205,6 +205,10 @@ zed_notify()
|
||||||
[ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
|
[ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
|
||||||
[ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
|
[ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
|
||||||
|
|
||||||
|
zed_notify_ntfy "${subject}" "${pathname}"; rv=$?
|
||||||
|
[ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
|
||||||
|
[ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
|
||||||
|
|
||||||
[ "${num_success}" -gt 0 ] && return 0
|
[ "${num_success}" -gt 0 ] && return 0
|
||||||
[ "${num_failure}" -gt 0 ] && return 1
|
[ "${num_failure}" -gt 0 ] && return 1
|
||||||
return 2
|
return 2
|
||||||
|
@ -527,6 +531,100 @@ zed_notify_pushover()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# zed_notify_ntfy (subject, pathname)
|
||||||
|
#
|
||||||
|
# Send a notification via Ntfy.sh <https://ntfy.sh/>.
|
||||||
|
# The ntfy topic (ZED_NTFY_TOPIC) identifies the topic that the notification
|
||||||
|
# will be sent to Ntfy.sh server. The ntfy url (ZED_NTFY_URL) defines the
|
||||||
|
# self-hosted or provided hosted ntfy service location. The ntfy access token
|
||||||
|
# <https://docs.ntfy.sh/publish/#access-tokens> (ZED_NTFY_ACCESS_TOKEN) reprsents an
|
||||||
|
# access token that could be used if a topic is read/write protected. If a
|
||||||
|
# topic can be written to publicaly, a ZED_NTFY_ACCESS_TOKEN is not required.
|
||||||
|
#
|
||||||
|
# Requires curl and sed executables to be installed in the standard PATH.
|
||||||
|
#
|
||||||
|
# References
|
||||||
|
# https://docs.ntfy.sh
|
||||||
|
#
|
||||||
|
# Arguments
|
||||||
|
# subject: notification subject
|
||||||
|
# pathname: pathname containing the notification message (OPTIONAL)
|
||||||
|
#
|
||||||
|
# Globals
|
||||||
|
# ZED_NTFY_TOPIC
|
||||||
|
# ZED_NTFY_ACCESS_TOKEN (OPTIONAL)
|
||||||
|
# ZED_NTFY_URL
|
||||||
|
#
|
||||||
|
# Return
|
||||||
|
# 0: notification sent
|
||||||
|
# 1: notification failed
|
||||||
|
# 2: not configured
|
||||||
|
#
|
||||||
|
zed_notify_ntfy()
|
||||||
|
{
|
||||||
|
local subject="$1"
|
||||||
|
local pathname="${2:-"/dev/null"}"
|
||||||
|
local msg_body
|
||||||
|
local msg_out
|
||||||
|
local msg_err
|
||||||
|
|
||||||
|
[ -n "${ZED_NTFY_TOPIC}" ] || return 2
|
||||||
|
local url="${ZED_NTFY_URL:-"https://ntfy.sh"}/${ZED_NTFY_TOPIC}"
|
||||||
|
|
||||||
|
if [ ! -r "${pathname}" ]; then
|
||||||
|
zed_log_err "ntfy cannot read \"${pathname}\""
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
zed_check_cmd "curl" "sed" || return 1
|
||||||
|
|
||||||
|
# Read the message body in.
|
||||||
|
#
|
||||||
|
msg_body="$(cat "${pathname}")"
|
||||||
|
|
||||||
|
if [ -z "${msg_body}" ]
|
||||||
|
then
|
||||||
|
msg_body=$subject
|
||||||
|
subject=""
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Send the POST request and check for errors.
|
||||||
|
#
|
||||||
|
if [ -n "${ZED_NTFY_ACCESS_TOKEN}" ]; then
|
||||||
|
msg_out="$( \
|
||||||
|
curl \
|
||||||
|
-u ":${ZED_NTFY_ACCESS_TOKEN}" \
|
||||||
|
-H "Title: ${subject}" \
|
||||||
|
-d "${msg_body}" \
|
||||||
|
-H "Priority: high" \
|
||||||
|
"${url}" \
|
||||||
|
2>/dev/null \
|
||||||
|
)"; rv=$?
|
||||||
|
else
|
||||||
|
msg_out="$( \
|
||||||
|
curl \
|
||||||
|
-H "Title: ${subject}" \
|
||||||
|
-d "${msg_body}" \
|
||||||
|
-H "Priority: high" \
|
||||||
|
"${url}" \
|
||||||
|
2>/dev/null \
|
||||||
|
)"; rv=$?
|
||||||
|
fi
|
||||||
|
if [ "${rv}" -ne 0 ]; then
|
||||||
|
zed_log_err "curl exit=${rv}"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
msg_err="$(echo "${msg_out}" \
|
||||||
|
| sed -n -e 's/.*"errors" *:.*\[\(.*\)\].*/\1/p')"
|
||||||
|
if [ -n "${msg_err}" ]; then
|
||||||
|
zed_log_err "ntfy \"${msg_err}"\"
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# zed_rate_limit (tag, [interval])
|
# zed_rate_limit (tag, [interval])
|
||||||
#
|
#
|
||||||
# Check whether an event of a given type [tag] has already occurred within the
|
# Check whether an event of a given type [tag] has already occurred within the
|
||||||
|
|
|
@ -146,4 +146,26 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
|
||||||
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
|
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
|
||||||
# help silence misbehaving drives. This assumes your drive enclosure fully
|
# help silence misbehaving drives. This assumes your drive enclosure fully
|
||||||
# supports slot power control via sysfs.
|
# supports slot power control via sysfs.
|
||||||
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1
|
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1
|
||||||
|
|
||||||
|
##
|
||||||
|
# Ntfy topic
|
||||||
|
# This defines which topic will receive the ntfy notification.
|
||||||
|
# <https://docs.ntfy.sh/publish/>
|
||||||
|
# Disabled by default; uncomment to enable.
|
||||||
|
#ZED_NTFY_TOPIC=""
|
||||||
|
|
||||||
|
##
|
||||||
|
# Ntfy access token (optional for public topics)
|
||||||
|
# This defines an access token which can be used
|
||||||
|
# to allow you to authenticate when sending to topics
|
||||||
|
# <https://docs.ntfy.sh/publish/#access-tokens>
|
||||||
|
# Disabled by default; uncomment to enable.
|
||||||
|
#ZED_NTFY_ACCESS_TOKEN=""
|
||||||
|
|
||||||
|
##
|
||||||
|
# Ntfy Service URL
|
||||||
|
# This defines which service the ntfy call will be directed toward
|
||||||
|
# <https://docs.ntfy.sh/install/>
|
||||||
|
# https://ntfy.sh by default; uncomment to enable an alternative service url.
|
||||||
|
#ZED_NTFY_URL="https://ntfy.sh"
|
||||||
|
|
|
@ -3672,15 +3672,25 @@ zfs_do_list(int argc, char **argv)
|
||||||
|
|
||||||
for (char *tok; (tok = strsep(&optarg, ",")); ) {
|
for (char *tok; (tok = strsep(&optarg, ",")); ) {
|
||||||
static const char *const type_subopts[] = {
|
static const char *const type_subopts[] = {
|
||||||
"filesystem", "volume",
|
"filesystem",
|
||||||
"snapshot", "snap",
|
"fs",
|
||||||
|
"volume",
|
||||||
|
"vol",
|
||||||
|
"snapshot",
|
||||||
|
"snap",
|
||||||
"bookmark",
|
"bookmark",
|
||||||
"all" };
|
"all"
|
||||||
|
};
|
||||||
static const int type_types[] = {
|
static const int type_types[] = {
|
||||||
ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
|
ZFS_TYPE_FILESYSTEM,
|
||||||
ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT,
|
ZFS_TYPE_FILESYSTEM,
|
||||||
|
ZFS_TYPE_VOLUME,
|
||||||
|
ZFS_TYPE_VOLUME,
|
||||||
|
ZFS_TYPE_SNAPSHOT,
|
||||||
|
ZFS_TYPE_SNAPSHOT,
|
||||||
ZFS_TYPE_BOOKMARK,
|
ZFS_TYPE_BOOKMARK,
|
||||||
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK };
|
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK
|
||||||
|
};
|
||||||
|
|
||||||
for (c = 0; c < ARRAY_SIZE(type_subopts); ++c)
|
for (c = 0; c < ARRAY_SIZE(type_subopts); ++c)
|
||||||
if (strcmp(tok, type_subopts[c]) == 0) {
|
if (strcmp(tok, type_subopts[c]) == 0) {
|
||||||
|
|
|
@ -2161,6 +2161,7 @@ typedef struct status_cbdata {
|
||||||
boolean_t cb_explain;
|
boolean_t cb_explain;
|
||||||
boolean_t cb_first;
|
boolean_t cb_first;
|
||||||
boolean_t cb_dedup_stats;
|
boolean_t cb_dedup_stats;
|
||||||
|
boolean_t cb_print_unhealthy;
|
||||||
boolean_t cb_print_status;
|
boolean_t cb_print_status;
|
||||||
boolean_t cb_print_slow_ios;
|
boolean_t cb_print_slow_ios;
|
||||||
boolean_t cb_print_vdev_init;
|
boolean_t cb_print_vdev_init;
|
||||||
|
@ -2357,6 +2358,35 @@ health_str_to_color(const char *health)
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Called for each leaf vdev. Returns 0 if the vdev is healthy.
|
||||||
|
* A vdev is unhealthy if any of the following are true:
|
||||||
|
* 1) there are read, write, or checksum errors,
|
||||||
|
* 2) its state is not ONLINE, or
|
||||||
|
* 3) slow IO reporting was requested (-s) and there are slow IOs.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data)
|
||||||
|
{
|
||||||
|
status_cbdata_t *cb = data;
|
||||||
|
vdev_stat_t *vs;
|
||||||
|
uint_t vsc;
|
||||||
|
(void) hdl_data;
|
||||||
|
|
||||||
|
if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
|
||||||
|
(uint64_t **)&vs, &vsc) != 0)
|
||||||
|
return (1);
|
||||||
|
|
||||||
|
if (vs->vs_checksum_errors || vs->vs_read_errors ||
|
||||||
|
vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY)
|
||||||
|
return (1);
|
||||||
|
|
||||||
|
if (cb->cb_print_slow_ios && vs->vs_slow_ios)
|
||||||
|
return (1);
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Print out configuration state as requested by status_callback.
|
* Print out configuration state as requested by status_callback.
|
||||||
*/
|
*/
|
||||||
|
@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||||
const char *state;
|
const char *state;
|
||||||
const char *type;
|
const char *type;
|
||||||
const char *path = NULL;
|
const char *path = NULL;
|
||||||
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
|
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL,
|
||||||
|
*scolor = NULL;
|
||||||
|
|
||||||
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
|
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
|
||||||
&child, &children) != 0)
|
&child, &children) != 0)
|
||||||
|
@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||||
state = gettext("AVAIL");
|
state = gettext("AVAIL");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If '-e' is specified then top-level vdevs and their children
|
||||||
|
* can be pruned if all of their leaves are healthy.
|
||||||
|
*/
|
||||||
|
if (cb->cb_print_unhealthy && depth > 0 &&
|
||||||
|
for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
printf_color(health_str_to_color(state),
|
printf_color(health_str_to_color(state),
|
||||||
"\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
|
"\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
|
||||||
name, state);
|
name, state);
|
||||||
|
@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||||
if (vs->vs_checksum_errors)
|
if (vs->vs_checksum_errors)
|
||||||
ccolor = ANSI_RED;
|
ccolor = ANSI_RED;
|
||||||
|
|
||||||
|
if (vs->vs_slow_ios)
|
||||||
|
scolor = ANSI_BLUE;
|
||||||
|
|
||||||
if (cb->cb_literal) {
|
if (cb->cb_literal) {
|
||||||
fputc(' ', stdout);
|
fputc(' ', stdout);
|
||||||
printf_color(rcolor, "%5llu",
|
printf_color(rcolor, "%5llu",
|
||||||
|
@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cb->cb_literal)
|
if (cb->cb_literal)
|
||||||
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
|
printf_color(scolor, " %5llu",
|
||||||
|
(u_longlong_t)vs->vs_slow_ios);
|
||||||
else
|
else
|
||||||
printf(" %5s", rbuf);
|
printf_color(scolor, " %5s", rbuf);
|
||||||
}
|
}
|
||||||
if (cb->cb_print_power) {
|
if (cb->cb_print_power) {
|
||||||
if (children == 0) {
|
if (children == 0) {
|
||||||
|
@ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data)
|
||||||
(void) printf(gettext(
|
(void) printf(gettext(
|
||||||
"errors: No known data errors\n"));
|
"errors: No known data errors\n"));
|
||||||
} else if (!cbp->cb_verbose) {
|
} else if (!cbp->cb_verbose) {
|
||||||
|
color_start(ANSI_RED);
|
||||||
(void) printf(gettext("errors: %llu data "
|
(void) printf(gettext("errors: %llu data "
|
||||||
"errors, use '-v' for a list\n"),
|
"errors, use '-v' for a list\n"),
|
||||||
(u_longlong_t)nerr);
|
(u_longlong_t)nerr);
|
||||||
|
color_end();
|
||||||
} else {
|
} else {
|
||||||
print_error_log(zhp);
|
print_error_log(zhp);
|
||||||
}
|
}
|
||||||
|
@ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data)
|
||||||
* [pool] [interval [count]]
|
* [pool] [interval [count]]
|
||||||
*
|
*
|
||||||
* -c CMD For each vdev, run command CMD
|
* -c CMD For each vdev, run command CMD
|
||||||
|
* -e Display only unhealthy vdevs
|
||||||
* -i Display vdev initialization status.
|
* -i Display vdev initialization status.
|
||||||
* -g Display guid for individual vdev name.
|
* -g Display guid for individual vdev name.
|
||||||
* -L Follow links when resolving vdev path name.
|
* -L Follow links when resolving vdev path name.
|
||||||
|
@ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv)
|
||||||
};
|
};
|
||||||
|
|
||||||
/* check options */
|
/* check options */
|
||||||
while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options,
|
while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
|
||||||
NULL)) != -1) {
|
NULL)) != -1) {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'c':
|
case 'c':
|
||||||
|
@ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv)
|
||||||
}
|
}
|
||||||
cmd = optarg;
|
cmd = optarg;
|
||||||
break;
|
break;
|
||||||
|
case 'e':
|
||||||
|
cb.cb_print_unhealthy = B_TRUE;
|
||||||
|
break;
|
||||||
case 'i':
|
case 'i':
|
||||||
cb.cb_print_vdev_init = B_TRUE;
|
cb.cb_print_vdev_init = B_TRUE;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
|
||||||
|
ZFS_LINUX_TEST_SRC([page_size], [
|
||||||
|
#include <linux/mm.h>
|
||||||
|
],[
|
||||||
|
unsigned long s;
|
||||||
|
s = page_size(NULL);
|
||||||
|
])
|
||||||
|
])
|
||||||
|
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
|
||||||
|
AC_MSG_CHECKING([whether page_size() is available])
|
||||||
|
ZFS_LINUX_TEST_RESULT([page_size], [
|
||||||
|
AC_MSG_RESULT(yes)
|
||||||
|
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
|
||||||
|
],[
|
||||||
|
AC_MSG_RESULT(no)
|
||||||
|
])
|
||||||
|
])
|
|
@ -16,6 +16,9 @@ dnl #
|
||||||
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
|
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
|
||||||
dnl # generic_copy_file_range() added to support it
|
dnl # generic_copy_file_range() added to support it
|
||||||
dnl #
|
dnl #
|
||||||
|
dnl # 6.8: generic_copy_file_range() removed, replaced by
|
||||||
|
dnl # splice_copy_file_range()
|
||||||
|
dnl #
|
||||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
|
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
|
||||||
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
|
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
|
@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
|
||||||
])
|
])
|
||||||
])
|
])
|
||||||
|
|
||||||
|
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
|
||||||
|
ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
|
||||||
|
#include <linux/splice.h>
|
||||||
|
], [
|
||||||
|
struct file *src_file __attribute__ ((unused)) = NULL;
|
||||||
|
loff_t src_off __attribute__ ((unused)) = 0;
|
||||||
|
struct file *dst_file __attribute__ ((unused)) = NULL;
|
||||||
|
loff_t dst_off __attribute__ ((unused)) = 0;
|
||||||
|
size_t len __attribute__ ((unused)) = 0;
|
||||||
|
splice_copy_file_range(src_file, src_off, dst_file, dst_off,
|
||||||
|
len);
|
||||||
|
])
|
||||||
|
])
|
||||||
|
AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
|
||||||
|
AC_MSG_CHECKING([whether splice_copy_file_range() is available])
|
||||||
|
ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
|
||||||
|
AC_MSG_RESULT(yes)
|
||||||
|
AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
|
||||||
|
[splice_copy_file_range() is available])
|
||||||
|
],[
|
||||||
|
AC_MSG_RESULT(no)
|
||||||
|
])
|
||||||
|
])
|
||||||
|
|
||||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
|
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
|
||||||
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
|
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
|
|
|
@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||||
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
|
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
|
||||||
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
|
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
|
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
|
||||||
|
ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
|
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
|
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
|
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
|
||||||
|
@ -166,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||||
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
|
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
|
||||||
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
|
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
|
||||||
ZFS_AC_KERNEL_SRC_SYNC_BDEV
|
ZFS_AC_KERNEL_SRC_SYNC_BDEV
|
||||||
|
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
|
||||||
case "$host_cpu" in
|
case "$host_cpu" in
|
||||||
powerpc*)
|
powerpc*)
|
||||||
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
||||||
|
@ -266,6 +268,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||||
ZFS_AC_KERNEL_VFS_IOV_ITER
|
ZFS_AC_KERNEL_VFS_IOV_ITER
|
||||||
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
|
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
|
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
|
||||||
|
ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
|
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
|
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
|
||||||
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
|
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
|
||||||
|
@ -314,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||||
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
|
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
|
||||||
ZFS_AC_KERNEL_COPY_SPLICE_READ
|
ZFS_AC_KERNEL_COPY_SPLICE_READ
|
||||||
ZFS_AC_KERNEL_SYNC_BDEV
|
ZFS_AC_KERNEL_SYNC_BDEV
|
||||||
|
ZFS_AC_KERNEL_MM_PAGE_SIZE
|
||||||
case "$host_cpu" in
|
case "$host_cpu" in
|
||||||
powerpc*)
|
powerpc*)
|
||||||
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
||||||
|
|
|
@ -286,7 +286,6 @@ typedef struct zfid_long {
|
||||||
|
|
||||||
extern uint_t zfs_fsyncer_key;
|
extern uint_t zfs_fsyncer_key;
|
||||||
extern int zfs_super_owner;
|
extern int zfs_super_owner;
|
||||||
extern int zfs_bclone_enabled;
|
|
||||||
|
|
||||||
extern void zfs_init(void);
|
extern void zfs_init(void);
|
||||||
extern void zfs_fini(void);
|
extern void zfs_fini(void);
|
||||||
|
|
|
@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
|
||||||
%D%/kernel/linux/compiler_compat.h \
|
%D%/kernel/linux/compiler_compat.h \
|
||||||
%D%/kernel/linux/dcache_compat.h \
|
%D%/kernel/linux/dcache_compat.h \
|
||||||
%D%/kernel/linux/kmap_compat.h \
|
%D%/kernel/linux/kmap_compat.h \
|
||||||
|
%D%/kernel/linux/mm_compat.h \
|
||||||
%D%/kernel/linux/mod_compat.h \
|
%D%/kernel/linux/mod_compat.h \
|
||||||
%D%/kernel/linux/page_compat.h \
|
%D%/kernel/linux/page_compat.h \
|
||||||
%D%/kernel/linux/percpu_compat.h \
|
%D%/kernel/linux/percpu_compat.h \
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
/*
|
||||||
|
* CDDL HEADER START
|
||||||
|
*
|
||||||
|
* The contents of this file are subject to the terms of the
|
||||||
|
* Common Development and Distribution License (the "License").
|
||||||
|
* You may not use this file except in compliance with the License.
|
||||||
|
*
|
||||||
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||||
|
* or https://opensource.org/licenses/CDDL-1.0.
|
||||||
|
* See the License for the specific language governing permissions
|
||||||
|
* and limitations under the License.
|
||||||
|
*
|
||||||
|
* When distributing Covered Code, include this CDDL HEADER in each
|
||||||
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||||
|
* If applicable, add the following below this CDDL HEADER, with the
|
||||||
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
||||||
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
||||||
|
*
|
||||||
|
* CDDL HEADER END
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2023, 2024, Klara Inc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _ZFS_MM_COMPAT_H
|
||||||
|
#define _ZFS_MM_COMPAT_H
|
||||||
|
|
||||||
|
#include <linux/mm.h>
|
||||||
|
|
||||||
|
/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
|
||||||
|
#ifndef HAVE_MM_PAGE_SIZE
|
||||||
|
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* _ZFS_MM_COMPAT_H */
|
|
@ -68,6 +68,7 @@ enum scope_prefix_types {
|
||||||
zfs_trim,
|
zfs_trim,
|
||||||
zfs_txg,
|
zfs_txg,
|
||||||
zfs_vdev,
|
zfs_vdev,
|
||||||
|
zfs_vdev_disk,
|
||||||
zfs_vdev_file,
|
zfs_vdev_file,
|
||||||
zfs_vdev_mirror,
|
zfs_vdev_mirror,
|
||||||
zfs_vnops,
|
zfs_vnops,
|
||||||
|
|
|
@ -45,8 +45,6 @@ extern "C" {
|
||||||
typedef struct zfsvfs zfsvfs_t;
|
typedef struct zfsvfs zfsvfs_t;
|
||||||
struct znode;
|
struct znode;
|
||||||
|
|
||||||
extern int zfs_bclone_enabled;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This structure emulates the vfs_t from other platforms. It's purpose
|
* This structure emulates the vfs_t from other platforms. It's purpose
|
||||||
* is to facilitate the handling of mount options and minimize structural
|
* is to facilitate the handling of mount options and minimize structural
|
||||||
|
|
|
@ -79,6 +79,9 @@ typedef struct abd {
|
||||||
|
|
||||||
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
|
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
|
||||||
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
|
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
|
||||||
|
#if defined(__linux__) && defined(_KERNEL)
|
||||||
|
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
|
||||||
|
#endif
|
||||||
|
|
||||||
extern int zfs_abd_scatter_enabled;
|
extern int zfs_abd_scatter_enabled;
|
||||||
|
|
||||||
|
@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
|
||||||
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
|
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
|
||||||
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
|
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
|
||||||
abd_iter_func2_t *, void *);
|
abd_iter_func2_t *, void *);
|
||||||
|
#if defined(__linux__) && defined(_KERNEL)
|
||||||
|
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
|
||||||
|
void *);
|
||||||
|
#endif
|
||||||
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
|
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
|
||||||
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
|
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
|
||||||
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
|
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
|
||||||
|
@ -213,6 +220,8 @@ void abd_fini(void);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Linux ABD bio functions
|
* Linux ABD bio functions
|
||||||
|
* Note: these are only needed to support vdev_classic. See comment in
|
||||||
|
* vdev_disk.c.
|
||||||
*/
|
*/
|
||||||
#if defined(__linux__) && defined(_KERNEL)
|
#if defined(__linux__) && defined(_KERNEL)
|
||||||
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
|
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||||||
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
|
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
|
||||||
|
* Copyright (c) 2023, 2024, Klara Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _ABD_IMPL_H
|
#ifndef _ABD_IMPL_H
|
||||||
|
@ -38,12 +39,30 @@ typedef enum abd_stats_op {
|
||||||
ABDSTAT_DECR /* Decrease abdstat values */
|
ABDSTAT_DECR /* Decrease abdstat values */
|
||||||
} abd_stats_op_t;
|
} abd_stats_op_t;
|
||||||
|
|
||||||
struct scatterlist; /* forward declaration */
|
/* forward declarations */
|
||||||
|
struct scatterlist;
|
||||||
|
struct page;
|
||||||
|
|
||||||
struct abd_iter {
|
struct abd_iter {
|
||||||
/* public interface */
|
/* public interface */
|
||||||
void *iter_mapaddr; /* addr corresponding to iter_pos */
|
union {
|
||||||
size_t iter_mapsize; /* length of data valid at mapaddr */
|
/* for abd_iter_map()/abd_iter_unmap() */
|
||||||
|
struct {
|
||||||
|
/* addr corresponding to iter_pos */
|
||||||
|
void *iter_mapaddr;
|
||||||
|
/* length of data valid at mapaddr */
|
||||||
|
size_t iter_mapsize;
|
||||||
|
};
|
||||||
|
/* for abd_iter_page() */
|
||||||
|
struct {
|
||||||
|
/* current page */
|
||||||
|
struct page *iter_page;
|
||||||
|
/* offset of data in page */
|
||||||
|
size_t iter_page_doff;
|
||||||
|
/* size of data in page */
|
||||||
|
size_t iter_page_dsize;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
/* private */
|
/* private */
|
||||||
abd_t *iter_abd; /* ABD being iterated through */
|
abd_t *iter_abd; /* ABD being iterated through */
|
||||||
|
@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
|
||||||
void abd_iter_advance(struct abd_iter *, size_t);
|
void abd_iter_advance(struct abd_iter *, size_t);
|
||||||
void abd_iter_map(struct abd_iter *);
|
void abd_iter_map(struct abd_iter *);
|
||||||
void abd_iter_unmap(struct abd_iter *);
|
void abd_iter_unmap(struct abd_iter *);
|
||||||
|
void abd_iter_page(struct abd_iter *);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Helper macros
|
* Helper macros
|
||||||
|
|
|
@ -45,18 +45,24 @@ typedef struct zfetch {
|
||||||
int zf_numstreams; /* number of zstream_t's */
|
int zf_numstreams; /* number of zstream_t's */
|
||||||
} zfetch_t;
|
} zfetch_t;
|
||||||
|
|
||||||
|
typedef struct zsrange {
|
||||||
|
uint16_t start;
|
||||||
|
uint16_t end;
|
||||||
|
} zsrange_t;
|
||||||
|
|
||||||
|
#define ZFETCH_RANGES 9 /* Fits zstream_t into 128 bytes */
|
||||||
|
|
||||||
typedef struct zstream {
|
typedef struct zstream {
|
||||||
|
list_node_t zs_node; /* link for zf_stream */
|
||||||
uint64_t zs_blkid; /* expect next access at this blkid */
|
uint64_t zs_blkid; /* expect next access at this blkid */
|
||||||
|
uint_t zs_atime; /* time last prefetch issued */
|
||||||
|
zsrange_t zs_ranges[ZFETCH_RANGES]; /* ranges from future */
|
||||||
unsigned int zs_pf_dist; /* data prefetch distance in bytes */
|
unsigned int zs_pf_dist; /* data prefetch distance in bytes */
|
||||||
unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */
|
unsigned int zs_ipf_dist; /* L1 prefetch distance in bytes */
|
||||||
uint64_t zs_pf_start; /* first data block to prefetch */
|
uint64_t zs_pf_start; /* first data block to prefetch */
|
||||||
uint64_t zs_pf_end; /* data block to prefetch up to */
|
uint64_t zs_pf_end; /* data block to prefetch up to */
|
||||||
uint64_t zs_ipf_start; /* first data block to prefetch L1 */
|
uint64_t zs_ipf_start; /* first data block to prefetch L1 */
|
||||||
uint64_t zs_ipf_end; /* data block to prefetch L1 up to */
|
uint64_t zs_ipf_end; /* data block to prefetch L1 up to */
|
||||||
|
|
||||||
list_node_t zs_node; /* link for zf_stream */
|
|
||||||
hrtime_t zs_atime; /* time last prefetch issued */
|
|
||||||
zfetch_t *zs_fetch; /* parent fetch */
|
|
||||||
boolean_t zs_missed; /* stream saw cache misses */
|
boolean_t zs_missed; /* stream saw cache misses */
|
||||||
boolean_t zs_more; /* need more distant prefetch */
|
boolean_t zs_more; /* need more distant prefetch */
|
||||||
zfs_refcount_t zs_callers; /* number of pending callers */
|
zfs_refcount_t zs_callers; /* number of pending callers */
|
||||||
|
@ -74,7 +80,7 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
|
||||||
void dmu_zfetch_fini(zfetch_t *);
|
void dmu_zfetch_fini(zfetch_t *);
|
||||||
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
|
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
|
||||||
boolean_t);
|
boolean_t);
|
||||||
void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
|
void dmu_zfetch_run(zfetch_t *, zstream_t *, boolean_t, boolean_t);
|
||||||
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
|
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
|
||||||
boolean_t);
|
boolean_t);
|
||||||
|
|
||||||
|
|
|
@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
|
||||||
unsigned int multilist_get_num_sublists(multilist_t *);
|
unsigned int multilist_get_num_sublists(multilist_t *);
|
||||||
unsigned int multilist_get_random_index(multilist_t *);
|
unsigned int multilist_get_random_index(multilist_t *);
|
||||||
|
|
||||||
multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
|
void multilist_sublist_lock(multilist_sublist_t *);
|
||||||
|
multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
|
||||||
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
|
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
|
||||||
void multilist_sublist_unlock(multilist_sublist_t *);
|
void multilist_sublist_unlock(multilist_sublist_t *);
|
||||||
|
|
||||||
void multilist_sublist_insert_head(multilist_sublist_t *, void *);
|
void multilist_sublist_insert_head(multilist_sublist_t *, void *);
|
||||||
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
|
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
|
||||||
|
void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
|
||||||
|
void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
|
||||||
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
|
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
|
||||||
void multilist_sublist_remove(multilist_sublist_t *, void *);
|
void multilist_sublist_remove(multilist_sublist_t *, void *);
|
||||||
int multilist_sublist_is_empty(multilist_sublist_t *);
|
int multilist_sublist_is_empty(multilist_sublist_t *);
|
||||||
|
|
|
@ -24,8 +24,11 @@
|
||||||
|
|
||||||
#ifndef _SYS_FS_ZFS_VNOPS_H
|
#ifndef _SYS_FS_ZFS_VNOPS_H
|
||||||
#define _SYS_FS_ZFS_VNOPS_H
|
#define _SYS_FS_ZFS_VNOPS_H
|
||||||
|
|
||||||
#include <sys/zfs_vnops_os.h>
|
#include <sys/zfs_vnops_os.h>
|
||||||
|
|
||||||
|
extern int zfs_bclone_enabled;
|
||||||
|
|
||||||
extern int zfs_fsync(znode_t *, int, cred_t *);
|
extern int zfs_fsync(znode_t *, int, cred_t *);
|
||||||
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
|
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
|
||||||
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
|
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
|
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
|
||||||
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
|
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
|
||||||
.\" Copyright (c) 2019 Datto Inc.
|
.\" Copyright (c) 2019 Datto Inc.
|
||||||
|
.\" Copyright (c) 2023, 2024 Klara, Inc.
|
||||||
.\" The contents of this file are subject to the terms of the Common Development
|
.\" The contents of this file are subject to the terms of the Common Development
|
||||||
.\" and Distribution License (the "License"). You may not use this file except
|
.\" and Distribution License (the "License"). You may not use this file except
|
||||||
.\" in compliance with the License. You can obtain a copy of the license at
|
.\" in compliance with the License. You can obtain a copy of the license at
|
||||||
|
@ -15,7 +16,7 @@
|
||||||
.\" own identifying information:
|
.\" own identifying information:
|
||||||
.\" Portions Copyright [yyyy] [name of copyright owner]
|
.\" Portions Copyright [yyyy] [name of copyright owner]
|
||||||
.\"
|
.\"
|
||||||
.Dd July 21, 2023
|
.Dd January 9, 2024
|
||||||
.Dt ZFS 4
|
.Dt ZFS 4
|
||||||
.Os
|
.Os
|
||||||
.
|
.
|
||||||
|
@ -530,6 +531,10 @@ However, this is limited by
|
||||||
Maximum micro ZAP size.
|
Maximum micro ZAP size.
|
||||||
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
|
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
|
||||||
.
|
.
|
||||||
|
.It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
|
||||||
|
Log2 fraction of holes in speculative prefetch stream allowed for it to
|
||||||
|
proceed.
|
||||||
|
.
|
||||||
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
|
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
|
||||||
Min bytes to prefetch per stream.
|
Min bytes to prefetch per stream.
|
||||||
Prefetch distance starts from the demand access size and quickly grows to
|
Prefetch distance starts from the demand access size and quickly grows to
|
||||||
|
@ -544,6 +549,13 @@ Max bytes to prefetch per stream.
|
||||||
.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
|
.It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
|
||||||
Max bytes to prefetch indirects for per stream.
|
Max bytes to prefetch indirects for per stream.
|
||||||
.
|
.
|
||||||
|
.It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
|
||||||
|
Requests within this byte distance from the current prefetch stream position
|
||||||
|
are considered parts of the stream, reordered due to parallel processing.
|
||||||
|
Such requests do not advance the stream position immediately unless
|
||||||
|
.Sy zfetch_hole_shift
|
||||||
|
fill threshold is reached, but saved to fill holes in the stream later.
|
||||||
|
.
|
||||||
.It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
|
.It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
|
||||||
Max number of streams per zfetch (prefetch streams per file).
|
Max number of streams per zfetch (prefetch streams per file).
|
||||||
.
|
.
|
||||||
|
@ -1142,6 +1154,15 @@ Enable the experimental block cloning feature.
|
||||||
If this setting is 0, then even if feature@block_cloning is enabled,
|
If this setting is 0, then even if feature@block_cloning is enabled,
|
||||||
attempts to clone blocks will act as though the feature is disabled.
|
attempts to clone blocks will act as though the feature is disabled.
|
||||||
.
|
.
|
||||||
|
.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||||
|
When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
|
||||||
|
written to disk.
|
||||||
|
This allows the clone operation to reliably succeed when a file is
|
||||||
|
modified and then immediately cloned.
|
||||||
|
For small files this may be slower than making a copy of the file.
|
||||||
|
Therefore, this setting defaults to 0 which causes a clone operation to
|
||||||
|
immediately fail when encountering a dirty block.
|
||||||
|
.
|
||||||
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
|
.It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
|
||||||
Select a BLAKE3 implementation.
|
Select a BLAKE3 implementation.
|
||||||
.Pp
|
.Pp
|
||||||
|
@ -1336,6 +1357,42 @@ _
|
||||||
4 Driver No driver retries on driver errors.
|
4 Driver No driver retries on driver errors.
|
||||||
.TE
|
.TE
|
||||||
.
|
.
|
||||||
|
.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
|
||||||
|
Maximum number of segments to add to a BIO (min 4).
|
||||||
|
If this is higher than the maximum allowed by the device queue or the kernel
|
||||||
|
itself, it will be clamped.
|
||||||
|
Setting it to zero will cause the kernel's ideal size to be used.
|
||||||
|
This parameter only applies on Linux.
|
||||||
|
This parameter is ignored if
|
||||||
|
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
|
||||||
|
.
|
||||||
|
.It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
|
||||||
|
Controls the method used to submit IO to the Linux block layer
|
||||||
|
(default
|
||||||
|
.Sy 1 "classic" Ns
|
||||||
|
)
|
||||||
|
.Pp
|
||||||
|
If set to 1, the "classic" method is used.
|
||||||
|
This is the method that has been in use since the earliest versions of
|
||||||
|
ZFS-on-Linux.
|
||||||
|
It has known issues with highly fragmented IO requests and is less efficient on
|
||||||
|
many workloads, but it well known and well understood.
|
||||||
|
.Pp
|
||||||
|
If set to 0, the "new" method is used.
|
||||||
|
This method is available since 2.2.4 and should resolve all known issues and be
|
||||||
|
far more efficient, but has not had as much testing.
|
||||||
|
In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
|
||||||
|
.Pp
|
||||||
|
It is not recommended that you change it except on advice from the OpenZFS
|
||||||
|
developers.
|
||||||
|
If you do change it, please also open a bug report describing why you did so,
|
||||||
|
including the workload involved and any error messages.
|
||||||
|
.Pp
|
||||||
|
This parameter and the "classic" submission method will be removed in a future
|
||||||
|
release of OpenZFS once we have total confidence in the new method.
|
||||||
|
.Pp
|
||||||
|
This parameter only applies on Linux, and can only be set at module load time.
|
||||||
|
.
|
||||||
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
|
||||||
Time before expiring
|
Time before expiring
|
||||||
.Pa .zfs/snapshot .
|
.Pa .zfs/snapshot .
|
||||||
|
|
|
@ -29,7 +29,7 @@
|
||||||
.\" Copyright 2018 Nexenta Systems, Inc.
|
.\" Copyright 2018 Nexenta Systems, Inc.
|
||||||
.\" Copyright 2019 Joyent, Inc.
|
.\" Copyright 2019 Joyent, Inc.
|
||||||
.\"
|
.\"
|
||||||
.Dd March 16, 2022
|
.Dd February 8, 2024
|
||||||
.Dt ZFS-LIST 8
|
.Dt ZFS-LIST 8
|
||||||
.Os
|
.Os
|
||||||
.
|
.
|
||||||
|
@ -155,6 +155,15 @@ or
|
||||||
For example, specifying
|
For example, specifying
|
||||||
.Fl t Sy snapshot
|
.Fl t Sy snapshot
|
||||||
displays only snapshots.
|
displays only snapshots.
|
||||||
|
.Sy fs ,
|
||||||
|
.Sy snap ,
|
||||||
|
or
|
||||||
|
.Sy vol
|
||||||
|
can be used as aliases for
|
||||||
|
.Sy filesystem ,
|
||||||
|
.Sy snapshot ,
|
||||||
|
or
|
||||||
|
.Sy volume .
|
||||||
.El
|
.El
|
||||||
.
|
.
|
||||||
.Sh EXAMPLES
|
.Sh EXAMPLES
|
||||||
|
|
|
@ -36,7 +36,7 @@
|
||||||
.Sh SYNOPSIS
|
.Sh SYNOPSIS
|
||||||
.Nm zpool
|
.Nm zpool
|
||||||
.Cm status
|
.Cm status
|
||||||
.Op Fl DigLpPstvx
|
.Op Fl DeigLpPstvx
|
||||||
.Op Fl T Sy u Ns | Ns Sy d
|
.Op Fl T Sy u Ns | Ns Sy d
|
||||||
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
|
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns …
|
||||||
.Oo Ar pool Oc Ns …
|
.Oo Ar pool Oc Ns …
|
||||||
|
@ -69,6 +69,8 @@ See the
|
||||||
option of
|
option of
|
||||||
.Nm zpool Cm iostat
|
.Nm zpool Cm iostat
|
||||||
for complete details.
|
for complete details.
|
||||||
|
.It Fl e
|
||||||
|
Only show unhealthy vdevs (not-ONLINE or with errors).
|
||||||
.It Fl i
|
.It Fl i
|
||||||
Display vdev initialization status.
|
Display vdev initialization status.
|
||||||
.It Fl g
|
.It Fl g
|
||||||
|
|
|
@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) {
|
||||||
|
|
||||||
static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
|
static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
|
||||||
int nparams = clLvalue(ci->func)->p->numparams;
|
int nparams = clLvalue(ci->func)->p->numparams;
|
||||||
if (n >= ci->u.l.base - ci->func - nparams)
|
int nvararg = cast_int(ci->u.l.base - ci->func) - nparams;
|
||||||
|
if (n <= -nvararg)
|
||||||
return NULL; /* no such vararg */
|
return NULL; /* no such vararg */
|
||||||
else {
|
else {
|
||||||
*pos = ci->func + nparams + n;
|
*pos = ci->func + nparams - n;
|
||||||
return "(*vararg)"; /* generic name for any vararg */
|
return "(*vararg)"; /* generic name for any vararg */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n,
|
||||||
StkId base;
|
StkId base;
|
||||||
if (isLua(ci)) {
|
if (isLua(ci)) {
|
||||||
if (n < 0) /* access to vararg values? */
|
if (n < 0) /* access to vararg values? */
|
||||||
return findvararg(ci, -n, pos);
|
return findvararg(ci, n, pos);
|
||||||
else {
|
else {
|
||||||
base = ci->u.l.base;
|
base = ci->u.l.base;
|
||||||
name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
|
name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
|
||||||
|
|
|
@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||||||
{
|
{
|
||||||
ASSERT(!abd_is_gang(abd));
|
ASSERT(!abd_is_gang(abd));
|
||||||
abd_verify(abd);
|
abd_verify(abd);
|
||||||
|
memset(aiter, 0, sizeof (struct abd_iter));
|
||||||
aiter->iter_abd = abd;
|
aiter->iter_abd = abd;
|
||||||
aiter->iter_pos = 0;
|
|
||||||
aiter->iter_mapaddr = NULL;
|
|
||||||
aiter->iter_mapsize = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -89,10 +89,6 @@ int zfs_debug_level;
|
||||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
|
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
|
||||||
"Debug level");
|
"Debug level");
|
||||||
|
|
||||||
int zfs_bclone_enabled = 0;
|
|
||||||
SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN,
|
|
||||||
&zfs_bclone_enabled, 0, "Enable block cloning");
|
|
||||||
|
|
||||||
struct zfs_jailparam {
|
struct zfs_jailparam {
|
||||||
int mount_snapshot;
|
int mount_snapshot;
|
||||||
};
|
};
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
|
||||||
* Copyright (c) 2019 by Delphix. All rights reserved.
|
* Copyright (c) 2019 by Delphix. All rights reserved.
|
||||||
|
* Copyright (c) 2023, 2024, Klara Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -59,7 +60,9 @@
|
||||||
#include <sys/zfs_znode.h>
|
#include <sys/zfs_znode.h>
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
#include <linux/kmap_compat.h>
|
#include <linux/kmap_compat.h>
|
||||||
|
#include <linux/mm_compat.h>
|
||||||
#include <linux/scatterlist.h>
|
#include <linux/scatterlist.h>
|
||||||
|
#include <linux/version.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _KERNEL
|
#ifdef _KERNEL
|
||||||
|
@ -895,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||||||
{
|
{
|
||||||
ASSERT(!abd_is_gang(abd));
|
ASSERT(!abd_is_gang(abd));
|
||||||
abd_verify(abd);
|
abd_verify(abd);
|
||||||
|
memset(aiter, 0, sizeof (struct abd_iter));
|
||||||
aiter->iter_abd = abd;
|
aiter->iter_abd = abd;
|
||||||
aiter->iter_mapaddr = NULL;
|
if (!abd_is_linear(abd)) {
|
||||||
aiter->iter_mapsize = 0;
|
|
||||||
aiter->iter_pos = 0;
|
|
||||||
if (abd_is_linear(abd)) {
|
|
||||||
aiter->iter_offset = 0;
|
|
||||||
aiter->iter_sg = NULL;
|
|
||||||
} else {
|
|
||||||
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
|
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
|
||||||
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
|
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
|
||||||
}
|
}
|
||||||
|
@ -915,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
|
||||||
boolean_t
|
boolean_t
|
||||||
abd_iter_at_end(struct abd_iter *aiter)
|
abd_iter_at_end(struct abd_iter *aiter)
|
||||||
{
|
{
|
||||||
|
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
|
||||||
return (aiter->iter_pos == aiter->iter_abd->abd_size);
|
return (aiter->iter_pos == aiter->iter_abd->abd_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -926,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter)
|
||||||
void
|
void
|
||||||
abd_iter_advance(struct abd_iter *aiter, size_t amount)
|
abd_iter_advance(struct abd_iter *aiter, size_t amount)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Ensure that last chunk is not in use. abd_iterate_*() must clear
|
||||||
|
* this state (directly or abd_iter_unmap()) before advancing.
|
||||||
|
*/
|
||||||
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
|
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
|
||||||
ASSERT0(aiter->iter_mapsize);
|
ASSERT0(aiter->iter_mapsize);
|
||||||
|
ASSERT3P(aiter->iter_page, ==, NULL);
|
||||||
|
ASSERT0(aiter->iter_page_doff);
|
||||||
|
ASSERT0(aiter->iter_page_dsize);
|
||||||
|
|
||||||
/* There's nothing left to advance to, so do nothing */
|
/* There's nothing left to advance to, so do nothing */
|
||||||
if (abd_iter_at_end(aiter))
|
if (abd_iter_at_end(aiter))
|
||||||
|
@ -1009,6 +1015,106 @@ abd_cache_reap_now(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_KERNEL)
|
#if defined(_KERNEL)
|
||||||
|
/*
|
||||||
|
* Yield the next page struct and data offset and size within it, without
|
||||||
|
* mapping it into the address space.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
abd_iter_page(struct abd_iter *aiter)
|
||||||
|
{
|
||||||
|
if (abd_iter_at_end(aiter)) {
|
||||||
|
aiter->iter_page = NULL;
|
||||||
|
aiter->iter_page_doff = 0;
|
||||||
|
aiter->iter_page_dsize = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct page *page;
|
||||||
|
size_t doff, dsize;
|
||||||
|
|
||||||
|
if (abd_is_linear(aiter->iter_abd)) {
|
||||||
|
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
|
||||||
|
|
||||||
|
/* memory address at iter_pos */
|
||||||
|
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
|
||||||
|
|
||||||
|
/* struct page for address */
|
||||||
|
page = is_vmalloc_addr(paddr) ?
|
||||||
|
vmalloc_to_page(paddr) : virt_to_page(paddr);
|
||||||
|
|
||||||
|
/* offset of address within the page */
|
||||||
|
doff = offset_in_page(paddr);
|
||||||
|
|
||||||
|
/* total data remaining in abd from this position */
|
||||||
|
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
|
||||||
|
} else {
|
||||||
|
ASSERT(!abd_is_gang(aiter->iter_abd));
|
||||||
|
|
||||||
|
/* current scatter page */
|
||||||
|
page = sg_page(aiter->iter_sg);
|
||||||
|
|
||||||
|
/* position within page */
|
||||||
|
doff = aiter->iter_offset;
|
||||||
|
|
||||||
|
/* remaining data in scatterlist */
|
||||||
|
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
|
||||||
|
aiter->iter_abd->abd_size - aiter->iter_pos);
|
||||||
|
}
|
||||||
|
ASSERT(page);
|
||||||
|
|
||||||
|
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
|
||||||
|
if (PageTail(page)) {
|
||||||
|
/*
|
||||||
|
* This page is part of a "compound page", which is a group of
|
||||||
|
* pages that can be referenced from a single struct page *.
|
||||||
|
* Its organised as a "head" page, followed by a series of
|
||||||
|
* "tail" pages.
|
||||||
|
*
|
||||||
|
* In OpenZFS, compound pages are allocated using the
|
||||||
|
* __GFP_COMP flag, which we get from scatter ABDs and SPL
|
||||||
|
* vmalloc slabs (ie >16K allocations). So a great many of the
|
||||||
|
* IO buffers we get are going to be of this type.
|
||||||
|
*
|
||||||
|
* The tail pages are just regular PAGE_SIZE pages, and can be
|
||||||
|
* safely used as-is. However, the head page has length
|
||||||
|
* covering itself and all the tail pages. If this ABD chunk
|
||||||
|
* spans multiple pages, then we can use the head page and a
|
||||||
|
* >PAGE_SIZE length, which is far more efficient.
|
||||||
|
*
|
||||||
|
* To do this, we need to adjust the offset to be counted from
|
||||||
|
* the head page. struct page for compound pages are stored
|
||||||
|
* contiguously, so we can just adjust by a simple offset.
|
||||||
|
*
|
||||||
|
* Before kernel 4.5, compound page heads were refcounted
|
||||||
|
* separately, such that moving back to the head page would
|
||||||
|
* require us to take a reference to it and releasing it once
|
||||||
|
* we're completely finished with it. In practice, that means
|
||||||
|
* when our caller is done with the ABD, which we have no
|
||||||
|
* insight into from here. Rather than contort this API to
|
||||||
|
* track head page references on such ancient kernels, we just
|
||||||
|
* compile this block out and use the tail pages directly. This
|
||||||
|
* is slightly less efficient, but makes everything far
|
||||||
|
* simpler.
|
||||||
|
*/
|
||||||
|
struct page *head = compound_head(page);
|
||||||
|
doff += ((page - head) * PAGESIZE);
|
||||||
|
page = head;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* final page and position within it */
|
||||||
|
aiter->iter_page = page;
|
||||||
|
aiter->iter_page_doff = doff;
|
||||||
|
|
||||||
|
/* amount of data in the chunk, up to the end of the page */
|
||||||
|
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: ABD BIO functions only needed to support vdev_classic. See comments in
|
||||||
|
* vdev_disk.c.
|
||||||
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* bio_nr_pages for ABD.
|
* bio_nr_pages for ABD.
|
||||||
* @off is the offset in @abd
|
* @off is the offset in @abd
|
||||||
|
@ -1163,4 +1269,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
|
||||||
module_param(zfs_abd_scatter_max_order, uint, 0644);
|
module_param(zfs_abd_scatter_max_order, uint, 0644);
|
||||||
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
|
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
|
||||||
"Maximum order allocation used for a scatter ABD.");
|
"Maximum order allocation used for a scatter ABD.");
|
||||||
#endif
|
|
||||||
|
#endif /* _KERNEL */
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
|
||||||
* LLNL-CODE-403049.
|
* LLNL-CODE-403049.
|
||||||
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
|
||||||
|
* Copyright (c) 2023, 2024, Klara Inc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <sys/zfs_context.h>
|
#include <sys/zfs_context.h>
|
||||||
|
@ -66,6 +67,13 @@ typedef struct vdev_disk {
|
||||||
krwlock_t vd_lock;
|
krwlock_t vd_lock;
|
||||||
} vdev_disk_t;
|
} vdev_disk_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Maximum number of segments to add to a bio (min 4). If this is higher than
|
||||||
|
* the maximum allowed by the device queue or the kernel itself, it will be
|
||||||
|
* clamped. Setting it to zero will cause the kernel's ideal size to be used.
|
||||||
|
*/
|
||||||
|
uint_t zfs_vdev_disk_max_segs = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Unique identifier for the exclusive vdev holder.
|
* Unique identifier for the exclusive vdev holder.
|
||||||
*/
|
*/
|
||||||
|
@ -83,17 +91,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
|
||||||
*/
|
*/
|
||||||
#define EFI_MIN_RESV_SIZE (16 * 1024)
|
#define EFI_MIN_RESV_SIZE (16 * 1024)
|
||||||
|
|
||||||
/*
|
|
||||||
* Virtual device vector for disks.
|
|
||||||
*/
|
|
||||||
typedef struct dio_request {
|
|
||||||
zio_t *dr_zio; /* Parent ZIO */
|
|
||||||
atomic_t dr_ref; /* References */
|
|
||||||
int dr_error; /* Bio error */
|
|
||||||
int dr_bio_count; /* Count of bio's */
|
|
||||||
struct bio *dr_bio[]; /* Attached bio's */
|
|
||||||
} dio_request_t;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* BIO request failfast mask.
|
* BIO request failfast mask.
|
||||||
*/
|
*/
|
||||||
|
@ -457,95 +454,15 @@ vdev_disk_close(vdev_t *v)
|
||||||
if (v->vdev_reopening || vd == NULL)
|
if (v->vdev_reopening || vd == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (vd->vd_bdh != NULL) {
|
if (vd->vd_bdh != NULL)
|
||||||
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
|
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
|
||||||
zfs_vdev_holder);
|
zfs_vdev_holder);
|
||||||
}
|
|
||||||
|
|
||||||
rw_destroy(&vd->vd_lock);
|
rw_destroy(&vd->vd_lock);
|
||||||
kmem_free(vd, sizeof (vdev_disk_t));
|
kmem_free(vd, sizeof (vdev_disk_t));
|
||||||
v->vdev_tsd = NULL;
|
v->vdev_tsd = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static dio_request_t *
|
|
||||||
vdev_disk_dio_alloc(int bio_count)
|
|
||||||
{
|
|
||||||
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
|
||||||
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
|
||||||
atomic_set(&dr->dr_ref, 0);
|
|
||||||
dr->dr_bio_count = bio_count;
|
|
||||||
dr->dr_error = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < dr->dr_bio_count; i++)
|
|
||||||
dr->dr_bio[i] = NULL;
|
|
||||||
|
|
||||||
return (dr);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
vdev_disk_dio_free(dio_request_t *dr)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < dr->dr_bio_count; i++)
|
|
||||||
if (dr->dr_bio[i])
|
|
||||||
bio_put(dr->dr_bio[i]);
|
|
||||||
|
|
||||||
kmem_free(dr, sizeof (dio_request_t) +
|
|
||||||
sizeof (struct bio *) * dr->dr_bio_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
vdev_disk_dio_get(dio_request_t *dr)
|
|
||||||
{
|
|
||||||
atomic_inc(&dr->dr_ref);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
vdev_disk_dio_put(dio_request_t *dr)
|
|
||||||
{
|
|
||||||
int rc = atomic_dec_return(&dr->dr_ref);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Free the dio_request when the last reference is dropped and
|
|
||||||
* ensure zio_interpret is called only once with the correct zio
|
|
||||||
*/
|
|
||||||
if (rc == 0) {
|
|
||||||
zio_t *zio = dr->dr_zio;
|
|
||||||
int error = dr->dr_error;
|
|
||||||
|
|
||||||
vdev_disk_dio_free(dr);
|
|
||||||
|
|
||||||
if (zio) {
|
|
||||||
zio->io_error = error;
|
|
||||||
ASSERT3S(zio->io_error, >=, 0);
|
|
||||||
if (zio->io_error)
|
|
||||||
vdev_disk_error(zio);
|
|
||||||
|
|
||||||
zio_delay_interrupt(zio);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
|
|
||||||
{
|
|
||||||
dio_request_t *dr = bio->bi_private;
|
|
||||||
|
|
||||||
if (dr->dr_error == 0) {
|
|
||||||
#ifdef HAVE_1ARG_BIO_END_IO_T
|
|
||||||
dr->dr_error = BIO_END_IO_ERROR(bio);
|
|
||||||
#else
|
|
||||||
if (error)
|
|
||||||
dr->dr_error = -(error);
|
|
||||||
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
|
||||||
dr->dr_error = EIO;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Drop reference acquired by __vdev_disk_physio */
|
|
||||||
vdev_disk_dio_put(dr);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
vdev_submit_bio_impl(struct bio *bio)
|
vdev_submit_bio_impl(struct bio *bio)
|
||||||
{
|
{
|
||||||
|
@ -697,8 +614,457 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
|
||||||
return (bio);
|
return (bio);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline uint_t
|
||||||
|
vdev_bio_max_segs(struct block_device *bdev)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Smallest of the device max segs and the tuneable max segs. Minimum
|
||||||
|
* 4, so there's room to finish split pages if they come up.
|
||||||
|
*/
|
||||||
|
const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
|
||||||
|
const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
|
||||||
|
MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
|
||||||
|
const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
|
||||||
|
|
||||||
|
#ifdef HAVE_BIO_MAX_SEGS
|
||||||
|
return (bio_max_segs(max_segs));
|
||||||
|
#else
|
||||||
|
return (MIN(max_segs, BIO_MAX_PAGES));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline uint_t
|
||||||
|
vdev_bio_max_bytes(struct block_device *bdev)
|
||||||
|
{
|
||||||
|
return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Virtual block IO object (VBIO)
|
||||||
|
*
|
||||||
|
* Linux block IO (BIO) objects have a limit on how many data segments (pages)
|
||||||
|
* they can hold. Depending on how they're allocated and structured, a large
|
||||||
|
* ZIO can require more than one BIO to be submitted to the kernel, which then
|
||||||
|
* all have to complete before we can return the completed ZIO back to ZFS.
|
||||||
|
*
|
||||||
|
* A VBIO is a wrapper around multiple BIOs, carrying everything needed to
|
||||||
|
* translate a ZIO down into the kernel block layer and back again.
|
||||||
|
*
|
||||||
|
* Note that these are only used for data ZIOs (read/write). Meta-operations
|
||||||
|
* (flush/trim) don't need multiple BIOs and so can just make the call
|
||||||
|
* directly.
|
||||||
|
*/
|
||||||
|
typedef struct {
|
||||||
|
zio_t *vbio_zio; /* parent zio */
|
||||||
|
|
||||||
|
struct block_device *vbio_bdev; /* blockdev to submit bios to */
|
||||||
|
|
||||||
|
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
|
||||||
|
|
||||||
|
uint_t vbio_max_segs; /* max segs per bio */
|
||||||
|
|
||||||
|
uint_t vbio_max_bytes; /* max bytes per bio */
|
||||||
|
uint_t vbio_lbs_mask; /* logical block size mask */
|
||||||
|
|
||||||
|
uint64_t vbio_offset; /* start offset of next bio */
|
||||||
|
|
||||||
|
struct bio *vbio_bio; /* pointer to the current bio */
|
||||||
|
int vbio_flags; /* bio flags */
|
||||||
|
} vbio_t;
|
||||||
|
|
||||||
|
static vbio_t *
|
||||||
|
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
|
||||||
|
{
|
||||||
|
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
|
||||||
|
|
||||||
|
vbio->vbio_zio = zio;
|
||||||
|
vbio->vbio_bdev = bdev;
|
||||||
|
vbio->vbio_abd = NULL;
|
||||||
|
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
|
||||||
|
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
|
||||||
|
vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
|
||||||
|
vbio->vbio_offset = zio->io_offset;
|
||||||
|
vbio->vbio_bio = NULL;
|
||||||
|
vbio->vbio_flags = flags;
|
||||||
|
|
||||||
|
return (vbio);
|
||||||
|
}
|
||||||
|
|
||||||
|
BIO_END_IO_PROTO(vbio_completion, bio, error);
|
||||||
|
|
||||||
|
static int
|
||||||
|
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
|
||||||
|
{
|
||||||
|
struct bio *bio = vbio->vbio_bio;
|
||||||
|
uint_t ssize;
|
||||||
|
|
||||||
|
while (size > 0) {
|
||||||
|
if (bio == NULL) {
|
||||||
|
/* New BIO, allocate and set up */
|
||||||
|
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
|
||||||
|
vbio->vbio_max_segs);
|
||||||
|
VERIFY(bio);
|
||||||
|
|
||||||
|
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
|
||||||
|
bio_set_op_attrs(bio,
|
||||||
|
vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
|
||||||
|
WRITE : READ, vbio->vbio_flags);
|
||||||
|
|
||||||
|
if (vbio->vbio_bio) {
|
||||||
|
bio_chain(vbio->vbio_bio, bio);
|
||||||
|
vdev_submit_bio(vbio->vbio_bio);
|
||||||
|
}
|
||||||
|
vbio->vbio_bio = bio;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Only load as much of the current page data as will fit in
|
||||||
|
* the space left in the BIO, respecting lbs alignment. Older
|
||||||
|
* kernels will error if we try to overfill the BIO, while
|
||||||
|
* newer ones will accept it and split the BIO. This ensures
|
||||||
|
* everything works on older kernels, and avoids an additional
|
||||||
|
* overhead on the new.
|
||||||
|
*/
|
||||||
|
ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
|
||||||
|
vbio->vbio_lbs_mask);
|
||||||
|
if (ssize > 0 &&
|
||||||
|
bio_add_page(bio, page, ssize, offset) == ssize) {
|
||||||
|
/* Accepted, adjust and load any remaining. */
|
||||||
|
size -= ssize;
|
||||||
|
offset += ssize;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No room, set up for a new BIO and loop */
|
||||||
|
vbio->vbio_offset += BIO_BI_SIZE(bio);
|
||||||
|
|
||||||
|
/* Signal new BIO allocation wanted */
|
||||||
|
bio = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Iterator callback to submit ABD pages to the vbio. */
|
||||||
|
static int
|
||||||
|
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||||
|
{
|
||||||
|
vbio_t *vbio = priv;
|
||||||
|
return (vbio_add_page(vbio, page, len, off));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create some BIOs, fill them with data and submit them */
|
||||||
|
static void
|
||||||
|
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* We plug so we can submit the BIOs as we go and only unplug them when
|
||||||
|
* they are fully created and submitted. This is important; if we don't
|
||||||
|
* plug, then the kernel may start executing earlier BIOs while we're
|
||||||
|
* still creating and executing later ones, and if the device goes
|
||||||
|
* away while that's happening, older kernels can get confused and
|
||||||
|
* trample memory.
|
||||||
|
*/
|
||||||
|
struct blk_plug plug;
|
||||||
|
blk_start_plug(&plug);
|
||||||
|
|
||||||
|
(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
|
||||||
|
ASSERT(vbio->vbio_bio);
|
||||||
|
|
||||||
|
vbio->vbio_bio->bi_end_io = vbio_completion;
|
||||||
|
vbio->vbio_bio->bi_private = vbio;
|
||||||
|
|
||||||
|
vdev_submit_bio(vbio->vbio_bio);
|
||||||
|
|
||||||
|
blk_finish_plug(&plug);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* IO completion callback */
|
||||||
|
BIO_END_IO_PROTO(vbio_completion, bio, error)
|
||||||
|
{
|
||||||
|
vbio_t *vbio = bio->bi_private;
|
||||||
|
zio_t *zio = vbio->vbio_zio;
|
||||||
|
|
||||||
|
ASSERT(zio);
|
||||||
|
|
||||||
|
/* Capture and log any errors */
|
||||||
|
#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||||
|
zio->io_error = BIO_END_IO_ERROR(bio);
|
||||||
|
#else
|
||||||
|
zio->io_error = 0;
|
||||||
|
if (error)
|
||||||
|
zio->io_error = -(error);
|
||||||
|
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||||
|
zio->io_error = EIO;
|
||||||
|
#endif
|
||||||
|
ASSERT3U(zio->io_error, >=, 0);
|
||||||
|
|
||||||
|
if (zio->io_error)
|
||||||
|
vdev_disk_error(zio);
|
||||||
|
|
||||||
|
/* Return the BIO to the kernel */
|
||||||
|
bio_put(bio);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we copied the ABD before issuing it, clean up and return the copy
|
||||||
|
* to the ADB, with changes if appropriate.
|
||||||
|
*/
|
||||||
|
if (vbio->vbio_abd != NULL) {
|
||||||
|
void *buf = abd_to_buf(vbio->vbio_abd);
|
||||||
|
abd_free(vbio->vbio_abd);
|
||||||
|
vbio->vbio_abd = NULL;
|
||||||
|
|
||||||
|
if (zio->io_type == ZIO_TYPE_READ)
|
||||||
|
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
|
||||||
|
else
|
||||||
|
abd_return_buf(zio->io_abd, buf, zio->io_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Final cleanup */
|
||||||
|
kmem_free(vbio, sizeof (vbio_t));
|
||||||
|
|
||||||
|
/* All done, submit for processing */
|
||||||
|
zio_delay_interrupt(zio);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Iterator callback to count ABD pages and check their size & alignment.
|
||||||
|
*
|
||||||
|
* On Linux, each BIO segment can take a page pointer, and an offset+length of
|
||||||
|
* the data within that page. A page can be arbitrarily large ("compound"
|
||||||
|
* pages) but we still have to ensure the data portion is correctly sized and
|
||||||
|
* aligned to the logical block size, to ensure that if the kernel wants to
|
||||||
|
* split the BIO, the two halves will still be properly aligned.
|
||||||
|
*/
|
||||||
|
typedef struct {
|
||||||
|
uint_t bmask;
|
||||||
|
uint_t npages;
|
||||||
|
uint_t end;
|
||||||
|
} vdev_disk_check_pages_t;
|
||||||
|
|
||||||
|
static int
|
||||||
|
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
|
||||||
|
{
|
||||||
|
vdev_disk_check_pages_t *s = priv;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we didn't finish on a block size boundary last time, then there
|
||||||
|
* would be a gap if we tried to use this ABD as-is, so abort.
|
||||||
|
*/
|
||||||
|
if (s->end != 0)
|
||||||
|
return (1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note if we're taking less than a full block, so we can check it
|
||||||
|
* above on the next call.
|
||||||
|
*/
|
||||||
|
s->end = len & s->bmask;
|
||||||
|
|
||||||
|
/* All blocks after the first must start on a block size boundary. */
|
||||||
|
if (s->npages != 0 && (off & s->bmask) != 0)
|
||||||
|
return (1);
|
||||||
|
|
||||||
|
s->npages++;
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check if we can submit the pages in this ABD to the kernel as-is. Returns
|
||||||
|
* the number of pages, or 0 if it can't be submitted like this.
|
||||||
|
*/
|
||||||
|
static boolean_t
|
||||||
|
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
|
||||||
|
{
|
||||||
|
vdev_disk_check_pages_t s = {
|
||||||
|
.bmask = bdev_logical_block_size(bdev)-1,
|
||||||
|
.npages = 0,
|
||||||
|
.end = 0,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
|
||||||
|
return (B_FALSE);
|
||||||
|
|
||||||
|
return (B_TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
vdev_disk_io_rw(zio_t *zio)
|
||||||
|
{
|
||||||
|
vdev_t *v = zio->io_vd;
|
||||||
|
vdev_disk_t *vd = v->vdev_tsd;
|
||||||
|
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
||||||
|
int flags = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Accessing outside the block device is never allowed.
|
||||||
|
*/
|
||||||
|
if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
|
||||||
|
vdev_dbgmsg(zio->io_vd,
|
||||||
|
"Illegal access %llu size %llu, device size %llu",
|
||||||
|
(u_longlong_t)zio->io_offset,
|
||||||
|
(u_longlong_t)zio->io_size,
|
||||||
|
(u_longlong_t)i_size_read(bdev->bd_inode));
|
||||||
|
return (SET_ERROR(EIO));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
||||||
|
v->vdev_failfast == B_TRUE) {
|
||||||
|
bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
|
||||||
|
zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check alignment of the incoming ABD. If any part of it would require
|
||||||
|
* submitting a page that is not aligned to the logical block size,
|
||||||
|
* then we take a copy into a linear buffer and submit that instead.
|
||||||
|
* This should be impossible on a 512b LBS, and fairly rare on 4K,
|
||||||
|
* usually requiring abnormally-small data blocks (eg gang blocks)
|
||||||
|
* mixed into the same ABD as larger ones (eg aggregated).
|
||||||
|
*/
|
||||||
|
abd_t *abd = zio->io_abd;
|
||||||
|
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
|
||||||
|
void *buf;
|
||||||
|
if (zio->io_type == ZIO_TYPE_READ)
|
||||||
|
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
|
||||||
|
else
|
||||||
|
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Wrap the copy in an abd_t, so we can use the same iterators
|
||||||
|
* to count and fill the vbio later.
|
||||||
|
*/
|
||||||
|
abd = abd_get_from_buf(buf, zio->io_size);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* False here would mean the borrowed copy has an invalid
|
||||||
|
* alignment too, which would mean we've somehow been passed a
|
||||||
|
* linear ABD with an interior page that has a non-zero offset
|
||||||
|
* or a size not a multiple of PAGE_SIZE. This is not possible.
|
||||||
|
* It would mean either zio_buf_alloc() or its underlying
|
||||||
|
* allocators have done something extremely strange, or our
|
||||||
|
* math in vdev_disk_check_pages() is wrong. In either case,
|
||||||
|
* something in seriously wrong and its not safe to continue.
|
||||||
|
*/
|
||||||
|
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
|
||||||
|
vbio_t *vbio = vbio_alloc(zio, bdev, flags);
|
||||||
|
if (abd != zio->io_abd)
|
||||||
|
vbio->vbio_abd = abd;
|
||||||
|
|
||||||
|
/* Fill it with data pages and submit it to the kernel */
|
||||||
|
vbio_submit(vbio, abd, zio->io_size);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ========== */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is the classic, battle-tested BIO submission code. Until we're totally
|
||||||
|
* sure that the new code is safe and correct in all cases, this will remain
|
||||||
|
* available.
|
||||||
|
*
|
||||||
|
* It is enabled by setting zfs_vdev_disk_classic=1 at module load time. It is
|
||||||
|
* enabled (=1) by default since 2.2.4, and disabled by default (=0) on master.
|
||||||
|
*
|
||||||
|
* These functions have been renamed to vdev_classic_* to make it clear what
|
||||||
|
* they belong to, but their implementations are unchanged.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Virtual device vector for disks.
|
||||||
|
*/
|
||||||
|
typedef struct dio_request {
|
||||||
|
zio_t *dr_zio; /* Parent ZIO */
|
||||||
|
atomic_t dr_ref; /* References */
|
||||||
|
int dr_error; /* Bio error */
|
||||||
|
int dr_bio_count; /* Count of bio's */
|
||||||
|
struct bio *dr_bio[]; /* Attached bio's */
|
||||||
|
} dio_request_t;
|
||||||
|
|
||||||
|
static dio_request_t *
|
||||||
|
vdev_classic_dio_alloc(int bio_count)
|
||||||
|
{
|
||||||
|
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||||
|
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||||
|
atomic_set(&dr->dr_ref, 0);
|
||||||
|
dr->dr_bio_count = bio_count;
|
||||||
|
dr->dr_error = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < dr->dr_bio_count; i++)
|
||||||
|
dr->dr_bio[i] = NULL;
|
||||||
|
|
||||||
|
return (dr);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vdev_classic_dio_free(dio_request_t *dr)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < dr->dr_bio_count; i++)
|
||||||
|
if (dr->dr_bio[i])
|
||||||
|
bio_put(dr->dr_bio[i]);
|
||||||
|
|
||||||
|
kmem_free(dr, sizeof (dio_request_t) +
|
||||||
|
sizeof (struct bio *) * dr->dr_bio_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vdev_classic_dio_get(dio_request_t *dr)
|
||||||
|
{
|
||||||
|
atomic_inc(&dr->dr_ref);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
vdev_classic_dio_put(dio_request_t *dr)
|
||||||
|
{
|
||||||
|
int rc = atomic_dec_return(&dr->dr_ref);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Free the dio_request when the last reference is dropped and
|
||||||
|
* ensure zio_interpret is called only once with the correct zio
|
||||||
|
*/
|
||||||
|
if (rc == 0) {
|
||||||
|
zio_t *zio = dr->dr_zio;
|
||||||
|
int error = dr->dr_error;
|
||||||
|
|
||||||
|
vdev_classic_dio_free(dr);
|
||||||
|
|
||||||
|
if (zio) {
|
||||||
|
zio->io_error = error;
|
||||||
|
ASSERT3S(zio->io_error, >=, 0);
|
||||||
|
if (zio->io_error)
|
||||||
|
vdev_disk_error(zio);
|
||||||
|
|
||||||
|
zio_delay_interrupt(zio);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
|
||||||
|
{
|
||||||
|
dio_request_t *dr = bio->bi_private;
|
||||||
|
|
||||||
|
if (dr->dr_error == 0) {
|
||||||
|
#ifdef HAVE_1ARG_BIO_END_IO_T
|
||||||
|
dr->dr_error = BIO_END_IO_ERROR(bio);
|
||||||
|
#else
|
||||||
|
if (error)
|
||||||
|
dr->dr_error = -(error);
|
||||||
|
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||||
|
dr->dr_error = EIO;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Drop reference acquired by vdev_classic_physio */
|
||||||
|
vdev_classic_dio_put(dr);
|
||||||
|
}
|
||||||
|
|
||||||
static inline unsigned int
|
static inline unsigned int
|
||||||
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||||
{
|
{
|
||||||
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
|
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
|
||||||
bio_size, abd_offset);
|
bio_size, abd_offset);
|
||||||
|
@ -711,9 +1077,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
vdev_classic_physio(zio_t *zio)
|
||||||
size_t io_size, uint64_t io_offset, int rw, int flags)
|
|
||||||
{
|
{
|
||||||
|
vdev_t *v = zio->io_vd;
|
||||||
|
vdev_disk_t *vd = v->vdev_tsd;
|
||||||
|
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
|
||||||
|
size_t io_size = zio->io_size;
|
||||||
|
uint64_t io_offset = zio->io_offset;
|
||||||
|
int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
|
||||||
|
int flags = 0;
|
||||||
|
|
||||||
dio_request_t *dr;
|
dio_request_t *dr;
|
||||||
uint64_t abd_offset;
|
uint64_t abd_offset;
|
||||||
uint64_t bio_offset;
|
uint64_t bio_offset;
|
||||||
|
@ -736,7 +1109,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
||||||
}
|
}
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
dr = vdev_disk_dio_alloc(bio_count);
|
dr = vdev_classic_dio_alloc(bio_count);
|
||||||
|
|
||||||
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
|
||||||
zio->io_vd->vdev_failfast == B_TRUE) {
|
zio->io_vd->vdev_failfast == B_TRUE) {
|
||||||
|
@ -771,23 +1144,23 @@ retry:
|
||||||
* this should be rare - see the comment above.
|
* this should be rare - see the comment above.
|
||||||
*/
|
*/
|
||||||
if (dr->dr_bio_count == i) {
|
if (dr->dr_bio_count == i) {
|
||||||
vdev_disk_dio_free(dr);
|
vdev_classic_dio_free(dr);
|
||||||
bio_count *= 2;
|
bio_count *= 2;
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
|
|
||||||
nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
|
nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
|
||||||
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
|
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
|
||||||
if (unlikely(dr->dr_bio[i] == NULL)) {
|
if (unlikely(dr->dr_bio[i] == NULL)) {
|
||||||
vdev_disk_dio_free(dr);
|
vdev_classic_dio_free(dr);
|
||||||
return (SET_ERROR(ENOMEM));
|
return (SET_ERROR(ENOMEM));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Matching put called by vdev_disk_physio_completion */
|
/* Matching put called by vdev_classic_physio_completion */
|
||||||
vdev_disk_dio_get(dr);
|
vdev_classic_dio_get(dr);
|
||||||
|
|
||||||
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
|
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
|
||||||
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
|
dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
|
||||||
dr->dr_bio[i]->bi_private = dr;
|
dr->dr_bio[i]->bi_private = dr;
|
||||||
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
|
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
|
||||||
|
|
||||||
|
@ -801,7 +1174,7 @@ retry:
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Extra reference to protect dio_request during vdev_submit_bio */
|
/* Extra reference to protect dio_request during vdev_submit_bio */
|
||||||
vdev_disk_dio_get(dr);
|
vdev_classic_dio_get(dr);
|
||||||
|
|
||||||
if (dr->dr_bio_count > 1)
|
if (dr->dr_bio_count > 1)
|
||||||
blk_start_plug(&plug);
|
blk_start_plug(&plug);
|
||||||
|
@ -815,11 +1188,13 @@ retry:
|
||||||
if (dr->dr_bio_count > 1)
|
if (dr->dr_bio_count > 1)
|
||||||
blk_finish_plug(&plug);
|
blk_finish_plug(&plug);
|
||||||
|
|
||||||
vdev_disk_dio_put(dr);
|
vdev_classic_dio_put(dr);
|
||||||
|
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ========== */
|
||||||
|
|
||||||
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
|
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
|
||||||
{
|
{
|
||||||
zio_t *zio = bio->bi_private;
|
zio_t *zio = bio->bi_private;
|
||||||
|
@ -928,12 +1303,14 @@ vdev_disk_io_trim(zio_t *zio)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
vdev_disk_io_start(zio_t *zio)
|
vdev_disk_io_start(zio_t *zio)
|
||||||
{
|
{
|
||||||
vdev_t *v = zio->io_vd;
|
vdev_t *v = zio->io_vd;
|
||||||
vdev_disk_t *vd = v->vdev_tsd;
|
vdev_disk_t *vd = v->vdev_tsd;
|
||||||
int rw, error;
|
int error;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
|
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
|
||||||
|
@ -996,13 +1373,6 @@ vdev_disk_io_start(zio_t *zio)
|
||||||
rw_exit(&vd->vd_lock);
|
rw_exit(&vd->vd_lock);
|
||||||
zio_execute(zio);
|
zio_execute(zio);
|
||||||
return;
|
return;
|
||||||
case ZIO_TYPE_WRITE:
|
|
||||||
rw = WRITE;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ZIO_TYPE_READ:
|
|
||||||
rw = READ;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ZIO_TYPE_TRIM:
|
case ZIO_TYPE_TRIM:
|
||||||
zio->io_error = vdev_disk_io_trim(zio);
|
zio->io_error = vdev_disk_io_trim(zio);
|
||||||
|
@ -1015,23 +1385,34 @@ vdev_disk_io_start(zio_t *zio)
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
case ZIO_TYPE_READ:
|
||||||
|
case ZIO_TYPE_WRITE:
|
||||||
|
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
||||||
|
error = vdev_disk_io_rw_fn(zio);
|
||||||
|
rw_exit(&vd->vd_lock);
|
||||||
|
if (error) {
|
||||||
|
zio->io_error = error;
|
||||||
|
zio_interrupt(zio);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
/*
|
||||||
|
* Getting here means our parent vdev has made a very strange
|
||||||
|
* request of us, and shouldn't happen. Assert here to force a
|
||||||
|
* crash in dev builds, but in production return the IO
|
||||||
|
* unhandled. The pool will likely suspend anyway but that's
|
||||||
|
* nicer than crashing the kernel.
|
||||||
|
*/
|
||||||
|
ASSERT3S(zio->io_type, ==, -1);
|
||||||
|
|
||||||
rw_exit(&vd->vd_lock);
|
rw_exit(&vd->vd_lock);
|
||||||
zio->io_error = SET_ERROR(ENOTSUP);
|
zio->io_error = SET_ERROR(ENOTSUP);
|
||||||
zio_interrupt(zio);
|
zio_interrupt(zio);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
zio->io_target_timestamp = zio_handle_io_delay(zio);
|
__builtin_unreachable();
|
||||||
error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
|
|
||||||
zio->io_size, zio->io_offset, rw, 0);
|
|
||||||
rw_exit(&vd->vd_lock);
|
|
||||||
|
|
||||||
if (error) {
|
|
||||||
zio->io_error = error;
|
|
||||||
zio_interrupt(zio);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -1080,8 +1461,49 @@ vdev_disk_rele(vdev_t *vd)
|
||||||
/* XXX: Implement me as a vnode rele for the device */
|
/* XXX: Implement me as a vnode rele for the device */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BIO submission method. See comment above about vdev_classic.
|
||||||
|
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
|
||||||
|
*/
|
||||||
|
static uint_t zfs_vdev_disk_classic = 1; /* default classic */
|
||||||
|
|
||||||
|
/* Set submission function from module parameter */
|
||||||
|
static int
|
||||||
|
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
|
||||||
|
{
|
||||||
|
int err = param_set_uint(buf, kp);
|
||||||
|
if (err < 0)
|
||||||
|
return (SET_ERROR(err));
|
||||||
|
|
||||||
|
vdev_disk_io_rw_fn =
|
||||||
|
zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
|
||||||
|
|
||||||
|
printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
|
||||||
|
zfs_vdev_disk_classic ? "classic" : "new");
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* At first use vdev use, set the submission function from the default value if
|
||||||
|
* it hasn't been set already.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
|
||||||
|
{
|
||||||
|
(void) spa;
|
||||||
|
(void) nv;
|
||||||
|
(void) tsd;
|
||||||
|
|
||||||
|
if (vdev_disk_io_rw_fn == NULL)
|
||||||
|
vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
|
||||||
|
vdev_classic_physio : vdev_disk_io_rw;
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
vdev_ops_t vdev_disk_ops = {
|
vdev_ops_t vdev_disk_ops = {
|
||||||
.vdev_op_init = NULL,
|
.vdev_op_init = vdev_disk_init,
|
||||||
.vdev_op_fini = NULL,
|
.vdev_op_fini = NULL,
|
||||||
.vdev_op_open = vdev_disk_open,
|
.vdev_op_open = vdev_disk_open,
|
||||||
.vdev_op_close = vdev_disk_close,
|
.vdev_op_close = vdev_disk_close,
|
||||||
|
@ -1174,3 +1596,10 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
|
||||||
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
|
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
|
||||||
|
"Maximum number of data segments to add to an IO request (min 4)");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
|
||||||
|
vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
|
||||||
|
"Use classic BIO submission method");
|
||||||
|
|
|
@ -3821,11 +3821,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
|
||||||
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
||||||
zfs_sa_upgrade_txholds(tx, zp);
|
zfs_sa_upgrade_txholds(tx, zp);
|
||||||
|
|
||||||
err = dmu_tx_assign(tx, TXG_NOWAIT);
|
err = dmu_tx_assign(tx, TXG_WAIT);
|
||||||
if (err != 0) {
|
if (err != 0) {
|
||||||
if (err == ERESTART)
|
|
||||||
dmu_tx_wait(tx);
|
|
||||||
|
|
||||||
dmu_tx_abort(tx);
|
dmu_tx_abort(tx);
|
||||||
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
|
||||||
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
|
filemap_dirty_folio(page_mapping(pp), page_folio(pp));
|
||||||
|
@ -4277,9 +4274,4 @@ EXPORT_SYMBOL(zfs_map);
|
||||||
/* CSTYLED */
|
/* CSTYLED */
|
||||||
module_param(zfs_delete_blocks, ulong, 0644);
|
module_param(zfs_delete_blocks, ulong, 0644);
|
||||||
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
|
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
|
||||||
|
|
||||||
/* CSTYLED */
|
|
||||||
module_param(zfs_bclone_enabled, uint, 0644);
|
|
||||||
MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning");
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
|
||||||
{
|
{
|
||||||
boolean_t *for_sync = data;
|
boolean_t *for_sync = data;
|
||||||
fstrans_cookie_t cookie;
|
fstrans_cookie_t cookie;
|
||||||
|
int ret;
|
||||||
|
|
||||||
ASSERT(PageLocked(pp));
|
ASSERT(PageLocked(pp));
|
||||||
ASSERT(!PageWriteback(pp));
|
ASSERT(!PageWriteback(pp));
|
||||||
|
|
||||||
cookie = spl_fstrans_mark();
|
cookie = spl_fstrans_mark();
|
||||||
(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
|
||||||
spl_fstrans_unmark(cookie);
|
spl_fstrans_unmark(cookie);
|
||||||
|
|
||||||
return (0);
|
return (ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_WRITEPAGE_T_FOLIO
|
#ifdef HAVE_WRITEPAGE_T_FOLIO
|
||||||
static int
|
static int
|
||||||
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
|
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
|
||||||
{
|
{
|
||||||
(void) zpl_putpage(&pp->page, wbc, data);
|
return (zpl_putpage(&pp->page, wbc, data));
|
||||||
return (0);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -26,13 +26,14 @@
|
||||||
#include <linux/compat.h>
|
#include <linux/compat.h>
|
||||||
#endif
|
#endif
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
|
#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
|
||||||
|
#include <linux/splice.h>
|
||||||
|
#endif
|
||||||
#include <sys/file.h>
|
#include <sys/file.h>
|
||||||
#include <sys/zfs_znode.h>
|
#include <sys/zfs_znode.h>
|
||||||
#include <sys/zfs_vnops.h>
|
#include <sys/zfs_vnops.h>
|
||||||
#include <sys/zfeature.h>
|
#include <sys/zfeature.h>
|
||||||
|
|
||||||
int zfs_bclone_enabled = 0;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Clone part of a file via block cloning.
|
* Clone part of a file via block cloning.
|
||||||
*
|
*
|
||||||
|
@ -40,7 +41,7 @@ int zfs_bclone_enabled = 0;
|
||||||
* care of that depending on how it was called.
|
* care of that depending on how it was called.
|
||||||
*/
|
*/
|
||||||
static ssize_t
|
static ssize_t
|
||||||
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
|
zpl_clone_file_range_impl(struct file *src_file, loff_t src_off,
|
||||||
struct file *dst_file, loff_t dst_off, size_t len)
|
struct file *dst_file, loff_t dst_off, size_t len)
|
||||||
{
|
{
|
||||||
struct inode *src_i = file_inode(src_file);
|
struct inode *src_i = file_inode(src_file);
|
||||||
|
@ -96,14 +97,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||||
{
|
{
|
||||||
ssize_t ret;
|
ssize_t ret;
|
||||||
|
|
||||||
|
/* Flags is reserved for future extensions and must be zero. */
|
||||||
if (flags != 0)
|
if (flags != 0)
|
||||||
return (-EINVAL);
|
return (-EINVAL);
|
||||||
|
|
||||||
/* Try to do it via zfs_clone_range() */
|
/* Try to do it via zfs_clone_range() and allow shortening. */
|
||||||
ret = __zpl_clone_file_range(src_file, src_off,
|
ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||||
dst_file, dst_off, len);
|
dst_file, dst_off, len);
|
||||||
|
|
||||||
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
|
#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
|
||||||
/*
|
/*
|
||||||
* Since Linux 5.3 the filesystem driver is responsible for executing
|
* Since Linux 5.3 the filesystem driver is responsible for executing
|
||||||
* an appropriate fallback, and a generic fallback function is provided.
|
* an appropriate fallback, and a generic fallback function is provided.
|
||||||
|
@ -112,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||||
ret == -EAGAIN)
|
ret == -EAGAIN)
|
||||||
ret = generic_copy_file_range(src_file, src_off, dst_file,
|
ret = generic_copy_file_range(src_file, src_off, dst_file,
|
||||||
dst_off, len, flags);
|
dst_off, len, flags);
|
||||||
|
#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
|
||||||
|
/*
|
||||||
|
* Since 6.8 the fallback function is called splice_copy_file_range
|
||||||
|
* and has a slightly different signature.
|
||||||
|
*/
|
||||||
|
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
|
||||||
|
ret == -EAGAIN)
|
||||||
|
ret = splice_copy_file_range(src_file, src_off, dst_file,
|
||||||
|
dst_off, len);
|
||||||
#else
|
#else
|
||||||
/*
|
/*
|
||||||
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
|
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
|
||||||
|
@ -119,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||||
*/
|
*/
|
||||||
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
|
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
|
||||||
ret = -EOPNOTSUPP;
|
ret = -EOPNOTSUPP;
|
||||||
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
|
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
|
||||||
|
|
||||||
return (ret);
|
return (ret);
|
||||||
}
|
}
|
||||||
|
@ -137,6 +148,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
|
||||||
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
|
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
|
||||||
* range in both files and if they're the same, arrange for them to be backed
|
* range in both files and if they're the same, arrange for them to be backed
|
||||||
* by the same storage.
|
* by the same storage.
|
||||||
|
*
|
||||||
|
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range
|
||||||
|
* if we want. It's designed for filesystems that may need to shorten the
|
||||||
|
* length for alignment, EOF, or any other requirement. ZFS may shorten the
|
||||||
|
* request when there is outstanding dirty data which hasn't been written.
|
||||||
*/
|
*/
|
||||||
loff_t
|
loff_t
|
||||||
zpl_remap_file_range(struct file *src_file, loff_t src_off,
|
zpl_remap_file_range(struct file *src_file, loff_t src_off,
|
||||||
|
@ -145,24 +161,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off,
|
||||||
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
|
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
|
||||||
return (-EINVAL);
|
return (-EINVAL);
|
||||||
|
|
||||||
/*
|
/* No support for dedup yet */
|
||||||
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
|
|
||||||
* range if we want. Its designed for filesystems that make data past
|
|
||||||
* EOF available, and don't want it to be visible in both files. ZFS
|
|
||||||
* doesn't do that, so we just turn the flag off.
|
|
||||||
*/
|
|
||||||
flags &= ~REMAP_FILE_CAN_SHORTEN;
|
|
||||||
|
|
||||||
if (flags & REMAP_FILE_DEDUP)
|
if (flags & REMAP_FILE_DEDUP)
|
||||||
/* No support for dedup yet */
|
|
||||||
return (-EOPNOTSUPP);
|
return (-EOPNOTSUPP);
|
||||||
|
|
||||||
/* Zero length means to clone everything to the end of the file */
|
/* Zero length means to clone everything to the end of the file */
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
len = i_size_read(file_inode(src_file)) - src_off;
|
len = i_size_read(file_inode(src_file)) - src_off;
|
||||||
|
|
||||||
return (__zpl_clone_file_range(src_file, src_off,
|
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||||
dst_file, dst_off, len));
|
dst_file, dst_off, len);
|
||||||
|
|
||||||
|
if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len)
|
||||||
|
ret = -EINVAL;
|
||||||
|
|
||||||
|
return (ret);
|
||||||
}
|
}
|
||||||
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
|
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
|
||||||
|
|
||||||
|
@ -179,8 +192,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off,
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
len = i_size_read(file_inode(src_file)) - src_off;
|
len = i_size_read(file_inode(src_file)) - src_off;
|
||||||
|
|
||||||
return (__zpl_clone_file_range(src_file, src_off,
|
/* The entire length must be cloned or this is an error. */
|
||||||
dst_file, dst_off, len));
|
ssize_t ret = zpl_clone_file_range_impl(src_file, src_off,
|
||||||
|
dst_file, dst_off, len);
|
||||||
|
|
||||||
|
if (ret >= 0 && ret != len)
|
||||||
|
ret = -EINVAL;
|
||||||
|
|
||||||
|
return (ret);
|
||||||
}
|
}
|
||||||
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
|
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
|
||||||
|
|
||||||
|
@ -214,8 +233,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg)
|
||||||
|
|
||||||
size_t len = i_size_read(file_inode(src_file));
|
size_t len = i_size_read(file_inode(src_file));
|
||||||
|
|
||||||
ssize_t ret =
|
ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len);
|
||||||
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
|
|
||||||
|
|
||||||
fput(src_file);
|
fput(src_file);
|
||||||
|
|
||||||
|
@ -253,7 +271,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
|
||||||
if (len == 0)
|
if (len == 0)
|
||||||
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
|
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
|
||||||
|
|
||||||
ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
|
ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset,
|
||||||
dst_file, fcr.fcr_dest_offset, len);
|
dst_file, fcr.fcr_dest_offset, len);
|
||||||
|
|
||||||
fput(src_file);
|
fput(src_file);
|
||||||
|
|
|
@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
|
||||||
return (ret);
|
return (ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(_KERNEL)
|
||||||
|
int
|
||||||
|
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
|
||||||
|
abd_iter_page_func_t *func, void *private)
|
||||||
|
{
|
||||||
|
struct abd_iter aiter;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
if (size == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
|
abd_verify(abd);
|
||||||
|
ASSERT3U(off + size, <=, abd->abd_size);
|
||||||
|
|
||||||
|
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
|
||||||
|
|
||||||
|
while (size > 0) {
|
||||||
|
IMPLY(abd_is_gang(abd), c_abd != NULL);
|
||||||
|
|
||||||
|
abd_iter_page(&aiter);
|
||||||
|
|
||||||
|
size_t len = MIN(aiter.iter_page_dsize, size);
|
||||||
|
ASSERT3U(len, >, 0);
|
||||||
|
|
||||||
|
ret = func(aiter.iter_page, aiter.iter_page_doff,
|
||||||
|
len, private);
|
||||||
|
|
||||||
|
aiter.iter_page = NULL;
|
||||||
|
aiter.iter_page_doff = 0;
|
||||||
|
aiter.iter_page_dsize = 0;
|
||||||
|
|
||||||
|
if (ret != 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
size -= len;
|
||||||
|
c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (ret);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
struct buf_arg {
|
struct buf_arg {
|
||||||
void *arg_buf;
|
void *arg_buf;
|
||||||
};
|
};
|
||||||
|
|
189
module/zfs/arc.c
189
module/zfs/arc.c
|
@ -3883,7 +3883,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
|
||||||
|
|
||||||
ASSERT3P(marker, !=, NULL);
|
ASSERT3P(marker, !=, NULL);
|
||||||
|
|
||||||
mls = multilist_sublist_lock(ml, idx);
|
mls = multilist_sublist_lock_idx(ml, idx);
|
||||||
|
|
||||||
for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
|
for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
|
||||||
hdr = multilist_sublist_prev(mls, marker)) {
|
hdr = multilist_sublist_prev(mls, marker)) {
|
||||||
|
@ -3995,6 +3995,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
|
||||||
return (bytes_evicted);
|
return (bytes_evicted);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static arc_buf_hdr_t *
|
||||||
|
arc_state_alloc_marker(void)
|
||||||
|
{
|
||||||
|
arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A b_spa of 0 is used to indicate that this header is
|
||||||
|
* a marker. This fact is used in arc_evict_state_impl().
|
||||||
|
*/
|
||||||
|
marker->b_spa = 0;
|
||||||
|
|
||||||
|
return (marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
arc_state_free_marker(arc_buf_hdr_t *marker)
|
||||||
|
{
|
||||||
|
kmem_cache_free(hdr_full_cache, marker);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Allocate an array of buffer headers used as placeholders during arc state
|
* Allocate an array of buffer headers used as placeholders during arc state
|
||||||
* eviction.
|
* eviction.
|
||||||
|
@ -4005,16 +4025,8 @@ arc_state_alloc_markers(int count)
|
||||||
arc_buf_hdr_t **markers;
|
arc_buf_hdr_t **markers;
|
||||||
|
|
||||||
markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
|
markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++)
|
||||||
markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
|
markers[i] = arc_state_alloc_marker();
|
||||||
|
|
||||||
/*
|
|
||||||
* A b_spa of 0 is used to indicate that this header is
|
|
||||||
* a marker. This fact is used in arc_evict_state_impl().
|
|
||||||
*/
|
|
||||||
markers[i]->b_spa = 0;
|
|
||||||
|
|
||||||
}
|
|
||||||
return (markers);
|
return (markers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4022,7 +4034,7 @@ static void
|
||||||
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
|
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < count; i++)
|
for (int i = 0; i < count; i++)
|
||||||
kmem_cache_free(hdr_full_cache, markers[i]);
|
arc_state_free_marker(markers[i]);
|
||||||
kmem_free(markers, sizeof (*markers) * count);
|
kmem_free(markers, sizeof (*markers) * count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4066,7 +4078,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
|
||||||
for (int i = 0; i < num_sublists; i++) {
|
for (int i = 0; i < num_sublists; i++) {
|
||||||
multilist_sublist_t *mls;
|
multilist_sublist_t *mls;
|
||||||
|
|
||||||
mls = multilist_sublist_lock(ml, i);
|
mls = multilist_sublist_lock_idx(ml, i);
|
||||||
multilist_sublist_insert_tail(mls, markers[i]);
|
multilist_sublist_insert_tail(mls, markers[i]);
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
}
|
}
|
||||||
|
@ -4131,7 +4143,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < num_sublists; i++) {
|
for (int i = 0; i < num_sublists; i++) {
|
||||||
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
|
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
|
||||||
multilist_sublist_remove(mls, markers[i]);
|
multilist_sublist_remove(mls, markers[i]);
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
}
|
}
|
||||||
|
@ -8639,7 +8651,7 @@ l2arc_sublist_lock(int list_num)
|
||||||
* sublists being selected.
|
* sublists being selected.
|
||||||
*/
|
*/
|
||||||
idx = multilist_get_random_index(ml);
|
idx = multilist_get_random_index(ml);
|
||||||
return (multilist_sublist_lock(ml, idx));
|
return (multilist_sublist_lock_idx(ml, idx));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -9051,9 +9063,9 @@ l2arc_blk_fetch_done(zio_t *zio)
|
||||||
static uint64_t
|
static uint64_t
|
||||||
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
{
|
{
|
||||||
arc_buf_hdr_t *hdr, *hdr_prev, *head;
|
arc_buf_hdr_t *hdr, *head, *marker;
|
||||||
uint64_t write_asize, write_psize, write_lsize, headroom;
|
uint64_t write_asize, write_psize, headroom;
|
||||||
boolean_t full;
|
boolean_t full, from_head = !arc_warm;
|
||||||
l2arc_write_callback_t *cb = NULL;
|
l2arc_write_callback_t *cb = NULL;
|
||||||
zio_t *pio, *wzio;
|
zio_t *pio, *wzio;
|
||||||
uint64_t guid = spa_load_guid(spa);
|
uint64_t guid = spa_load_guid(spa);
|
||||||
|
@ -9062,10 +9074,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
ASSERT3P(dev->l2ad_vdev, !=, NULL);
|
ASSERT3P(dev->l2ad_vdev, !=, NULL);
|
||||||
|
|
||||||
pio = NULL;
|
pio = NULL;
|
||||||
write_lsize = write_asize = write_psize = 0;
|
write_asize = write_psize = 0;
|
||||||
full = B_FALSE;
|
full = B_FALSE;
|
||||||
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
|
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
|
||||||
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
|
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
|
||||||
|
marker = arc_state_alloc_marker();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copy buffers for L2ARC writing.
|
* Copy buffers for L2ARC writing.
|
||||||
|
@ -9080,40 +9093,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
|
|
||||||
uint64_t passed_sz = 0;
|
uint64_t passed_sz = 0;
|
||||||
|
|
||||||
VERIFY3P(mls, !=, NULL);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* L2ARC fast warmup.
|
|
||||||
*
|
|
||||||
* Until the ARC is warm and starts to evict, read from the
|
|
||||||
* head of the ARC lists rather than the tail.
|
|
||||||
*/
|
|
||||||
if (arc_warm == B_FALSE)
|
|
||||||
hdr = multilist_sublist_head(mls);
|
|
||||||
else
|
|
||||||
hdr = multilist_sublist_tail(mls);
|
|
||||||
|
|
||||||
headroom = target_sz * l2arc_headroom;
|
headroom = target_sz * l2arc_headroom;
|
||||||
if (zfs_compressed_arc_enabled)
|
if (zfs_compressed_arc_enabled)
|
||||||
headroom = (headroom * l2arc_headroom_boost) / 100;
|
headroom = (headroom * l2arc_headroom_boost) / 100;
|
||||||
|
|
||||||
for (; hdr; hdr = hdr_prev) {
|
/*
|
||||||
|
* Until the ARC is warm and starts to evict, read from the
|
||||||
|
* head of the ARC lists rather than the tail.
|
||||||
|
*/
|
||||||
|
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
|
||||||
|
ASSERT3P(mls, !=, NULL);
|
||||||
|
if (from_head)
|
||||||
|
hdr = multilist_sublist_head(mls);
|
||||||
|
else
|
||||||
|
hdr = multilist_sublist_tail(mls);
|
||||||
|
|
||||||
|
while (hdr != NULL) {
|
||||||
kmutex_t *hash_lock;
|
kmutex_t *hash_lock;
|
||||||
abd_t *to_write = NULL;
|
abd_t *to_write = NULL;
|
||||||
|
|
||||||
if (arc_warm == B_FALSE)
|
|
||||||
hdr_prev = multilist_sublist_next(mls, hdr);
|
|
||||||
else
|
|
||||||
hdr_prev = multilist_sublist_prev(mls, hdr);
|
|
||||||
|
|
||||||
hash_lock = HDR_LOCK(hdr);
|
hash_lock = HDR_LOCK(hdr);
|
||||||
if (!mutex_tryenter(hash_lock)) {
|
if (!mutex_tryenter(hash_lock)) {
|
||||||
/*
|
skip:
|
||||||
* Skip this buffer rather than waiting.
|
/* Skip this buffer rather than waiting. */
|
||||||
*/
|
if (from_head)
|
||||||
|
hdr = multilist_sublist_next(mls, hdr);
|
||||||
|
else
|
||||||
|
hdr = multilist_sublist_prev(mls, hdr);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9128,11 +9135,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
|
|
||||||
if (!l2arc_write_eligible(guid, hdr)) {
|
if (!l2arc_write_eligible(guid, hdr)) {
|
||||||
mutex_exit(hash_lock);
|
mutex_exit(hash_lock);
|
||||||
continue;
|
goto skip;
|
||||||
}
|
}
|
||||||
|
|
||||||
ASSERT(HDR_HAS_L1HDR(hdr));
|
ASSERT(HDR_HAS_L1HDR(hdr));
|
||||||
|
|
||||||
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
|
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
|
||||||
ASSERT3U(arc_hdr_size(hdr), >, 0);
|
ASSERT3U(arc_hdr_size(hdr), >, 0);
|
||||||
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
|
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
|
||||||
|
@ -9154,12 +9160,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We rely on the L1 portion of the header below, so
|
* We should not sleep with sublist lock held or it
|
||||||
* it's invalid for this header to have been evicted out
|
* may block ARC eviction. Insert a marker to save
|
||||||
* of the ghost cache, prior to being written out. The
|
* the position and drop the lock.
|
||||||
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
|
|
||||||
*/
|
*/
|
||||||
arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
|
if (from_head) {
|
||||||
|
multilist_sublist_insert_after(mls, hdr,
|
||||||
|
marker);
|
||||||
|
} else {
|
||||||
|
multilist_sublist_insert_before(mls, hdr,
|
||||||
|
marker);
|
||||||
|
}
|
||||||
|
multilist_sublist_unlock(mls);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this header has b_rabd, we can use this since it
|
* If this header has b_rabd, we can use this since it
|
||||||
|
@ -9190,32 +9202,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
&to_write);
|
&to_write);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
arc_hdr_clear_flags(hdr,
|
arc_hdr_clear_flags(hdr,
|
||||||
ARC_FLAG_L2_WRITING);
|
ARC_FLAG_L2CACHE);
|
||||||
mutex_exit(hash_lock);
|
mutex_exit(hash_lock);
|
||||||
continue;
|
goto next;
|
||||||
}
|
}
|
||||||
|
|
||||||
l2arc_free_abd_on_write(to_write, asize, type);
|
l2arc_free_abd_on_write(to_write, asize, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
hdr->b_l2hdr.b_dev = dev;
|
||||||
|
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
|
||||||
|
hdr->b_l2hdr.b_hits = 0;
|
||||||
|
hdr->b_l2hdr.b_arcs_state =
|
||||||
|
hdr->b_l1hdr.b_state->arcs_state;
|
||||||
|
mutex_enter(&dev->l2ad_mtx);
|
||||||
if (pio == NULL) {
|
if (pio == NULL) {
|
||||||
/*
|
/*
|
||||||
* Insert a dummy header on the buflist so
|
* Insert a dummy header on the buflist so
|
||||||
* l2arc_write_done() can find where the
|
* l2arc_write_done() can find where the
|
||||||
* write buffers begin without searching.
|
* write buffers begin without searching.
|
||||||
*/
|
*/
|
||||||
mutex_enter(&dev->l2ad_mtx);
|
|
||||||
list_insert_head(&dev->l2ad_buflist, head);
|
list_insert_head(&dev->l2ad_buflist, head);
|
||||||
mutex_exit(&dev->l2ad_mtx);
|
}
|
||||||
|
list_insert_head(&dev->l2ad_buflist, hdr);
|
||||||
|
mutex_exit(&dev->l2ad_mtx);
|
||||||
|
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
|
||||||
|
ARC_FLAG_L2_WRITING);
|
||||||
|
|
||||||
|
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
|
||||||
|
arc_hdr_size(hdr), hdr);
|
||||||
|
l2arc_hdr_arcstats_increment(hdr);
|
||||||
|
|
||||||
|
boolean_t commit = l2arc_log_blk_insert(dev, hdr);
|
||||||
|
mutex_exit(hash_lock);
|
||||||
|
|
||||||
|
if (pio == NULL) {
|
||||||
cb = kmem_alloc(
|
cb = kmem_alloc(
|
||||||
sizeof (l2arc_write_callback_t), KM_SLEEP);
|
sizeof (l2arc_write_callback_t), KM_SLEEP);
|
||||||
cb->l2wcb_dev = dev;
|
cb->l2wcb_dev = dev;
|
||||||
cb->l2wcb_head = head;
|
cb->l2wcb_head = head;
|
||||||
/*
|
|
||||||
* Create a list to save allocated abd buffers
|
|
||||||
* for l2arc_log_blk_commit().
|
|
||||||
*/
|
|
||||||
list_create(&cb->l2wcb_abd_list,
|
list_create(&cb->l2wcb_abd_list,
|
||||||
sizeof (l2arc_lb_abd_buf_t),
|
sizeof (l2arc_lb_abd_buf_t),
|
||||||
offsetof(l2arc_lb_abd_buf_t, node));
|
offsetof(l2arc_lb_abd_buf_t, node));
|
||||||
|
@ -9223,54 +9248,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
ZIO_FLAG_CANFAIL);
|
ZIO_FLAG_CANFAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
hdr->b_l2hdr.b_dev = dev;
|
|
||||||
hdr->b_l2hdr.b_hits = 0;
|
|
||||||
|
|
||||||
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
|
|
||||||
hdr->b_l2hdr.b_arcs_state =
|
|
||||||
hdr->b_l1hdr.b_state->arcs_state;
|
|
||||||
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
|
|
||||||
|
|
||||||
mutex_enter(&dev->l2ad_mtx);
|
|
||||||
list_insert_head(&dev->l2ad_buflist, hdr);
|
|
||||||
mutex_exit(&dev->l2ad_mtx);
|
|
||||||
|
|
||||||
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
|
|
||||||
arc_hdr_size(hdr), hdr);
|
|
||||||
|
|
||||||
wzio = zio_write_phys(pio, dev->l2ad_vdev,
|
wzio = zio_write_phys(pio, dev->l2ad_vdev,
|
||||||
hdr->b_l2hdr.b_daddr, asize, to_write,
|
dev->l2ad_hand, asize, to_write,
|
||||||
ZIO_CHECKSUM_OFF, NULL, hdr,
|
ZIO_CHECKSUM_OFF, NULL, hdr,
|
||||||
ZIO_PRIORITY_ASYNC_WRITE,
|
ZIO_PRIORITY_ASYNC_WRITE,
|
||||||
ZIO_FLAG_CANFAIL, B_FALSE);
|
ZIO_FLAG_CANFAIL, B_FALSE);
|
||||||
|
|
||||||
write_lsize += HDR_GET_LSIZE(hdr);
|
|
||||||
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
|
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
|
||||||
zio_t *, wzio);
|
zio_t *, wzio);
|
||||||
|
zio_nowait(wzio);
|
||||||
|
|
||||||
write_psize += psize;
|
write_psize += psize;
|
||||||
write_asize += asize;
|
write_asize += asize;
|
||||||
dev->l2ad_hand += asize;
|
dev->l2ad_hand += asize;
|
||||||
l2arc_hdr_arcstats_increment(hdr);
|
|
||||||
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
|
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
|
||||||
|
|
||||||
mutex_exit(hash_lock);
|
if (commit) {
|
||||||
|
/* l2ad_hand will be adjusted inside. */
|
||||||
/*
|
|
||||||
* Append buf info to current log and commit if full.
|
|
||||||
* arcstat_l2_{size,asize} kstats are updated
|
|
||||||
* internally.
|
|
||||||
*/
|
|
||||||
if (l2arc_log_blk_insert(dev, hdr)) {
|
|
||||||
/*
|
|
||||||
* l2ad_hand will be adjusted in
|
|
||||||
* l2arc_log_blk_commit().
|
|
||||||
*/
|
|
||||||
write_asize +=
|
write_asize +=
|
||||||
l2arc_log_blk_commit(dev, pio, cb);
|
l2arc_log_blk_commit(dev, pio, cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
zio_nowait(wzio);
|
next:
|
||||||
|
multilist_sublist_lock(mls);
|
||||||
|
if (from_head)
|
||||||
|
hdr = multilist_sublist_next(mls, marker);
|
||||||
|
else
|
||||||
|
hdr = multilist_sublist_prev(mls, marker);
|
||||||
|
multilist_sublist_remove(mls, marker);
|
||||||
}
|
}
|
||||||
|
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
|
@ -9279,9 +9284,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
arc_state_free_marker(marker);
|
||||||
|
|
||||||
/* No buffers selected for writing? */
|
/* No buffers selected for writing? */
|
||||||
if (pio == NULL) {
|
if (pio == NULL) {
|
||||||
ASSERT0(write_lsize);
|
ASSERT0(write_psize);
|
||||||
ASSERT(!HDR_HAS_L1HDR(head));
|
ASSERT(!HDR_HAS_L1HDR(head));
|
||||||
kmem_cache_free(hdr_l2only_cache, head);
|
kmem_cache_free(hdr_l2only_cache, head);
|
||||||
|
|
||||||
|
@ -10609,7 +10616,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
|
||||||
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
|
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
|
||||||
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
|
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
|
||||||
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
|
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
|
||||||
L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
|
L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
|
||||||
|
|
||||||
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
|
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
|
||||||
HDR_GET_PSIZE(hdr));
|
HDR_GET_PSIZE(hdr));
|
||||||
|
|
|
@ -754,7 +754,7 @@ static void
|
||||||
dbuf_evict_one(void)
|
dbuf_evict_one(void)
|
||||||
{
|
{
|
||||||
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
|
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
|
||||||
multilist_sublist_t *mls = multilist_sublist_lock(
|
multilist_sublist_t *mls = multilist_sublist_lock_idx(
|
||||||
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
|
&dbuf_caches[DB_DBUF_CACHE].cache, idx);
|
||||||
|
|
||||||
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
|
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
|
||||||
|
@ -1542,17 +1542,14 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
|
||||||
* returning.
|
* returning.
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
|
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||||
db_lock_type_t dblt, const void *tag)
|
db_lock_type_t dblt, const void *tag)
|
||||||
{
|
{
|
||||||
dnode_t *dn;
|
|
||||||
zbookmark_phys_t zb;
|
zbookmark_phys_t zb;
|
||||||
uint32_t aflags = ARC_FLAG_NOWAIT;
|
uint32_t aflags = ARC_FLAG_NOWAIT;
|
||||||
int err, zio_flags;
|
int err, zio_flags;
|
||||||
blkptr_t bp, *bpp;
|
blkptr_t bp, *bpp;
|
||||||
|
|
||||||
DB_DNODE_ENTER(db);
|
|
||||||
dn = DB_DNODE(db);
|
|
||||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||||
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
|
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
|
||||||
|
@ -1627,8 +1624,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
|
||||||
if (err != 0)
|
if (err != 0)
|
||||||
goto early_unlock;
|
goto early_unlock;
|
||||||
|
|
||||||
DB_DNODE_EXIT(db);
|
|
||||||
|
|
||||||
db->db_state = DB_READ;
|
db->db_state = DB_READ;
|
||||||
DTRACE_SET_STATE(db, "read issued");
|
DTRACE_SET_STATE(db, "read issued");
|
||||||
mutex_exit(&db->db_mtx);
|
mutex_exit(&db->db_mtx);
|
||||||
|
@ -1653,12 +1648,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
|
||||||
* parent's rwlock, which would be a lock ordering violation.
|
* parent's rwlock, which would be a lock ordering violation.
|
||||||
*/
|
*/
|
||||||
dmu_buf_unlock_parent(db, dblt, tag);
|
dmu_buf_unlock_parent(db, dblt, tag);
|
||||||
(void) arc_read(zio, db->db_objset->os_spa, bpp,
|
return (arc_read(zio, db->db_objset->os_spa, bpp,
|
||||||
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
|
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
|
||||||
&aflags, &zb);
|
&aflags, &zb));
|
||||||
return (err);
|
|
||||||
early_unlock:
|
early_unlock:
|
||||||
DB_DNODE_EXIT(db);
|
|
||||||
mutex_exit(&db->db_mtx);
|
mutex_exit(&db->db_mtx);
|
||||||
dmu_buf_unlock_parent(db, dblt, tag);
|
dmu_buf_unlock_parent(db, dblt, tag);
|
||||||
return (err);
|
return (err);
|
||||||
|
@ -1743,7 +1737,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||||
{
|
{
|
||||||
int err = 0;
|
int err = 0;
|
||||||
boolean_t prefetch;
|
boolean_t prefetch;
|
||||||
|
@ -1759,7 +1753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||||
dn = DB_DNODE(db);
|
dn = DB_DNODE(db);
|
||||||
|
|
||||||
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
||||||
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
|
(flags & DB_RF_NOPREFETCH) == 0;
|
||||||
|
|
||||||
mutex_enter(&db->db_mtx);
|
mutex_enter(&db->db_mtx);
|
||||||
if (flags & DB_RF_PARTIAL_FIRST)
|
if (flags & DB_RF_PARTIAL_FIRST)
|
||||||
|
@ -1806,13 +1800,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||||
|
|
||||||
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
||||||
|
|
||||||
if (zio == NULL && (db->db_state == DB_NOFILL ||
|
if (pio == NULL && (db->db_state == DB_NOFILL ||
|
||||||
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
|
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
|
||||||
spa_t *spa = dn->dn_objset->os_spa;
|
spa_t *spa = dn->dn_objset->os_spa;
|
||||||
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||||
need_wait = B_TRUE;
|
need_wait = B_TRUE;
|
||||||
}
|
}
|
||||||
err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
|
err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
|
||||||
/*
|
/*
|
||||||
* dbuf_read_impl has dropped db_mtx and our parent's rwlock
|
* dbuf_read_impl has dropped db_mtx and our parent's rwlock
|
||||||
* for us
|
* for us
|
||||||
|
@ -1833,9 +1827,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||||
*/
|
*/
|
||||||
if (need_wait) {
|
if (need_wait) {
|
||||||
if (err == 0)
|
if (err == 0)
|
||||||
err = zio_wait(zio);
|
err = zio_wait(pio);
|
||||||
else
|
else
|
||||||
VERIFY0(zio_wait(zio));
|
(void) zio_wait(pio);
|
||||||
|
pio = NULL;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
|
@ -1862,7 +1857,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||||
ASSERT(db->db_state == DB_READ ||
|
ASSERT(db->db_state == DB_READ ||
|
||||||
(flags & DB_RF_HAVESTRUCT) == 0);
|
(flags & DB_RF_HAVESTRUCT) == 0);
|
||||||
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
|
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
|
||||||
db, zio_t *, zio);
|
db, zio_t *, pio);
|
||||||
cv_wait(&db->db_changed, &db->db_mtx);
|
cv_wait(&db->db_changed, &db->db_mtx);
|
||||||
}
|
}
|
||||||
if (db->db_state == DB_UNCACHED)
|
if (db->db_state == DB_UNCACHED)
|
||||||
|
@ -1871,6 +1866,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (pio && err != 0) {
|
||||||
|
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
|
||||||
|
ZIO_FLAG_CANFAIL);
|
||||||
|
zio->io_error = err;
|
||||||
|
zio_nowait(zio);
|
||||||
|
}
|
||||||
|
|
||||||
return (err);
|
return (err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -569,8 +569,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||||
for (i = 0; i < nblks; i++) {
|
for (i = 0; i < nblks; i++) {
|
||||||
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
|
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
|
||||||
if (db == NULL) {
|
if (db == NULL) {
|
||||||
if (zs)
|
if (zs) {
|
||||||
dmu_zfetch_run(zs, missed, B_TRUE);
|
dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
|
||||||
|
B_TRUE);
|
||||||
|
}
|
||||||
rw_exit(&dn->dn_struct_rwlock);
|
rw_exit(&dn->dn_struct_rwlock);
|
||||||
dmu_buf_rele_array(dbp, nblks, tag);
|
dmu_buf_rele_array(dbp, nblks, tag);
|
||||||
if (read)
|
if (read)
|
||||||
|
@ -606,7 +608,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||||
zfs_racct_write(length, nblks);
|
zfs_racct_write(length, nblks);
|
||||||
|
|
||||||
if (zs)
|
if (zs)
|
||||||
dmu_zfetch_run(zs, missed, B_TRUE);
|
dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
|
||||||
rw_exit(&dn->dn_struct_rwlock);
|
rw_exit(&dn->dn_struct_rwlock);
|
||||||
|
|
||||||
if (read) {
|
if (read) {
|
||||||
|
|
|
@ -1633,7 +1633,7 @@ sync_dnodes_task(void *arg)
|
||||||
sync_dnodes_arg_t *sda = arg;
|
sync_dnodes_arg_t *sda = arg;
|
||||||
|
|
||||||
multilist_sublist_t *ms =
|
multilist_sublist_t *ms =
|
||||||
multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
|
multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
|
||||||
|
|
||||||
dmu_objset_sync_dnodes(ms, sda->sda_tx);
|
dmu_objset_sync_dnodes(ms, sda->sda_tx);
|
||||||
|
|
||||||
|
@ -1987,8 +1987,8 @@ userquota_updates_task(void *arg)
|
||||||
dnode_t *dn;
|
dnode_t *dn;
|
||||||
userquota_cache_t cache = { { 0 } };
|
userquota_cache_t cache = { { 0 } };
|
||||||
|
|
||||||
multilist_sublist_t *list =
|
multilist_sublist_t *list = multilist_sublist_lock_idx(
|
||||||
multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
|
&os->os_synced_dnodes, uua->uua_sublist_idx);
|
||||||
|
|
||||||
ASSERT(multilist_sublist_head(list) == NULL ||
|
ASSERT(multilist_sublist_head(list) == NULL ||
|
||||||
dmu_objset_userused_enabled(os));
|
dmu_objset_userused_enabled(os));
|
||||||
|
@ -2070,8 +2070,8 @@ dnode_rele_task(void *arg)
|
||||||
userquota_updates_arg_t *uua = arg;
|
userquota_updates_arg_t *uua = arg;
|
||||||
objset_t *os = uua->uua_os;
|
objset_t *os = uua->uua_os;
|
||||||
|
|
||||||
multilist_sublist_t *list =
|
multilist_sublist_t *list = multilist_sublist_lock_idx(
|
||||||
multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
|
&os->os_synced_dnodes, uua->uua_sublist_idx);
|
||||||
|
|
||||||
dnode_t *dn;
|
dnode_t *dn;
|
||||||
while ((dn = multilist_sublist_head(list)) != NULL) {
|
while ((dn = multilist_sublist_head(list)) != NULL) {
|
||||||
|
|
|
@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
|
||||||
dmu_buf_rele(db, FTAG);
|
dmu_buf_rele(db, FTAG);
|
||||||
dnode_rele(dn, FTAG);
|
dnode_rele(dn, FTAG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the receive fails, we want the resume stream to start with the
|
||||||
|
* same record that we last successfully received. There is no way to
|
||||||
|
* request resume from the object record, but we can benefit from the
|
||||||
|
* fact that sender always sends object record before anything else,
|
||||||
|
* after which it will "resend" data at offset 0 and resume normally.
|
||||||
|
*/
|
||||||
|
save_resume_state(rwa, drro->drr_object, 0, tx);
|
||||||
|
|
||||||
dmu_tx_commit(tx);
|
dmu_tx_commit(tx);
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
|
|
|
@ -65,9 +65,16 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024;
|
||||||
#endif
|
#endif
|
||||||
/* max bytes to prefetch indirects for per stream (default 64MB) */
|
/* max bytes to prefetch indirects for per stream (default 64MB) */
|
||||||
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
|
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
|
||||||
|
/* max request reorder distance within a stream (default 16MB) */
|
||||||
|
unsigned int zfetch_max_reorder = 16 * 1024 * 1024;
|
||||||
|
/* Max log2 fraction of holes in a stream */
|
||||||
|
unsigned int zfetch_hole_shift = 2;
|
||||||
|
|
||||||
typedef struct zfetch_stats {
|
typedef struct zfetch_stats {
|
||||||
kstat_named_t zfetchstat_hits;
|
kstat_named_t zfetchstat_hits;
|
||||||
|
kstat_named_t zfetchstat_future;
|
||||||
|
kstat_named_t zfetchstat_stride;
|
||||||
|
kstat_named_t zfetchstat_past;
|
||||||
kstat_named_t zfetchstat_misses;
|
kstat_named_t zfetchstat_misses;
|
||||||
kstat_named_t zfetchstat_max_streams;
|
kstat_named_t zfetchstat_max_streams;
|
||||||
kstat_named_t zfetchstat_io_issued;
|
kstat_named_t zfetchstat_io_issued;
|
||||||
|
@ -76,6 +83,9 @@ typedef struct zfetch_stats {
|
||||||
|
|
||||||
static zfetch_stats_t zfetch_stats = {
|
static zfetch_stats_t zfetch_stats = {
|
||||||
{ "hits", KSTAT_DATA_UINT64 },
|
{ "hits", KSTAT_DATA_UINT64 },
|
||||||
|
{ "future", KSTAT_DATA_UINT64 },
|
||||||
|
{ "stride", KSTAT_DATA_UINT64 },
|
||||||
|
{ "past", KSTAT_DATA_UINT64 },
|
||||||
{ "misses", KSTAT_DATA_UINT64 },
|
{ "misses", KSTAT_DATA_UINT64 },
|
||||||
{ "max_streams", KSTAT_DATA_UINT64 },
|
{ "max_streams", KSTAT_DATA_UINT64 },
|
||||||
{ "io_issued", KSTAT_DATA_UINT64 },
|
{ "io_issued", KSTAT_DATA_UINT64 },
|
||||||
|
@ -84,6 +94,9 @@ static zfetch_stats_t zfetch_stats = {
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
wmsum_t zfetchstat_hits;
|
wmsum_t zfetchstat_hits;
|
||||||
|
wmsum_t zfetchstat_future;
|
||||||
|
wmsum_t zfetchstat_stride;
|
||||||
|
wmsum_t zfetchstat_past;
|
||||||
wmsum_t zfetchstat_misses;
|
wmsum_t zfetchstat_misses;
|
||||||
wmsum_t zfetchstat_max_streams;
|
wmsum_t zfetchstat_max_streams;
|
||||||
wmsum_t zfetchstat_io_issued;
|
wmsum_t zfetchstat_io_issued;
|
||||||
|
@ -107,6 +120,12 @@ zfetch_kstats_update(kstat_t *ksp, int rw)
|
||||||
return (EACCES);
|
return (EACCES);
|
||||||
zs->zfetchstat_hits.value.ui64 =
|
zs->zfetchstat_hits.value.ui64 =
|
||||||
wmsum_value(&zfetch_sums.zfetchstat_hits);
|
wmsum_value(&zfetch_sums.zfetchstat_hits);
|
||||||
|
zs->zfetchstat_future.value.ui64 =
|
||||||
|
wmsum_value(&zfetch_sums.zfetchstat_future);
|
||||||
|
zs->zfetchstat_stride.value.ui64 =
|
||||||
|
wmsum_value(&zfetch_sums.zfetchstat_stride);
|
||||||
|
zs->zfetchstat_past.value.ui64 =
|
||||||
|
wmsum_value(&zfetch_sums.zfetchstat_past);
|
||||||
zs->zfetchstat_misses.value.ui64 =
|
zs->zfetchstat_misses.value.ui64 =
|
||||||
wmsum_value(&zfetch_sums.zfetchstat_misses);
|
wmsum_value(&zfetch_sums.zfetchstat_misses);
|
||||||
zs->zfetchstat_max_streams.value.ui64 =
|
zs->zfetchstat_max_streams.value.ui64 =
|
||||||
|
@ -122,6 +141,9 @@ void
|
||||||
zfetch_init(void)
|
zfetch_init(void)
|
||||||
{
|
{
|
||||||
wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
|
wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
|
||||||
|
wmsum_init(&zfetch_sums.zfetchstat_future, 0);
|
||||||
|
wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
|
||||||
|
wmsum_init(&zfetch_sums.zfetchstat_past, 0);
|
||||||
wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
|
wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
|
||||||
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
|
wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
|
||||||
wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
|
wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
|
||||||
|
@ -147,6 +169,9 @@ zfetch_fini(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
wmsum_fini(&zfetch_sums.zfetchstat_hits);
|
wmsum_fini(&zfetch_sums.zfetchstat_hits);
|
||||||
|
wmsum_fini(&zfetch_sums.zfetchstat_future);
|
||||||
|
wmsum_fini(&zfetch_sums.zfetchstat_stride);
|
||||||
|
wmsum_fini(&zfetch_sums.zfetchstat_past);
|
||||||
wmsum_fini(&zfetch_sums.zfetchstat_misses);
|
wmsum_fini(&zfetch_sums.zfetchstat_misses);
|
||||||
wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
|
wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
|
||||||
wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
|
wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
|
||||||
|
@ -222,22 +247,22 @@ static void
|
||||||
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
||||||
{
|
{
|
||||||
zstream_t *zs, *zs_next, *zs_old = NULL;
|
zstream_t *zs, *zs_next, *zs_old = NULL;
|
||||||
hrtime_t now = gethrtime(), t;
|
uint_t now = gethrestime_sec(), t;
|
||||||
|
|
||||||
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
ASSERT(MUTEX_HELD(&zf->zf_lock));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Delete too old streams, reusing the first found one.
|
* Delete too old streams, reusing the first found one.
|
||||||
*/
|
*/
|
||||||
t = now - SEC2NSEC(zfetch_max_sec_reap);
|
t = now - zfetch_max_sec_reap;
|
||||||
for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
|
for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
|
||||||
zs_next = list_next(&zf->zf_stream, zs);
|
zs_next = list_next(&zf->zf_stream, zs);
|
||||||
/*
|
/*
|
||||||
* Skip if still active. 1 -- zf_stream reference.
|
* Skip if still active. 1 -- zf_stream reference.
|
||||||
*/
|
*/
|
||||||
if (zfs_refcount_count(&zs->zs_refs) != 1)
|
if ((int)(zs->zs_atime - t) >= 0)
|
||||||
continue;
|
continue;
|
||||||
if (zs->zs_atime > t)
|
if (zfs_refcount_count(&zs->zs_refs) != 1)
|
||||||
continue;
|
continue;
|
||||||
if (zs_old)
|
if (zs_old)
|
||||||
dmu_zfetch_stream_remove(zf, zs);
|
dmu_zfetch_stream_remove(zf, zs);
|
||||||
|
@ -246,6 +271,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
||||||
}
|
}
|
||||||
if (zs_old) {
|
if (zs_old) {
|
||||||
zs = zs_old;
|
zs = zs_old;
|
||||||
|
list_remove(&zf->zf_stream, zs);
|
||||||
goto reuse;
|
goto reuse;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -255,21 +281,23 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
||||||
* for all the streams to be non-overlapping.
|
* for all the streams to be non-overlapping.
|
||||||
*/
|
*/
|
||||||
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
|
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
|
||||||
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
|
(zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
|
||||||
zfetch_max_distance));
|
zfetch_max_distance));
|
||||||
if (zf->zf_numstreams >= max_streams) {
|
if (zf->zf_numstreams >= max_streams) {
|
||||||
t = now - SEC2NSEC(zfetch_min_sec_reap);
|
t = now - zfetch_min_sec_reap;
|
||||||
for (zs = list_head(&zf->zf_stream); zs != NULL;
|
for (zs = list_head(&zf->zf_stream); zs != NULL;
|
||||||
zs = list_next(&zf->zf_stream, zs)) {
|
zs = list_next(&zf->zf_stream, zs)) {
|
||||||
|
if ((int)(zs->zs_atime - t) >= 0)
|
||||||
|
continue;
|
||||||
if (zfs_refcount_count(&zs->zs_refs) != 1)
|
if (zfs_refcount_count(&zs->zs_refs) != 1)
|
||||||
continue;
|
continue;
|
||||||
if (zs->zs_atime > t)
|
if (zs_old == NULL ||
|
||||||
continue;
|
(int)(zs_old->zs_atime - zs->zs_atime) >= 0)
|
||||||
if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
|
|
||||||
zs_old = zs;
|
zs_old = zs;
|
||||||
}
|
}
|
||||||
if (zs_old) {
|
if (zs_old) {
|
||||||
zs = zs_old;
|
zs = zs_old;
|
||||||
|
list_remove(&zf->zf_stream, zs);
|
||||||
goto reuse;
|
goto reuse;
|
||||||
}
|
}
|
||||||
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
|
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
|
||||||
|
@ -277,24 +305,24 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
|
||||||
}
|
}
|
||||||
|
|
||||||
zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
|
zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
|
||||||
zs->zs_fetch = zf;
|
|
||||||
zfs_refcount_create(&zs->zs_callers);
|
zfs_refcount_create(&zs->zs_callers);
|
||||||
zfs_refcount_create(&zs->zs_refs);
|
zfs_refcount_create(&zs->zs_refs);
|
||||||
/* One reference for zf_stream. */
|
/* One reference for zf_stream. */
|
||||||
zfs_refcount_add(&zs->zs_refs, NULL);
|
zfs_refcount_add(&zs->zs_refs, NULL);
|
||||||
zf->zf_numstreams++;
|
zf->zf_numstreams++;
|
||||||
list_insert_head(&zf->zf_stream, zs);
|
|
||||||
|
|
||||||
reuse:
|
reuse:
|
||||||
|
list_insert_head(&zf->zf_stream, zs);
|
||||||
zs->zs_blkid = blkid;
|
zs->zs_blkid = blkid;
|
||||||
|
/* Allow immediate stream reuse until first hit. */
|
||||||
|
zs->zs_atime = now - zfetch_min_sec_reap;
|
||||||
|
memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
|
||||||
zs->zs_pf_dist = 0;
|
zs->zs_pf_dist = 0;
|
||||||
|
zs->zs_ipf_dist = 0;
|
||||||
zs->zs_pf_start = blkid;
|
zs->zs_pf_start = blkid;
|
||||||
zs->zs_pf_end = blkid;
|
zs->zs_pf_end = blkid;
|
||||||
zs->zs_ipf_dist = 0;
|
|
||||||
zs->zs_ipf_start = blkid;
|
zs->zs_ipf_start = blkid;
|
||||||
zs->zs_ipf_end = blkid;
|
zs->zs_ipf_end = blkid;
|
||||||
/* Allow immediate stream reuse until first hit. */
|
|
||||||
zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
|
|
||||||
zs->zs_missed = B_FALSE;
|
zs->zs_missed = B_FALSE;
|
||||||
zs->zs_more = B_FALSE;
|
zs->zs_more = B_FALSE;
|
||||||
}
|
}
|
||||||
|
@ -311,6 +339,120 @@ dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
|
||||||
aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
|
aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process stream hit access for nblks blocks starting at zs_blkid. Return
|
||||||
|
* number of blocks to proceed for after aggregation with future ranges.
|
||||||
|
*/
|
||||||
|
static uint64_t
|
||||||
|
dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
|
||||||
|
{
|
||||||
|
uint_t i, j;
|
||||||
|
|
||||||
|
/* Optimize sequential accesses (no future ranges). */
|
||||||
|
if (zs->zs_ranges[0].start == 0)
|
||||||
|
goto done;
|
||||||
|
|
||||||
|
/* Look for intersections with further ranges. */
|
||||||
|
for (i = 0; i < ZFETCH_RANGES; i++) {
|
||||||
|
zsrange_t *r = &zs->zs_ranges[i];
|
||||||
|
if (r->start == 0 || r->start > nblks)
|
||||||
|
break;
|
||||||
|
if (r->end >= nblks) {
|
||||||
|
nblks = r->end;
|
||||||
|
i++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Delete all found intersecting ranges, updates remaining. */
|
||||||
|
for (j = 0; i < ZFETCH_RANGES; i++, j++) {
|
||||||
|
if (zs->zs_ranges[i].start == 0)
|
||||||
|
break;
|
||||||
|
ASSERT3U(zs->zs_ranges[i].start, >, nblks);
|
||||||
|
ASSERT3U(zs->zs_ranges[i].end, >, nblks);
|
||||||
|
zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
|
||||||
|
zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
|
||||||
|
}
|
||||||
|
if (j < ZFETCH_RANGES) {
|
||||||
|
zs->zs_ranges[j].start = 0;
|
||||||
|
zs->zs_ranges[j].end = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
done:
|
||||||
|
zs->zs_blkid += nblks;
|
||||||
|
return (nblks);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Process future stream access for nblks blocks starting at blkid. Return
|
||||||
|
* number of blocks to proceed for if future ranges reach fill threshold.
|
||||||
|
*/
|
||||||
|
static uint64_t
|
||||||
|
dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
|
||||||
|
{
|
||||||
|
ASSERT3U(blkid, >, zs->zs_blkid);
|
||||||
|
blkid -= zs->zs_blkid;
|
||||||
|
ASSERT3U(blkid + nblks, <=, UINT16_MAX);
|
||||||
|
|
||||||
|
/* Search for first and last intersection or insert point. */
|
||||||
|
uint_t f = ZFETCH_RANGES, l = 0, i;
|
||||||
|
for (i = 0; i < ZFETCH_RANGES; i++) {
|
||||||
|
zsrange_t *r = &zs->zs_ranges[i];
|
||||||
|
if (r->start == 0 || r->start > blkid + nblks)
|
||||||
|
break;
|
||||||
|
if (r->end < blkid)
|
||||||
|
continue;
|
||||||
|
if (f > i)
|
||||||
|
f = i;
|
||||||
|
if (l < i)
|
||||||
|
l = i;
|
||||||
|
}
|
||||||
|
if (f <= l) {
|
||||||
|
/* Got some intersecting range, expand it if needed. */
|
||||||
|
if (zs->zs_ranges[f].start > blkid)
|
||||||
|
zs->zs_ranges[f].start = blkid;
|
||||||
|
zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
|
||||||
|
if (f < l) {
|
||||||
|
/* Got more than one intersection, remove others. */
|
||||||
|
for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
|
||||||
|
zs->zs_ranges[f].start = zs->zs_ranges[l].start;
|
||||||
|
zs->zs_ranges[f].end = zs->zs_ranges[l].end;
|
||||||
|
}
|
||||||
|
zs->zs_ranges[ZFETCH_RANGES - 1].start = 0;
|
||||||
|
zs->zs_ranges[ZFETCH_RANGES - 1].end = 0;
|
||||||
|
}
|
||||||
|
} else if (i < ZFETCH_RANGES) {
|
||||||
|
/* Got no intersecting ranges, insert new one. */
|
||||||
|
for (l = ZFETCH_RANGES - 1; l > i; l--) {
|
||||||
|
zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
|
||||||
|
zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
|
||||||
|
}
|
||||||
|
zs->zs_ranges[i].start = blkid;
|
||||||
|
zs->zs_ranges[i].end = blkid + nblks;
|
||||||
|
} else {
|
||||||
|
/* No space left to insert. Drop the range. */
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check if with the new access addition we reached fill threshold. */
|
||||||
|
if (zfetch_hole_shift >= 16)
|
||||||
|
return (0);
|
||||||
|
uint_t hole = 0;
|
||||||
|
for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
|
||||||
|
zsrange_t *r = &zs->zs_ranges[i];
|
||||||
|
if (r->start == 0)
|
||||||
|
break;
|
||||||
|
hole += r->start - f;
|
||||||
|
f = r->end;
|
||||||
|
if (hole <= r->end >> zfetch_hole_shift)
|
||||||
|
l = r->end;
|
||||||
|
}
|
||||||
|
if (l > 0)
|
||||||
|
return (dmu_zfetch_hit(zs, l));
|
||||||
|
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is the predictive prefetch entry point. dmu_zfetch_prepare()
|
* This is the predictive prefetch entry point. dmu_zfetch_prepare()
|
||||||
* associates dnode access specified with blkid and nblks arguments with
|
* associates dnode access specified with blkid and nblks arguments with
|
||||||
|
@ -365,53 +507,92 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
|
||||||
mutex_enter(&zf->zf_lock);
|
mutex_enter(&zf->zf_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find matching prefetch stream. Depending on whether the accesses
|
* Find perfect prefetch stream. Depending on whether the accesses
|
||||||
* are block-aligned, first block of the new access may either follow
|
* are block-aligned, first block of the new access may either follow
|
||||||
* the last block of the previous access, or be equal to it.
|
* the last block of the previous access, or be equal to it.
|
||||||
*/
|
*/
|
||||||
|
unsigned int dbs = zf->zf_dnode->dn_datablkshift;
|
||||||
|
uint64_t end_blkid = blkid + nblks;
|
||||||
for (zs = list_head(&zf->zf_stream); zs != NULL;
|
for (zs = list_head(&zf->zf_stream); zs != NULL;
|
||||||
zs = list_next(&zf->zf_stream, zs)) {
|
zs = list_next(&zf->zf_stream, zs)) {
|
||||||
if (blkid == zs->zs_blkid) {
|
if (blkid == zs->zs_blkid) {
|
||||||
break;
|
goto hit;
|
||||||
} else if (blkid + 1 == zs->zs_blkid) {
|
} else if (blkid + 1 == zs->zs_blkid) {
|
||||||
blkid++;
|
blkid++;
|
||||||
nblks--;
|
nblks--;
|
||||||
break;
|
goto hit;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the file is ending, remove the matching stream if found.
|
* Find close enough prefetch stream. Access crossing stream position
|
||||||
* If not found then it is too late to create a new one now.
|
* is a hit in its new part. Access ahead of stream position considered
|
||||||
|
* a hit for metadata prefetch, since we do not care about fill percent,
|
||||||
|
* or stored for future otherwise. Access behind stream position is
|
||||||
|
* silently ignored, since we already skipped it reaching fill percent.
|
||||||
*/
|
*/
|
||||||
uint64_t end_of_access_blkid = blkid + nblks;
|
uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
|
||||||
if (end_of_access_blkid >= maxblkid) {
|
uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
|
||||||
if (zs != NULL)
|
for (zs = list_head(&zf->zf_stream); zs != NULL;
|
||||||
dmu_zfetch_stream_remove(zf, zs);
|
zs = list_next(&zf->zf_stream, zs)) {
|
||||||
mutex_exit(&zf->zf_lock);
|
if (blkid > zs->zs_blkid) {
|
||||||
if (!have_lock)
|
if (end_blkid <= zs->zs_blkid + max_reorder) {
|
||||||
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
if (!fetch_data) {
|
||||||
return (NULL);
|
nblks = dmu_zfetch_hit(zs,
|
||||||
|
end_blkid - zs->zs_blkid);
|
||||||
|
ZFETCHSTAT_BUMP(zfetchstat_stride);
|
||||||
|
goto future;
|
||||||
|
}
|
||||||
|
nblks = dmu_zfetch_future(zs, blkid, nblks);
|
||||||
|
if (nblks > 0)
|
||||||
|
ZFETCHSTAT_BUMP(zfetchstat_stride);
|
||||||
|
else
|
||||||
|
ZFETCHSTAT_BUMP(zfetchstat_future);
|
||||||
|
goto future;
|
||||||
|
}
|
||||||
|
} else if (end_blkid >= zs->zs_blkid) {
|
||||||
|
nblks -= zs->zs_blkid - blkid;
|
||||||
|
blkid += zs->zs_blkid - blkid;
|
||||||
|
goto hit;
|
||||||
|
} else if (end_blkid + max_reorder > zs->zs_blkid &&
|
||||||
|
(int)(zs->zs_atime - t) >= 0) {
|
||||||
|
ZFETCHSTAT_BUMP(zfetchstat_past);
|
||||||
|
zs->zs_atime = gethrestime_sec();
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Exit if we already prefetched this block before. */
|
/*
|
||||||
if (nblks == 0) {
|
* This access is not part of any existing stream. Create a new
|
||||||
mutex_exit(&zf->zf_lock);
|
* stream for it unless we are at the end of file.
|
||||||
if (!have_lock)
|
*/
|
||||||
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
if (end_blkid < maxblkid)
|
||||||
return (NULL);
|
dmu_zfetch_stream_create(zf, end_blkid);
|
||||||
}
|
mutex_exit(&zf->zf_lock);
|
||||||
|
if (!have_lock)
|
||||||
|
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
||||||
|
ZFETCHSTAT_BUMP(zfetchstat_misses);
|
||||||
|
return (NULL);
|
||||||
|
|
||||||
if (zs == NULL) {
|
hit:
|
||||||
/*
|
nblks = dmu_zfetch_hit(zs, nblks);
|
||||||
* This access is not part of any existing stream. Create
|
ZFETCHSTAT_BUMP(zfetchstat_hits);
|
||||||
* a new stream for it.
|
|
||||||
*/
|
future:
|
||||||
dmu_zfetch_stream_create(zf, end_of_access_blkid);
|
zs->zs_atime = gethrestime_sec();
|
||||||
|
|
||||||
|
/* Exit if we already prefetched for this position before. */
|
||||||
|
if (nblks == 0)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
/* If the file is ending, remove the stream. */
|
||||||
|
end_blkid = zs->zs_blkid;
|
||||||
|
if (end_blkid >= maxblkid) {
|
||||||
|
dmu_zfetch_stream_remove(zf, zs);
|
||||||
|
out:
|
||||||
mutex_exit(&zf->zf_lock);
|
mutex_exit(&zf->zf_lock);
|
||||||
if (!have_lock)
|
if (!have_lock)
|
||||||
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
||||||
ZFETCHSTAT_BUMP(zfetchstat_misses);
|
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -427,7 +608,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
|
||||||
* than ~6% of ARC held by active prefetches. It should help with
|
* than ~6% of ARC held by active prefetches. It should help with
|
||||||
* getting out of RAM on some badly mispredicted read patterns.
|
* getting out of RAM on some badly mispredicted read patterns.
|
||||||
*/
|
*/
|
||||||
unsigned int dbs = zf->zf_dnode->dn_datablkshift;
|
|
||||||
unsigned int nbytes = nblks << dbs;
|
unsigned int nbytes = nblks << dbs;
|
||||||
unsigned int pf_nblks;
|
unsigned int pf_nblks;
|
||||||
if (fetch_data) {
|
if (fetch_data) {
|
||||||
|
@ -447,10 +627,10 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
|
||||||
} else {
|
} else {
|
||||||
pf_nblks = 0;
|
pf_nblks = 0;
|
||||||
}
|
}
|
||||||
if (zs->zs_pf_start < end_of_access_blkid)
|
if (zs->zs_pf_start < end_blkid)
|
||||||
zs->zs_pf_start = end_of_access_blkid;
|
zs->zs_pf_start = end_blkid;
|
||||||
if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
|
if (zs->zs_pf_end < end_blkid + pf_nblks)
|
||||||
zs->zs_pf_end = end_of_access_blkid + pf_nblks;
|
zs->zs_pf_end = end_blkid + pf_nblks;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Do the same for indirects, starting where we will stop reading
|
* Do the same for indirects, starting where we will stop reading
|
||||||
|
@ -468,9 +648,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
|
||||||
if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
|
if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
|
||||||
zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
|
zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;
|
||||||
|
|
||||||
zs->zs_blkid = end_of_access_blkid;
|
|
||||||
/* Protect the stream from reclamation. */
|
|
||||||
zs->zs_atime = gethrtime();
|
|
||||||
zfs_refcount_add(&zs->zs_refs, NULL);
|
zfs_refcount_add(&zs->zs_refs, NULL);
|
||||||
/* Count concurrent callers. */
|
/* Count concurrent callers. */
|
||||||
zfs_refcount_add(&zs->zs_callers, NULL);
|
zfs_refcount_add(&zs->zs_callers, NULL);
|
||||||
|
@ -478,15 +655,13 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
|
||||||
|
|
||||||
if (!have_lock)
|
if (!have_lock)
|
||||||
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
|
||||||
|
|
||||||
ZFETCHSTAT_BUMP(zfetchstat_hits);
|
|
||||||
return (zs);
|
return (zs);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
|
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
|
||||||
|
boolean_t have_lock)
|
||||||
{
|
{
|
||||||
zfetch_t *zf = zs->zs_fetch;
|
|
||||||
int64_t pf_start, pf_end, ipf_start, ipf_end;
|
int64_t pf_start, pf_end, ipf_start, ipf_end;
|
||||||
int epbs, issued;
|
int epbs, issued;
|
||||||
|
|
||||||
|
@ -562,7 +737,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
|
||||||
|
|
||||||
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
|
zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
|
||||||
if (zs)
|
if (zs)
|
||||||
dmu_zfetch_run(zs, missed, have_lock);
|
dmu_zfetch_run(zf, zs, missed, have_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
|
||||||
|
@ -585,3 +760,9 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
|
||||||
"Max bytes to prefetch indirects for per stream");
|
"Max bytes to prefetch indirects for per stream");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
|
||||||
|
"Max request reorder distance within a stream");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
|
||||||
|
"Max log2 fraction of holes in a stream");
|
||||||
|
|
|
@ -641,7 +641,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
|
||||||
{
|
{
|
||||||
multilist_t *ml = &mc->mc_metaslab_txg_list;
|
multilist_t *ml = &mc->mc_metaslab_txg_list;
|
||||||
for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
|
for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
|
||||||
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
|
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
|
||||||
metaslab_t *msp = multilist_sublist_head(mls);
|
metaslab_t *msp = multilist_sublist_head(mls);
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
while (msp != NULL) {
|
while (msp != NULL) {
|
||||||
|
@ -658,7 +658,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
|
||||||
i--;
|
i--;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
mls = multilist_sublist_lock(ml, i);
|
mls = multilist_sublist_lock_idx(ml, i);
|
||||||
metaslab_t *next_msp = multilist_sublist_next(mls, msp);
|
metaslab_t *next_msp = multilist_sublist_next(mls, msp);
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
if (txg >
|
if (txg >
|
||||||
|
@ -2190,12 +2190,12 @@ metaslab_potentially_evict(metaslab_class_t *mc)
|
||||||
unsigned int idx = multilist_get_random_index(
|
unsigned int idx = multilist_get_random_index(
|
||||||
&mc->mc_metaslab_txg_list);
|
&mc->mc_metaslab_txg_list);
|
||||||
multilist_sublist_t *mls =
|
multilist_sublist_t *mls =
|
||||||
multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
|
multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
|
||||||
metaslab_t *msp = multilist_sublist_head(mls);
|
metaslab_t *msp = multilist_sublist_head(mls);
|
||||||
multilist_sublist_unlock(mls);
|
multilist_sublist_unlock(mls);
|
||||||
while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
|
while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
|
||||||
inuse * size) {
|
inuse * size) {
|
||||||
VERIFY3P(mls, ==, multilist_sublist_lock(
|
VERIFY3P(mls, ==, multilist_sublist_lock_idx(
|
||||||
&mc->mc_metaslab_txg_list, idx));
|
&mc->mc_metaslab_txg_list, idx));
|
||||||
ASSERT3U(idx, ==,
|
ASSERT3U(idx, ==,
|
||||||
metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
|
metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
|
||||||
|
|
|
@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
|
||||||
return (random_in_range(ml->ml_num_sublists));
|
return (random_in_range(ml->ml_num_sublists));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
multilist_sublist_lock(multilist_sublist_t *mls)
|
||||||
|
{
|
||||||
|
mutex_enter(&mls->mls_lock);
|
||||||
|
}
|
||||||
|
|
||||||
/* Lock and return the sublist specified at the given index */
|
/* Lock and return the sublist specified at the given index */
|
||||||
multilist_sublist_t *
|
multilist_sublist_t *
|
||||||
multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
|
multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
|
||||||
{
|
{
|
||||||
multilist_sublist_t *mls;
|
multilist_sublist_t *mls;
|
||||||
|
|
||||||
|
@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
|
||||||
multilist_sublist_t *
|
multilist_sublist_t *
|
||||||
multilist_sublist_lock_obj(multilist_t *ml, void *obj)
|
multilist_sublist_lock_obj(multilist_t *ml, void *obj)
|
||||||
{
|
{
|
||||||
return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
|
return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
|
||||||
list_insert_tail(&mls->mls_list, obj);
|
list_insert_tail(&mls->mls_list, obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* please see comment above multilist_sublist_insert_head */
|
||||||
|
void
|
||||||
|
multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
|
||||||
|
{
|
||||||
|
ASSERT(MUTEX_HELD(&mls->mls_lock));
|
||||||
|
list_insert_after(&mls->mls_list, prev, obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* please see comment above multilist_sublist_insert_head */
|
||||||
|
void
|
||||||
|
multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
|
||||||
|
{
|
||||||
|
ASSERT(MUTEX_HELD(&mls->mls_lock));
|
||||||
|
list_insert_before(&mls->mls_list, next, obj);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Move the object one element forward in the list.
|
* Move the object one element forward in the list.
|
||||||
*
|
*
|
||||||
|
|
|
@ -1822,7 +1822,8 @@ spa_get_slop_space(spa_t *spa)
|
||||||
* deduplicated data, so since it's not useful to reserve more
|
* deduplicated data, so since it's not useful to reserve more
|
||||||
* space with more deduplicated data, we subtract that out here.
|
* space with more deduplicated data, we subtract that out here.
|
||||||
*/
|
*/
|
||||||
space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
|
space =
|
||||||
|
spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
|
||||||
slop = MIN(space >> spa_slop_shift, spa_max_slop);
|
slop = MIN(space >> spa_slop_shift, spa_max_slop);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -58,6 +58,26 @@
|
||||||
#include <sys/zfs_vfsops.h>
|
#include <sys/zfs_vfsops.h>
|
||||||
#include <sys/zfs_znode.h>
|
#include <sys/zfs_znode.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Enable the experimental block cloning feature. If this setting is 0, then
|
||||||
|
* even if feature@block_cloning is enabled, attempts to clone blocks will act
|
||||||
|
* as though the feature is disabled.
|
||||||
|
*/
|
||||||
|
int zfs_bclone_enabled = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When set zfs_clone_range() waits for dirty data to be written to disk.
|
||||||
|
* This allows the clone operation to reliably succeed when a file is modified
|
||||||
|
* and then immediately cloned. For small files this may be slower than making
|
||||||
|
* a copy of the file and is therefore not the default. However, in certain
|
||||||
|
* scenarios this behavior may be desirable so a tunable is provided.
|
||||||
|
*/
|
||||||
|
static int zfs_bclone_wait_dirty = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Maximum bytes to read per chunk in zfs_read().
|
||||||
|
*/
|
||||||
|
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
|
||||||
|
|
||||||
static ulong_t zfs_fsync_sync_cnt = 4;
|
static ulong_t zfs_fsync_sync_cnt = 4;
|
||||||
|
|
||||||
|
@ -110,7 +130,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
|
||||||
|
|
||||||
/* Flush any mmap()'d data to disk */
|
/* Flush any mmap()'d data to disk */
|
||||||
if (zn_has_cached_data(zp, 0, file_sz - 1))
|
if (zn_has_cached_data(zp, 0, file_sz - 1))
|
||||||
zn_flush_cached_data(zp, B_FALSE);
|
zn_flush_cached_data(zp, B_TRUE);
|
||||||
|
|
||||||
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
|
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
|
||||||
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
||||||
|
@ -189,8 +209,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Read bytes from specified file into supplied buffer.
|
* Read bytes from specified file into supplied buffer.
|
||||||
*
|
*
|
||||||
|
@ -1055,6 +1073,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
||||||
size_t maxblocks, nbps;
|
size_t maxblocks, nbps;
|
||||||
uint_t inblksz;
|
uint_t inblksz;
|
||||||
uint64_t clear_setid_bits_txg = 0;
|
uint64_t clear_setid_bits_txg = 0;
|
||||||
|
uint64_t last_synced_txg = 0;
|
||||||
|
|
||||||
inoff = *inoffp;
|
inoff = *inoffp;
|
||||||
outoff = *outoffp;
|
outoff = *outoffp;
|
||||||
|
@ -1174,6 +1193,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Flush any mmap()'d data to disk */
|
||||||
|
if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
|
||||||
|
zn_flush_cached_data(inzp, B_TRUE);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Maintain predictable lock order.
|
* Maintain predictable lock order.
|
||||||
*/
|
*/
|
||||||
|
@ -1293,15 +1316,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
|
||||||
}
|
}
|
||||||
|
|
||||||
nbps = maxblocks;
|
nbps = maxblocks;
|
||||||
|
last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
|
||||||
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
|
error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
|
||||||
&nbps);
|
&nbps);
|
||||||
if (error != 0) {
|
if (error != 0) {
|
||||||
/*
|
/*
|
||||||
* If we are trying to clone a block that was created
|
* If we are trying to clone a block that was created
|
||||||
* in the current transaction group, error will be
|
* in the current transaction group, the error will be
|
||||||
* EAGAIN here, which we can just return to the caller
|
* EAGAIN here. Based on zfs_bclone_wait_dirty either
|
||||||
* so it can fallback if it likes.
|
* return a shortened range to the caller so it can
|
||||||
|
* fallback, or wait for the next TXG and check again.
|
||||||
*/
|
*/
|
||||||
|
if (error == EAGAIN && zfs_bclone_wait_dirty) {
|
||||||
|
txg_wait_synced(dmu_objset_pool(inos),
|
||||||
|
last_synced_txg + 1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1523,3 +1554,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay);
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
|
||||||
"Bytes to read per chunk");
|
"Bytes to read per chunk");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
|
||||||
|
"Enable block cloning");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
|
||||||
|
"Wait for dirty blocks when cloning");
|
||||||
|
|
|
@ -535,7 +535,8 @@ tags = ['functional', 'cli_root', 'zpool_split']
|
||||||
tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
|
tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
|
||||||
'zpool_status_003_pos', 'zpool_status_004_pos',
|
'zpool_status_003_pos', 'zpool_status_004_pos',
|
||||||
'zpool_status_005_pos', 'zpool_status_006_pos',
|
'zpool_status_005_pos', 'zpool_status_006_pos',
|
||||||
'zpool_status_007_pos', 'zpool_status_features_001_pos']
|
'zpool_status_007_pos', 'zpool_status_008_pos',
|
||||||
|
'zpool_status_features_001_pos']
|
||||||
tags = ['functional', 'cli_root', 'zpool_status']
|
tags = ['functional', 'cli_root', 'zpool_status']
|
||||||
|
|
||||||
[tests/functional/cli_root/zpool_sync]
|
[tests/functional/cli_root/zpool_sync]
|
||||||
|
@ -630,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
|
||||||
tags = ['functional', 'compression']
|
tags = ['functional', 'compression']
|
||||||
|
|
||||||
[tests/functional/cp_files]
|
[tests/functional/cp_files]
|
||||||
tests = ['cp_files_001_pos', 'cp_stress']
|
tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
|
||||||
tags = ['functional', 'cp_files']
|
tags = ['functional', 'cp_files']
|
||||||
|
|
||||||
[tests/functional/crtime]
|
[tests/functional/crtime]
|
||||||
|
|
|
@ -138,7 +138,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+'
|
||||||
# copy_file_range() is not supported by all kernels
|
# copy_file_range() is not supported by all kernels
|
||||||
#
|
#
|
||||||
cfr_reason = 'Kernel copy_file_range support required'
|
cfr_reason = 'Kernel copy_file_range support required'
|
||||||
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'
|
|
||||||
|
if sys.platform.startswith('freebsd'):
|
||||||
|
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+'
|
||||||
|
else:
|
||||||
|
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'
|
||||||
|
|
||||||
#
|
#
|
||||||
# These tests are known to fail, thus we use this list to prevent these
|
# These tests are known to fail, thus we use this list to prevent these
|
||||||
|
@ -176,6 +180,7 @@ if sys.platform.startswith('freebsd'):
|
||||||
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
|
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
|
||||||
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
|
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
|
||||||
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
|
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
|
||||||
|
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
|
||||||
'link_count/link_count_001': ['SKIP', na_reason],
|
'link_count/link_count_001': ['SKIP', na_reason],
|
||||||
'casenorm/mixed_create_failure': ['FAIL', 13215],
|
'casenorm/mixed_create_failure': ['FAIL', 13215],
|
||||||
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
|
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
|
||||||
|
@ -267,6 +272,22 @@ if sys.platform.startswith('freebsd'):
|
||||||
'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
|
'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
|
||||||
'resilver/resilver_restart_001': ['FAIL', known_reason],
|
'resilver/resilver_restart_001': ['FAIL', known_reason],
|
||||||
'snapshot/snapshot_002_pos': ['FAIL', '14831'],
|
'snapshot/snapshot_002_pos': ['FAIL', '14831'],
|
||||||
|
'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_crossfs_corner_cases_limited':
|
||||||
|
['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason],
|
||||||
|
'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason],
|
||||||
|
'block_cloning/block_cloning_cross_enc_dataset':
|
||||||
|
['SKIP', cfr_cross_reason],
|
||||||
|
'block_cloning/block_cloning_copyfilerange_cross_dataset':
|
||||||
|
['SKIP', cfr_cross_reason]
|
||||||
})
|
})
|
||||||
elif sys.platform.startswith('linux'):
|
elif sys.platform.startswith('linux'):
|
||||||
maybe.update({
|
maybe.update({
|
||||||
|
@ -312,6 +333,7 @@ elif sys.platform.startswith('linux'):
|
||||||
['SKIP', cfr_reason],
|
['SKIP', cfr_reason],
|
||||||
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
|
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
|
||||||
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
|
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
|
||||||
|
'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
|
||||||
'fault/auto_online_002_pos': ['FAIL', 11889],
|
'fault/auto_online_002_pos': ['FAIL', 11889],
|
||||||
'fault/auto_replace_001_pos': ['FAIL', 14851],
|
'fault/auto_replace_001_pos': ['FAIL', 14851],
|
||||||
'fault/auto_spare_002_pos': ['FAIL', 11889],
|
'fault/auto_spare_002_pos': ['FAIL', 11889],
|
||||||
|
|
|
@ -61,13 +61,8 @@ function compare_version_gte
|
||||||
[ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ]
|
[ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Linux kernel version comparison function
|
# Helper function used by linux_version() and freebsd_version()
|
||||||
#
|
function kernel_version
|
||||||
# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version
|
|
||||||
#
|
|
||||||
# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ]
|
|
||||||
#
|
|
||||||
function linux_version
|
|
||||||
{
|
{
|
||||||
typeset ver="$1"
|
typeset ver="$1"
|
||||||
|
|
||||||
|
@ -83,6 +78,24 @@ function linux_version
|
||||||
echo $((version * 100000 + major * 1000 + minor))
|
echo $((version * 100000 + major * 1000 + minor))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Linux kernel version comparison function
|
||||||
|
#
|
||||||
|
# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version
|
||||||
|
#
|
||||||
|
# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ]
|
||||||
|
function linux_version {
|
||||||
|
kernel_version "$1"
|
||||||
|
}
|
||||||
|
|
||||||
|
# FreeBSD version comparison function
|
||||||
|
#
|
||||||
|
# $1 FreeBSD version ("13.2", "14.0") or blank for installed FreeBSD version
|
||||||
|
#
|
||||||
|
# Used for comparison: if [ $(freebsd_version) -ge $(freebsd_version "13.2") ]
|
||||||
|
function freebsd_version {
|
||||||
|
kernel_version "$1"
|
||||||
|
}
|
||||||
|
|
||||||
# Determine if this is a Linux test system
|
# Determine if this is a Linux test system
|
||||||
#
|
#
|
||||||
# Return 0 if platform Linux, 1 if otherwise
|
# Return 0 if platform Linux, 1 if otherwise
|
||||||
|
|
|
@ -90,7 +90,8 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev
|
||||||
VOL_MODE vol.mode zvol_volmode
|
VOL_MODE vol.mode zvol_volmode
|
||||||
VOL_RECURSIVE vol.recursive UNSUPPORTED
|
VOL_RECURSIVE vol.recursive UNSUPPORTED
|
||||||
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
|
VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq
|
||||||
BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled
|
BCLONE_ENABLED bclone_enabled zfs_bclone_enabled
|
||||||
|
BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty
|
||||||
XATTR_COMPAT xattr_compat zfs_xattr_compat
|
XATTR_COMPAT xattr_compat zfs_xattr_compat
|
||||||
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
|
ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max
|
||||||
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
|
ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max
|
||||||
|
|
|
@ -1238,6 +1238,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||||
functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
|
functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
|
||||||
functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
|
functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
|
||||||
functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
|
functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
|
||||||
|
functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
|
||||||
functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
|
functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
|
||||||
functional/cli_root/zpool_sync/cleanup.ksh \
|
functional/cli_root/zpool_sync/cleanup.ksh \
|
||||||
functional/cli_root/zpool_sync/setup.ksh \
|
functional/cli_root/zpool_sync/setup.ksh \
|
||||||
|
@ -1393,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||||
functional/compression/setup.ksh \
|
functional/compression/setup.ksh \
|
||||||
functional/cp_files/cleanup.ksh \
|
functional/cp_files/cleanup.ksh \
|
||||||
functional/cp_files/cp_files_001_pos.ksh \
|
functional/cp_files/cp_files_001_pos.ksh \
|
||||||
|
functional/cp_files/cp_files_002_pos.ksh \
|
||||||
functional/cp_files/cp_stress.ksh \
|
functional/cp_files/cp_stress.ksh \
|
||||||
functional/cp_files/setup.ksh \
|
functional/cp_files/setup.ksh \
|
||||||
functional/crtime/cleanup.ksh \
|
functional/crtime/cleanup.ksh \
|
||||||
|
|
|
@ -42,6 +42,12 @@ function verify_crossfs_block_cloning
|
||||||
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
||||||
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Cross dataset block cloning only supported on FreeBSD 14+
|
||||||
|
# https://github.com/freebsd/freebsd-src/commit/969071be938c
|
||||||
|
if is_freebsd && [ $(freebsd_version) -lt $(freebsd_version 14.0) ] ; then
|
||||||
|
log_unsupported "Cloning across datasets not supported in $(uname -r)"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Unused.
|
# Unused.
|
||||||
|
|
|
@ -26,12 +26,11 @@
|
||||||
|
|
||||||
. $STF_SUITE/include/libtest.shlib
|
. $STF_SUITE/include/libtest.shlib
|
||||||
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
||||||
|
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
|
||||||
|
|
||||||
verify_runnable "global"
|
verify_runnable "global"
|
||||||
|
|
||||||
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
verify_crossfs_block_cloning
|
||||||
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
|
||||||
fi
|
|
||||||
|
|
||||||
claim="The copy_file_range syscall can clone across datasets."
|
claim="The copy_file_range syscall can clone across datasets."
|
||||||
|
|
||||||
|
|
|
@ -26,12 +26,11 @@
|
||||||
|
|
||||||
. $STF_SUITE/include/libtest.shlib
|
. $STF_SUITE/include/libtest.shlib
|
||||||
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
||||||
|
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
|
||||||
|
|
||||||
verify_runnable "global"
|
verify_runnable "global"
|
||||||
|
|
||||||
if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
|
verify_crossfs_block_cloning
|
||||||
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
|
|
||||||
fi
|
|
||||||
|
|
||||||
claim="Block cloning across encrypted datasets."
|
claim="Block cloning across encrypted datasets."
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,7 @@ else
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \
|
set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \
|
||||||
"-vx $testpool"
|
"-vx $testpool" "-e $testpool" "-es $testpool"
|
||||||
|
|
||||||
log_assert "Executing 'zpool status' with correct options succeeds"
|
log_assert "Executing 'zpool status' with correct options succeeds"
|
||||||
|
|
||||||
|
@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do
|
||||||
(( i = i + 1 ))
|
(( i = i + 1 ))
|
||||||
done
|
done
|
||||||
|
|
||||||
|
cleanup
|
||||||
|
|
||||||
log_pass "'zpool status' with correct options succeeded"
|
log_pass "'zpool status' with correct options succeeded"
|
||||||
|
|
|
@ -37,6 +37,7 @@
|
||||||
# 3. Read the file
|
# 3. Read the file
|
||||||
# 4. Take a snapshot and make a clone
|
# 4. Take a snapshot and make a clone
|
||||||
# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v'
|
# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v'
|
||||||
|
# and 'zpool status -ev'
|
||||||
|
|
||||||
function cleanup
|
function cleanup
|
||||||
{
|
{
|
||||||
|
@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2
|
||||||
log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
|
log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
|
||||||
log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
|
log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
|
||||||
log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
|
log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
|
||||||
|
log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'"
|
||||||
log_mustnot eval "zpool status -v | grep '$TESTFS1'"
|
log_mustnot eval "zpool status -v | grep '$TESTFS1'"
|
||||||
|
|
||||||
log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone"
|
log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone"
|
||||||
|
|
|
@ -0,0 +1,104 @@
|
||||||
|
#!/bin/ksh -p
|
||||||
|
|
||||||
|
#
|
||||||
|
# CDDL HEADER START
|
||||||
|
#
|
||||||
|
# This file and its contents are supplied under the terms of the
|
||||||
|
# Common Development and Distribution License ("CDDL"), version 1.0.
|
||||||
|
# You may only use this file in accordance with the terms of version
|
||||||
|
# 1.0 of the CDDL.
|
||||||
|
#
|
||||||
|
# A full copy of the text of the CDDL should have accompanied this
|
||||||
|
# source. A copy of the CDDL is also available via the Internet at
|
||||||
|
# http://www.illumos.org/license/CDDL.
|
||||||
|
#
|
||||||
|
# CDDL HEADER END
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
|
||||||
|
#
|
||||||
|
# DESCRIPTION:
|
||||||
|
# Verify 'zpool status -e' only shows unhealthy devices.
|
||||||
|
#
|
||||||
|
# STRATEGY:
|
||||||
|
# 1. Create zpool
|
||||||
|
# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
|
||||||
|
# 3. Verify vdevs are reported correctly with -e and -s
|
||||||
|
# 4. Verify parents are reported as DEGRADED
|
||||||
|
# 5. Verify healthy children are not reported
|
||||||
|
#
|
||||||
|
|
||||||
|
function cleanup
|
||||||
|
{
|
||||||
|
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
|
||||||
|
zinject -c all
|
||||||
|
poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
|
||||||
|
log_must rm -f $all_vdevs
|
||||||
|
}
|
||||||
|
|
||||||
|
log_assert "Verify 'zpool status -e'"
|
||||||
|
|
||||||
|
log_onexit cleanup
|
||||||
|
|
||||||
|
all_vdevs=$(echo $TESTDIR/vdev{1..6})
|
||||||
|
log_must mkdir -p $TESTDIR
|
||||||
|
log_must truncate -s $MINVDEVSIZE $all_vdevs
|
||||||
|
|
||||||
|
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
|
||||||
|
|
||||||
|
for raid_type in "draid2:3d:6c:1s" "raidz2"; do
|
||||||
|
|
||||||
|
log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs
|
||||||
|
|
||||||
|
# Check DEGRADED vdevs are shown.
|
||||||
|
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
|
||||||
|
log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
|
||||||
|
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"
|
||||||
|
|
||||||
|
# Check FAULTED vdevs are shown.
|
||||||
|
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
|
||||||
|
log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
|
||||||
|
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"
|
||||||
|
|
||||||
|
# Check no ONLINE vdevs are shown
|
||||||
|
log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"
|
||||||
|
|
||||||
|
# Check no ONLINE slow vdevs are show. Then mark IOs greater than
|
||||||
|
# 10ms slow, delay IOs 20ms to vdev6, check slow IOs.
|
||||||
|
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
|
||||||
|
log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"
|
||||||
|
|
||||||
|
log_must set_tunable64 ZIO_SLOW_IO_MS 10
|
||||||
|
log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
|
||||||
|
log_must mkfile 1048576 /$TESTPOOL2/testfile
|
||||||
|
sync_pool $TESTPOOL2
|
||||||
|
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
|
||||||
|
|
||||||
|
# Check vdev6 slow IOs are only shown when requested with -s.
|
||||||
|
log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
|
||||||
|
log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
|
||||||
|
|
||||||
|
# Pool level and top-vdev level status must be DEGRADED.
|
||||||
|
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED"
|
||||||
|
log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED"
|
||||||
|
|
||||||
|
# Check that healthy vdevs[1-3] aren't shown with -e.
|
||||||
|
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE"
|
||||||
|
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE"
|
||||||
|
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE"
|
||||||
|
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE"
|
||||||
|
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE"
|
||||||
|
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE"
|
||||||
|
|
||||||
|
log_must zinject -c all
|
||||||
|
log_must zpool status -es $TESTPOOL2
|
||||||
|
|
||||||
|
zpool destroy $TESTPOOL2
|
||||||
|
done
|
||||||
|
|
||||||
|
log_pass "Verify zpool status -e shows only unhealthy vdevs"
|
|
@ -32,3 +32,7 @@
|
||||||
. $STF_SUITE/include/libtest.shlib
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
|
||||||
default_cleanup
|
default_cleanup
|
||||||
|
|
||||||
|
if tunable_exists BCLONE_ENABLED ; then
|
||||||
|
log_must restore_tunable BCLONE_ENABLED
|
||||||
|
fi
|
||||||
|
|
|
@ -0,0 +1,161 @@
|
||||||
|
#! /bin/ksh -p
|
||||||
|
#
|
||||||
|
# CDDL HEADER START
|
||||||
|
#
|
||||||
|
# The contents of this file are subject to the terms of the
|
||||||
|
# Common Development and Distribution License (the "License").
|
||||||
|
# You may not use this file except in compliance with the License.
|
||||||
|
#
|
||||||
|
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||||
|
# or https://opensource.org/licenses/CDDL-1.0.
|
||||||
|
# See the License for the specific language governing permissions
|
||||||
|
# and limitations under the License.
|
||||||
|
#
|
||||||
|
# When distributing Covered Code, include this CDDL HEADER in each
|
||||||
|
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||||
|
# If applicable, add the following below this CDDL HEADER, with the
|
||||||
|
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||||
|
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||||
|
#
|
||||||
|
# CDDL HEADER END
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
|
||||||
|
#
|
||||||
|
|
||||||
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib
|
||||||
|
|
||||||
|
#
|
||||||
|
# DESCRIPTION:
|
||||||
|
# Verify all cp --reflink modes work with modified file.
|
||||||
|
#
|
||||||
|
# STRATEGY:
|
||||||
|
# 1. Verify "cp --reflink=never|auto|always" behaves as expected.
|
||||||
|
# Two different modes of operation are tested.
|
||||||
|
#
|
||||||
|
# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL
|
||||||
|
# when there are dirty blocks which cannot be immediately cloned.
|
||||||
|
# This is the default behavior.
|
||||||
|
#
|
||||||
|
# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for
|
||||||
|
# dirty blocks to be written to disk allowing the clone to succeed.
|
||||||
|
# The downside to this is it may be slow which depending on the
|
||||||
|
# situtation may defeat the point of making a clone.
|
||||||
|
#
|
||||||
|
|
||||||
|
verify_runnable "global"
|
||||||
|
verify_block_cloning
|
||||||
|
|
||||||
|
if ! is_linux; then
|
||||||
|
log_unsupported "cp --reflink is a GNU coreutils option"
|
||||||
|
fi
|
||||||
|
|
||||||
|
function cleanup
|
||||||
|
{
|
||||||
|
datasetexists $TESTPOOL/cp-reflink && \
|
||||||
|
destroy_dataset $$TESTPOOL/cp-reflink -f
|
||||||
|
log_must set_tunable32 BCLONE_WAIT_DIRTY 0
|
||||||
|
}
|
||||||
|
|
||||||
|
function verify_copy
|
||||||
|
{
|
||||||
|
src_cksum=$(sha256digest $1)
|
||||||
|
dst_cksum=$(sha256digest $2)
|
||||||
|
|
||||||
|
if [[ "$src_cksum" != "$dst_cksum" ]]; then
|
||||||
|
log_must ls -l $CP_TESTDIR
|
||||||
|
log_fail "checksum mismatch ($src_cksum != $dst_cksum)"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
log_assert "Verify all cp --reflink modes work with modified file"
|
||||||
|
|
||||||
|
log_onexit cleanup
|
||||||
|
|
||||||
|
SRC_FILE=src.data
|
||||||
|
DST_FILE=dst.data
|
||||||
|
SRC_SIZE=$(($RANDOM % 2048))
|
||||||
|
|
||||||
|
# A smaller recordsize is used merely to speed up the test.
|
||||||
|
RECORDSIZE=4096
|
||||||
|
|
||||||
|
log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink
|
||||||
|
CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink)
|
||||||
|
|
||||||
|
log_must cd $CP_TESTDIR
|
||||||
|
|
||||||
|
# Never wait on dirty blocks (zfs_bclone_wait_dirty=0)
|
||||||
|
log_must set_tunable32 BCLONE_WAIT_DIRTY 0
|
||||||
|
|
||||||
|
for mode in "never" "auto" "always"; do
|
||||||
|
log_note "Checking 'cp --reflink=$mode'"
|
||||||
|
|
||||||
|
# Create a new file and immediately copy it.
|
||||||
|
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
|
||||||
|
|
||||||
|
if [[ "$mode" == "always" ]]; then
|
||||||
|
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
log_must ls -l $CP_TESTDIR
|
||||||
|
else
|
||||||
|
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
verify_copy $SRC_FILE $DST_FILE
|
||||||
|
fi
|
||||||
|
log_must rm -f $DST_FILE
|
||||||
|
|
||||||
|
# Append to an existing file and immediately copy it.
|
||||||
|
sync_pool $TESTPOOL
|
||||||
|
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
|
||||||
|
count=1 conv=notrunc
|
||||||
|
if [[ "$mode" == "always" ]]; then
|
||||||
|
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
log_must ls -l $CP_TESTDIR
|
||||||
|
else
|
||||||
|
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
verify_copy $SRC_FILE $DST_FILE
|
||||||
|
fi
|
||||||
|
log_must rm -f $DST_FILE
|
||||||
|
|
||||||
|
# Overwrite a random range of an existing file and immediately copy it.
|
||||||
|
sync_pool $TESTPOOL
|
||||||
|
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
|
||||||
|
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
|
||||||
|
if [[ "$mode" == "always" ]]; then
|
||||||
|
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
log_must ls -l $CP_TESTDIR
|
||||||
|
else
|
||||||
|
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
verify_copy $SRC_FILE $DST_FILE
|
||||||
|
fi
|
||||||
|
log_must rm -f $SRC_FILE $DST_FILE
|
||||||
|
done
|
||||||
|
|
||||||
|
# Wait on dirty blocks (zfs_bclone_wait_dirty=1)
|
||||||
|
log_must set_tunable32 BCLONE_WAIT_DIRTY 1
|
||||||
|
|
||||||
|
for mode in "never" "auto" "always"; do
|
||||||
|
log_note "Checking 'cp --reflink=$mode'"
|
||||||
|
|
||||||
|
# Create a new file and immediately copy it.
|
||||||
|
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE
|
||||||
|
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
verify_copy $SRC_FILE $DST_FILE
|
||||||
|
log_must rm -f $DST_FILE
|
||||||
|
|
||||||
|
# Append to an existing file and immediately copy it.
|
||||||
|
log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \
|
||||||
|
count=1 conv=notrunc
|
||||||
|
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
verify_copy $SRC_FILE $DST_FILE
|
||||||
|
log_must rm -f $DST_FILE
|
||||||
|
|
||||||
|
# Overwrite a random range of an existing file and immediately copy it.
|
||||||
|
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
|
||||||
|
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
|
||||||
|
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
|
||||||
|
verify_copy $SRC_FILE $DST_FILE
|
||||||
|
log_must rm -f $SRC_FILE $DST_FILE
|
||||||
|
done
|
||||||
|
|
||||||
|
log_pass
|
|
@ -32,4 +32,10 @@
|
||||||
. $STF_SUITE/include/libtest.shlib
|
. $STF_SUITE/include/libtest.shlib
|
||||||
|
|
||||||
DISK=${DISKS%% *}
|
DISK=${DISKS%% *}
|
||||||
|
|
||||||
|
if tunable_exists BCLONE_ENABLED ; then
|
||||||
|
log_must save_tunable BCLONE_ENABLED
|
||||||
|
log_must set_tunable32 BCLONE_ENABLED 1
|
||||||
|
fi
|
||||||
|
|
||||||
default_setup $DISK
|
default_setup $DISK
|
||||||
|
|
Loading…
Reference in New Issue