Merge pull request #1 from KlaraSystems/wasabi-215p11

Wasabi 215p11
Geoff Amey 2024-05-28 13:29:54 -06:00 committed by GitHub
commit 483b71247b
77 changed files with 3454 additions and 360 deletions

View File

@ -37,7 +37,7 @@ import re
bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
"meta", "state", "dbholds", "dbc", "list", "atype", "flags",
"usize", "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
"count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
"l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
"data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
@ -47,17 +47,17 @@ dhdr = ["pool", "objset", "object", "dtype", "cached"]
dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
"indirect", "bonus", "spill"]
dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
"dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds"]
dincompat = ["level", "blkid", "offset", "dbsize", "usize", "meta", "state",
"dbholds", "dbc", "list", "atype", "flags", "count", "asize",
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds"]
thdr = ["pool", "objset", "dtype", "cached"]
txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
"bonus", "spill"]
tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
"dbc", "dbholds", "list", "atype", "flags", "count", "asize",
"access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
tincompat = ["object", "level", "blkid", "offset", "dbsize", "usize", "meta",
"state", "dbc", "dbholds", "list", "atype", "flags", "count",
"asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
"l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize"]
@ -70,6 +70,7 @@ cols = {
"blkid": [8, -1, "block number of buffer"],
"offset": [12, 1024, "offset in object of buffer"],
"dbsize": [7, 1024, "size of buffer"],
"usize": [7, 1024, "size of attached user data"],
"meta": [4, -1, "is this buffer metadata?"],
"state": [5, -1, "state of buffer (read, cached, etc)"],
"dbholds": [7, 1000, "number of holds on buffer"],
@ -399,6 +400,7 @@ def update_dict(d, k, line, labels):
key = line[labels[k]]
dbsize = int(line[labels['dbsize']])
usize = int(line[labels['usize']])
blkid = int(line[labels['blkid']])
level = int(line[labels['level']])
@ -416,7 +418,7 @@ def update_dict(d, k, line, labels):
d[pool][objset][key]['indirect'] = 0
d[pool][objset][key]['spill'] = 0
d[pool][objset][key]['cached'] += dbsize
d[pool][objset][key]['cached'] += dbsize + usize
if blkid == -1:
d[pool][objset][key]['bonus'] += dbsize

View File

@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2023-2024, Klara Inc.
*/
/*
@ -276,6 +277,11 @@ usage(void)
"\t\tcreate 3 lanes on the device; one lane with a latency\n"
"\t\tof 10 ms and two lanes with a 25 ms latency.\n"
"\n"
"\tzinject -P import|export -s <seconds> pool\n"
"\t\tAdd an artificial delay to a future pool import or export,\n"
"\t\tsuch that the operation takes a minimum of supplied seconds\n"
"\t\tto complete.\n"
"\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n"
"\t\treport errors for a duration. Simulates buggy hardware\n"
@ -358,8 +364,10 @@ print_data_handler(int id, const char *pool, zinject_record_t *record,
{
int *count = data;
if (record->zi_guid != 0 || record->zi_func[0] != '\0')
if (record->zi_guid != 0 || record->zi_func[0] != '\0' ||
record->zi_duration != 0) {
return (0);
}
if (*count == 0) {
(void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
@ -462,6 +470,33 @@ print_panic_handler(int id, const char *pool, zinject_record_t *record,
return (0);
}
static int
print_pool_delay_handler(int id, const char *pool, zinject_record_t *record,
void *data)
{
int *count = data;
if (record->zi_cmd != ZINJECT_DELAY_IMPORT &&
record->zi_cmd != ZINJECT_DELAY_EXPORT) {
return (0);
}
if (*count == 0) {
(void) printf("%3s %-19s %-11s %s\n",
"ID", "POOL", "DELAY (sec)", "COMMAND");
(void) printf("--- ------------------- -----------"
" -------\n");
}
*count += 1;
(void) printf("%3d %-19s %-11llu %s\n",
id, pool, (u_longlong_t)record->zi_duration,
record->zi_cmd == ZINJECT_DELAY_IMPORT ? "import": "export");
return (0);
}
/*
* Print all registered error handlers. Returns the number of handlers
* registered.
@ -492,6 +527,13 @@ print_all_handlers(void)
count = 0;
}
(void) iter_handlers(print_pool_delay_handler, &count);
if (count > 0) {
total += count;
(void) printf("\n");
count = 0;
}
(void) iter_handlers(print_panic_handler, &count);
return (count + total);
@ -564,9 +606,27 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
zc.zc_guid = flags;
if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
(void) fprintf(stderr, "failed to add handler: %s\n",
errno == EDOM ? "block level exceeds max level of object" :
strerror(errno));
const char *errmsg = strerror(errno);
switch (errno) {
case EDOM:
errmsg = "block level exceeds max level of object";
break;
case EEXIST:
if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
errmsg = "pool already imported";
if (record->zi_cmd == ZINJECT_DELAY_EXPORT)
errmsg = "a handler already exists";
break;
case ENOENT:
/* import delay injector running on older zfs module */
if (record->zi_cmd == ZINJECT_DELAY_IMPORT)
errmsg = "import delay injector not supported";
break;
default:
break;
}
(void) fprintf(stderr, "failed to add handler: %s\n", errmsg);
return (1);
}
@ -591,6 +651,9 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
} else if (record->zi_duration < 0) {
(void) printf(" txgs: %lld \n",
(u_longlong_t)-record->zi_duration);
} else if (record->zi_timer > 0) {
(void) printf(" timer: %lld ms\n",
(u_longlong_t)NSEC2MSEC(record->zi_timer));
} else {
(void) printf("objset: %llu\n",
(u_longlong_t)record->zi_objset);
@ -789,7 +852,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@ -919,6 +982,19 @@ main(int argc, char **argv)
sizeof (record.zi_func));
record.zi_cmd = ZINJECT_PANIC;
break;
case 'P':
if (strcasecmp(optarg, "import") == 0) {
record.zi_cmd = ZINJECT_DELAY_IMPORT;
} else if (strcasecmp(optarg, "export") == 0) {
record.zi_cmd = ZINJECT_DELAY_EXPORT;
} else {
(void) fprintf(stderr, "invalid command '%s': "
"must be 'import' or 'export'\n", optarg);
usage();
libzfs_fini(g_zfs);
return (1);
}
break;
case 'q':
quiet = 1;
break;
@ -998,7 +1074,7 @@ main(int argc, char **argv)
argc -= optind;
argv += optind;
if (record.zi_duration != 0)
if (record.zi_duration != 0 && record.zi_cmd == 0)
record.zi_cmd = ZINJECT_IGNORED_WRITES;
if (cancel != NULL) {
@ -1128,8 +1204,8 @@ main(int argc, char **argv)
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0 ||
dvas != 0) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
(void) fprintf(stderr, "%s incompatible with other "
"options\n", "import|export delay (-P)");
usage();
libzfs_fini(g_zfs);
return (2);
@ -1147,6 +1223,28 @@ main(int argc, char **argv)
if (argv[1] != NULL)
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
} else if (record.zi_cmd == ZINJECT_DELAY_IMPORT ||
record.zi_cmd == ZINJECT_DELAY_EXPORT) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0 ||
dvas != 0) {
(void) fprintf(stderr, "%s incompatible with other "
"options\n", "import|export delay (-P)");
usage();
libzfs_fini(g_zfs);
return (2);
}
if (argc != 1 || record.zi_duration <= 0) {
(void) fprintf(stderr, "import|export delay (-P) "
"injection requires a duration (-s) and a single "
"pool name\n");
usage();
libzfs_fini(g_zfs);
return (2);
}
(void) strlcpy(pool, argv[0], sizeof (pool));
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_freq > 0 || dvas != 0) {

View File

@ -50,6 +50,7 @@
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <thread_pool.h>
#include <time.h>
#include <unistd.h>
#include <pwd.h>
@ -1848,10 +1849,19 @@ zpool_do_destroy(int argc, char **argv)
}
typedef struct export_cbdata {
tpool_t *tpool;
pthread_mutex_t mnttab_lock;
boolean_t force;
boolean_t hardforce;
int retval;
} export_cbdata_t;
typedef struct {
char *aea_poolname;
export_cbdata_t *aea_cbdata;
} async_export_args_t;
/*
* Export one pool
*/
@ -1860,11 +1870,20 @@ zpool_export_one(zpool_handle_t *zhp, void *data)
{
export_cbdata_t *cb = data;
if (zpool_disable_datasets(zhp, cb->force, cb->hardforce) != 0)
return (1);
/*
* zpool_disable_datasets() is not thread-safe for mnttab access.
* So we serialize access here for 'zpool export -a' parallel case.
*/
if (cb->tpool != NULL)
pthread_mutex_lock(&cb->mnttab_lock);
/* The history must be logged as part of the export */
log_history = B_FALSE;
int retval = zpool_disable_datasets(zhp, cb->force, cb->hardforce);
if (cb->tpool != NULL)
pthread_mutex_unlock(&cb->mnttab_lock);
if (retval)
return (1);
if (cb->hardforce) {
if (zpool_export_force(zhp, history_str) != 0)
@ -1876,6 +1895,48 @@ zpool_export_one(zpool_handle_t *zhp, void *data)
return (0);
}
/*
* Asynchronous export request
*/
static void
zpool_export_task(void *arg)
{
async_export_args_t *aea = arg;
zpool_handle_t *zhp = zpool_open(g_zfs, aea->aea_poolname);
if (zhp != NULL) {
int ret = zpool_export_one(zhp, aea->aea_cbdata);
if (ret != 0)
aea->aea_cbdata->retval = ret;
zpool_close(zhp);
} else {
aea->aea_cbdata->retval = 1;
}
free(aea->aea_poolname);
free(aea);
}
/*
* Process an export request in parallel
*/
static int
zpool_export_one_async(zpool_handle_t *zhp, void *data)
{
tpool_t *tpool = ((export_cbdata_t *)data)->tpool;
async_export_args_t *aea = safe_malloc(sizeof (async_export_args_t));
/* save pool name since zhp will go out of scope */
aea->aea_poolname = strdup(zpool_get_name(zhp));
aea->aea_cbdata = data;
/* ship off actual export to another thread */
if (tpool_dispatch(tpool, zpool_export_task, (void *)aea) != 0)
return (errno); /* unlikely */
else
return (0);
}
/*
* zpool export [-f] <pool> ...
*
@ -1919,17 +1980,33 @@ zpool_do_export(int argc, char **argv)
cb.force = force;
cb.hardforce = hardforce;
cb.tpool = NULL;
cb.retval = 0;
argc -= optind;
argv += optind;
/* The history will be logged as part of the export itself */
log_history = B_FALSE;
if (do_all) {
if (argc != 0) {
(void) fprintf(stderr, gettext("too many arguments\n"));
usage(B_FALSE);
}
return (for_each_pool(argc, argv, B_TRUE, NULL,
B_FALSE, zpool_export_one, &cb));
cb.tpool = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
0, NULL);
pthread_mutex_init(&cb.mnttab_lock, NULL);
/* Asynchronously call zpool_export_one using thread pool */
ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE,
zpool_export_one_async, &cb);
tpool_wait(cb.tpool);
tpool_destroy(cb.tpool);
(void) pthread_mutex_destroy(&cb.mnttab_lock);
return (ret | cb.retval);
}
/* check arguments */
@ -3068,12 +3145,21 @@ zfs_force_import_required(nvlist_t *config)
nvlist_t *nvinfo;
state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
/*
* The hostid on LOAD_INFO comes from the MOS label via
* spa_tryimport(). If it's not there, then we're likely talking to an
* older kernel, so use the top one, which will be from the label
* discovered in zpool_find_import(), or if a cachefile is in use, the
* local hostid.
*/
if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0)
nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid())
return (B_TRUE);
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) {
mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo,
ZPOOL_CONFIG_MMP_STATE);
@ -3143,7 +3229,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
uint64_t timestamp = 0;
uint64_t hostid = 0;
if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME))
hostname = fnvlist_lookup_string(nvinfo,
ZPOOL_CONFIG_HOSTNAME);
else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
hostname = fnvlist_lookup_string(config,
ZPOOL_CONFIG_HOSTNAME);
@ -3151,7 +3240,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
timestamp = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_TIMESTAMP);
if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(nvinfo,
ZPOOL_CONFIG_HOSTID);
else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_HOSTID);
@ -3196,15 +3288,40 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
return (ret);
}
typedef struct import_parameters {
nvlist_t *ip_config;
const char *ip_mntopts;
nvlist_t *ip_props;
int ip_flags;
int *ip_err;
} import_parameters_t;
static void
do_import_task(void *arg)
{
import_parameters_t *ip = arg;
*ip->ip_err |= do_import(ip->ip_config, NULL, ip->ip_mntopts,
ip->ip_props, ip->ip_flags);
free(ip);
}
static int
import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
char *orig_name, char *new_name,
boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all,
importargs_t *import)
char *orig_name, char *new_name, importargs_t *import)
{
nvlist_t *config = NULL;
nvlist_t *found_config = NULL;
uint64_t pool_state;
boolean_t pool_specified = (import->poolname != NULL ||
import->guid != 0);
tpool_t *tp = NULL;
if (import->do_all) {
tp = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
0, NULL);
}
/*
* At this point we have a list of import candidate configs. Even if
@ -3221,9 +3338,11 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
&pool_state) == 0);
if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
if (!import->do_destroyed &&
pool_state == POOL_STATE_DESTROYED)
continue;
if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
if (import->do_destroyed &&
pool_state != POOL_STATE_DESTROYED)
continue;
verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
@ -3232,12 +3351,21 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
if (!pool_specified) {
if (first)
first = B_FALSE;
else if (!do_all)
else if (!import->do_all)
(void) printf("\n");
if (do_all) {
err |= do_import(config, NULL, mntopts,
props, flags);
if (import->do_all) {
import_parameters_t *ip = safe_malloc(
sizeof (import_parameters_t));
ip->ip_config = config;
ip->ip_mntopts = mntopts;
ip->ip_props = props;
ip->ip_flags = flags;
ip->ip_err = &err;
(void) tpool_dispatch(tp, do_import_task,
(void *)ip);
} else {
/*
* If we're importing from cachefile, then
@ -3285,6 +3413,10 @@ import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags,
found_config = config;
}
}
if (import->do_all) {
tpool_wait(tp);
tpool_destroy(tp);
}
/*
* If we were searching for a specific pool, verify that we found a
@ -3514,7 +3646,6 @@ zpool_do_import(int argc, char **argv)
boolean_t xtreme_rewind = B_FALSE;
boolean_t do_scan = B_FALSE;
boolean_t pool_exists = B_FALSE;
boolean_t pool_specified = B_FALSE;
uint64_t txg = -1ULL;
char *cachefile = NULL;
importargs_t idata = { 0 };
@ -3722,7 +3853,6 @@ zpool_do_import(int argc, char **argv)
searchname = argv[0];
searchguid = 0;
}
pool_specified = B_TRUE;
/*
* User specified a name or guid. Ensure it's unique.
@ -3763,6 +3893,8 @@ zpool_do_import(int argc, char **argv)
idata.cachefile = cachefile;
idata.scan = do_scan;
idata.policy = policy;
idata.do_destroyed = do_destroyed;
idata.do_all = do_all;
pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops);
@ -3802,9 +3934,7 @@ zpool_do_import(int argc, char **argv)
}
err = import_pools(pools, props, mntopts, flags,
argc >= 1 ? argv[0] : NULL,
argc >= 2 ? argv[1] : NULL,
do_destroyed, pool_specified, do_all, &idata);
argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, &idata);
/*
* If we're using the cachefile and we failed to import, then
@ -3825,9 +3955,8 @@ zpool_do_import(int argc, char **argv)
pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops);
err = import_pools(pools, props, mntopts, flags,
argc >= 1 ? argv[0] : NULL,
argc >= 2 ? argv[1] : NULL,
do_destroyed, pool_specified, do_all, &idata);
argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL,
&idata);
}
error:
@ -8411,7 +8540,7 @@ status_callback(zpool_handle_t *zhp, void *data)
printf_color(ANSI_BOLD, gettext("action: "));
printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
" are connected, then reboot your system and\n\timport the "
"pool.\n"));
"pool or run 'zpool clear' to resume the pool.\n"));
break;
case ZPOOL_STATUS_IO_FAILURE_WAIT:

View File

@ -0,0 +1,17 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
ZFS_LINUX_TEST_SRC([page_size], [
#include <linux/mm.h>
],[
unsigned long s;
s = page_size(NULL);
])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
AC_MSG_CHECKING([whether page_size() is available])
ZFS_LINUX_TEST_RESULT([page_size], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
],[
AC_MSG_RESULT(no)
])
])

View File

@ -144,6 +144,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_KTHREAD
ZFS_AC_KERNEL_SRC_ZERO_PAGE
ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
AC_MSG_CHECKING([for available kernel interfaces])
ZFS_LINUX_TEST_COMPILE_ALL([kabi])
@ -261,6 +262,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_KTHREAD
ZFS_AC_KERNEL_ZERO_PAGE
ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC
ZFS_AC_KERNEL_MM_PAGE_SIZE
])
dnl #

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
* Copyright (c) 2018, 2024 by Delphix. All rights reserved.
*/
#ifndef _LIBZUTIL_H
@ -68,6 +68,8 @@ typedef struct importargs {
boolean_t can_be_active; /* can the pool be active? */
boolean_t scan; /* prefer scanning to libblkid cache */
nvlist_t *policy; /* load policy (max txg, rewind, etc.) */
boolean_t do_destroyed;
boolean_t do_all;
} importargs_t;
extern nvlist_t *zpool_search_import(void *, importargs_t *,

View File

@ -92,6 +92,12 @@
#define param_set_max_auto_ashift_args(var) \
CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU"
#define spa_taskq_read_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A"
#define spa_taskq_write_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A"
#define fletcher_4_param_set_args(var) \
CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A"

View File

@ -91,6 +91,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
uint_t, clock_t);
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
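Note: taskq_try_dispatch_ent() is the non-blocking counterpart of taskq_dispatch_ent(); it returns B_FALSE instead of sleeping when the queue lock cannot be taken immediately. A minimal caller sketch (hypothetical, not part of this change; dispatch_work is an illustrative name):

/*
 * Hypothetical caller: try the non-blocking dispatch first and fall
 * back to the regular blocking dispatch if the taskq lock is busy.
 */
static void
dispatch_work(taskq_t *tq, task_func_t func, void *arg, taskq_ent_t *ent)
{
	if (!taskq_try_dispatch_ent(tq, func, arg, TQ_SLEEP, ent))
		taskq_dispatch_ent(tq, func, arg, TQ_SLEEP, ent);
}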

View File

@ -36,7 +36,11 @@ struct xucred;
typedef struct flock flock64_t;
typedef struct vnode vnode_t;
typedef struct vattr vattr_t;
#if __FreeBSD_version < 1400093
typedef enum vtype vtype_t;
#else
#define vtype_t __enum_uint8(vtype)
#endif
#include <sys/types.h>
#include <sys/queue.h>

View File

@ -10,6 +10,7 @@ KERNEL_H = \
simd_x86.h \
simd_aarch64.h \
simd_powerpc.h \
mm_compat.h \
mod_compat.h \
page_compat.h \
compiler_compat.h

View File

@ -0,0 +1,36 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ZFS_MM_COMPAT_H
#define _ZFS_MM_COMPAT_H
#include <linux/mm.h>
/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
#ifndef HAVE_MM_PAGE_SIZE
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
#endif
#endif /* _ZFS_MM_COMPAT_H */

View File

@ -146,6 +146,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
uint_t, clock_t);
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
extern void taskq_init_ent(taskq_ent_t *);
extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);

View File

@ -11,6 +11,7 @@ KERNEL_H = \
trace_dnode.h \
trace_multilist.h \
trace_rrwlock.h \
trace_spa_taskqs.h \
trace_txg.h \
trace_vdev.h \
trace_zil.h \

View File

@ -0,0 +1,74 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
#if defined(_KERNEL)
#if defined(HAVE_DECLARE_EVENT_CLASS)
#undef TRACE_SYSTEM
#define TRACE_SYSTEM zfs
#undef TRACE_SYSTEM_VAR
#define TRACE_SYSTEM_VAR zfs_spa_taskqs
#if !defined(_TRACE_SPA_TASKQS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SPA_TASKQS_H
#include <linux/tracepoint.h>
#include <sys/types.h>
/*
* Generic support for two argument tracepoints of the form:
*
* DTRACE_PROBE2(...,
* spa_taskqs_t *stqs, ...,
* taskq_ent_t *ent, ...);
*/
/* BEGIN CSTYLED */
DECLARE_EVENT_CLASS(zfs_spa_taskqs_ent_class,
TP_PROTO(spa_taskqs_t *stqs, taskq_ent_t *ent),
TP_ARGS(stqs, ent),
);
/* END CSTYLED */
/* BEGIN CSTYLED */
#define DEFINE_SPA_TASKQS_ENT_EVENT(name) \
DEFINE_EVENT(zfs_spa_taskqs_ent_class, name, \
TP_PROTO(spa_taskqs_t *stqs, taskq_ent_t *ent), \
TP_ARGS(stqs, ent))
/* END CSTYLED */
DEFINE_SPA_TASKQS_ENT_EVENT(zfs_spa_taskqs_ent__dispatch);
DEFINE_SPA_TASKQS_ENT_EVENT(zfs_spa_taskqs_ent__dispatched);
#endif /* _TRACE_SPA_TASKQS_H */
#undef TRACE_INCLUDE_PATH
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_PATH sys
#define TRACE_INCLUDE_FILE trace_spa_taskqs
#include <trace/define_trace.h>
#else
DEFINE_DTRACE_PROBE2(spa_taskqs_ent__dispatch);
DEFINE_DTRACE_PROBE2(spa_taskqs_ent__dispatched);
#endif /* HAVE_DECLARE_EVENT_CLASS */
#endif /* _KERNEL */

View File

@ -44,6 +44,7 @@
#include <sys/trace_dnode.h>
#include <sys/trace_multilist.h>
#include <sys/trace_rrwlock.h>
#include <sys/trace_spa_taskqs.h>
#include <sys/trace_txg.h>
#include <sys/trace_vdev.h>
#include <sys/trace_zil.h>

View File

@ -79,6 +79,9 @@ typedef struct abd {
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif
extern int zfs_abd_scatter_enabled;
@ -119,6 +122,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
@ -207,6 +214,8 @@ void abd_fini(void);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
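Note: abd_iterate_page_func() walks the ABD's backing pages without mapping them, calling the callback once per contiguous page span; a non-zero return is propagated and, in the vdev_disk callers later in this diff, used to abort the walk. A minimal callback sketch (hypothetical names; assumes the existing abd_get_size() helper):

/*
 * Hypothetical abd_iter_page_func_t callback: count how many page
 * spans an ABD covers, without mapping any of them.
 */
static int
count_pages_cb(struct page *pg, size_t off, size_t len, void *priv)
{
	(void) pg; (void) off; (void) len;
	*(uint_t *)priv += 1;
	return (0);
}

static uint_t
abd_count_pages(abd_t *abd)
{
	uint_t npages = 0;
	(void) abd_iterate_page_func(abd, 0, abd_get_size(abd),
	    count_pages_cb, &npages);
	return (npages);
}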

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_H
@ -38,12 +39,30 @@ typedef enum abd_stats_op {
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;
struct scatterlist; /* forward declaration */
/* forward declarations */
struct scatterlist;
struct page;
struct abd_iter {
/* public interface */
void *iter_mapaddr; /* addr corresponding to iter_pos */
size_t iter_mapsize; /* length of data valid at mapaddr */
union {
/* for abd_iter_map()/abd_iter_unmap() */
struct {
/* addr corresponding to iter_pos */
void *iter_mapaddr;
/* length of data valid at mapaddr */
size_t iter_mapsize;
};
/* for abd_iter_page() */
struct {
/* current page */
struct page *iter_page;
/* offset of data in page */
size_t iter_page_doff;
/* size of data in page */
size_t iter_page_dsize;
};
};
/* private */
abd_t *iter_abd; /* ABD being iterated through */
@ -79,6 +98,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
void abd_iter_page(struct abd_iter *);
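Note: a sketch of how the new page-oriented iterator state is meant to be driven (hypothetical; the real consumer, abd_iterate_page_func(), is not shown in this diff). The page fields are cleared before advancing, matching the asserts added to abd_iter_advance() further down:

/* Hypothetical walk of an ABD page by page, without mapping anything. */
static size_t
count_page_spans(abd_t *abd)
{
	struct abd_iter aiter;
	size_t spans = 0;

	abd_iter_init(&aiter, abd);
	while (!abd_iter_at_end(&aiter)) {
		abd_iter_page(&aiter);
		size_t len = aiter.iter_page_dsize;
		spans++;

		/* clear the page state before advancing; abd_iter_advance()
		 * asserts the previous chunk is no longer in use */
		aiter.iter_page = NULL;
		aiter.iter_page_doff = 0;
		aiter.iter_page_dsize = 0;
		abd_iter_advance(&aiter, len);
	}
	return (spans);
}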
/*
* Helper macros

View File

@ -639,6 +639,9 @@ typedef struct dmu_buf_user {
*/
taskq_ent_t dbu_tqent;
/* Size of user data, for inclusion in dbuf_cache accounting. */
uint64_t dbu_size;
/*
* This instance's eviction function pointers.
*
@ -721,6 +724,16 @@ void *dmu_buf_replace_user(dmu_buf_t *db,
*/
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
/*
* User data size accounting. This can be used to artificially inflate the size
* of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough
* to satisfy memory reclaim requests. It's not used for anything else, and
* defaults to 0.
*/
uint64_t dmu_buf_user_size(dmu_buf_t *db);
void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd);
void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);
/*
* Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/
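Note: these hooks let a dbuf user charge attached memory against the dbuf cache so the eviction thread sees the true footprint. A hypothetical consumer sketch (names are illustrative, not from this change):

/*
 * Hypothetical dbuf user: account an attached payload's size so the
 * dbuf cache eviction logic sees it, and remove the charge on detach.
 */
static void
my_user_attach_payload(dmu_buf_t *db, uint64_t payload_size)
{
	dmu_buf_add_user_size(db, payload_size);
}

static void
my_user_detach_payload(dmu_buf_t *db, uint64_t payload_size)
{
	ASSERT3U(dmu_buf_user_size(db), >=, payload_size);
	dmu_buf_sub_user_size(db, payload_size);
}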

View File

@ -798,7 +798,7 @@ extern void spa_add_feature_stats(spa_t *spa, nvlist_t *config);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_FAULT_VDEV 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
@ -854,6 +854,8 @@ extern int zfs_sync_pass_deferred_free;
/* spa namespace global mutex */
extern kmutex_t spa_namespace_lock;
extern avl_tree_t spa_namespace_avl;
extern kcondvar_t spa_namespace_cv;
/*
* SPA configuration functions in spa_config.c
@ -1004,6 +1006,10 @@ extern int spa_import_progress_set_max_txg(uint64_t pool_guid,
uint64_t max_txg);
extern int spa_import_progress_set_state(uint64_t pool_guid,
spa_load_state_t spa_load_state);
extern void spa_import_progress_set_notes(spa_t *spa,
const char *fmt, ...) __printflike(2, 3);
extern void spa_import_progress_set_notes_nolog(spa_t *spa,
const char *fmt, ...) __printflike(2, 3);
/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag,
@ -1147,6 +1153,8 @@ extern uint32_t spa_get_hostid(spa_t *spa);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);
extern boolean_t spa_mmp_remote_host_activity(spa_t *spa);
extern spa_mode_t spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
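Note: spa_import_progress_set_notes() records a printf-style note against the import progress state (the _nolog variant presumably skips the debug log). A hypothetical call-site sketch, not taken from this diff:

/*
 * Hypothetical caller: post a human-readable note describing the
 * current stage of a long-running import.
 */
static void
note_import_stage(spa_t *spa, uint64_t done, uint64_t total)
{
	spa_import_progress_set_notes(spa, "Loaded %llu/%llu datasets",
	    (u_longlong_t)done, (u_longlong_t)total);
}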

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@ -183,6 +183,8 @@ typedef enum spa_proc_state {
} spa_proc_state_t;
typedef struct spa_taskqs {
zio_taskq_type_t stqs_type;
zio_type_t stqs_zio_type;
uint_t stqs_count;
taskq_t **stqs_taskq;
} spa_taskqs_t;
@ -229,6 +231,8 @@ struct spa {
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
boolean_t spa_is_exporting; /* true while exporting pool */
kthread_t *spa_export_thread; /* valid during pool export */
kthread_t *spa_load_thread; /* loading, no namespace lock */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */

View File

@ -50,20 +50,20 @@ extern "C" {
#define MMP_SEQ_VALID_BIT 0x02
#define MMP_FAIL_INT_VALID_BIT 0x04
#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
ubp->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
(ubp)->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_INTERVAL_VALID_BIT))
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_SEQ_VALID_BIT))
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_FAIL_INT_VALID_BIT))
#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
>> 8)
#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
>> 32)
#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
>> 48)
#define MMP_INTERVAL_SET(write) \
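Note: the only change in this hunk is wrapping the ubp macro argument in parentheses. A small standalone illustration of why that matters (hypothetical, userspace-compilable; the struct and macros below are stand-ins, not the real uberblock definitions):

#include <stdio.h>

struct ub { unsigned long ub_mmp_config; };

/* old style: argument used without parentheses */
#define SEQ_OLD(ubp)	((ubp->ub_mmp_config & 0xFF00) >> 8)
/* new style, as in this change: argument parenthesized */
#define SEQ_NEW(ubp)	(((ubp)->ub_mmp_config & 0xFF00) >> 8)

int
main(void)
{
	struct ub a = { 0x0100 }, b = { 0x0200 };
	int use_b = 1;

	/*
	 * SEQ_OLD(use_b ? &b : &a) does not even compile: "->" binds to
	 * "&a" alone, expanding to "use_b ? &b : &a->ub_mmp_config ...".
	 * With the parenthesized form the whole conditional is the
	 * pointer, as intended.
	 */
	printf("%lu\n", SEQ_NEW(use_b ? &b : &a));	/* prints 2 */
	return (0);
}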

View File

@ -290,7 +290,7 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
boolean_t vdev_fault_wanted; /* async faulted wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */

View File

@ -503,6 +503,8 @@ extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
clock_t);
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern boolean_t taskq_try_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
taskq_ent_t *);
extern int taskq_empty_ent(taskq_ent_t *);
extern void taskq_init_ent(taskq_ent_t *);
extern void taskq_destroy(taskq_t *);

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2012, 2024 by Delphix. All rights reserved.
* Copyright 2016 RackTop Systems.
* Copyright (c) 2017, Intel Corporation.
*/
@ -447,6 +447,8 @@ typedef enum zinject_type {
ZINJECT_PANIC,
ZINJECT_DELAY_IO,
ZINJECT_DECRYPT_FAULT,
ZINJECT_DELAY_IMPORT,
ZINJECT_DELAY_EXPORT,
} zinject_type_t;
typedef struct zfs_share {

View File

@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2012, 2024 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
@ -685,6 +685,8 @@ extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
extern hrtime_t zio_handle_io_delay(zio_t *zio);
extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed);
extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed);
/*
* Checksum ereport functions

View File

@ -182,6 +182,8 @@ extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern size_t slack_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int slack_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
/*
* Compress and decompress data if necessary.

View File

@ -29,6 +29,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2021, 2023, Klara Inc.
*/
#include <errno.h>
@ -265,6 +266,7 @@ zpool_get_state_str(zpool_handle_t *zhp)
} else if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
str = gettext("FAULTED");
} else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT ||
status == ZPOOL_STATUS_IO_FAILURE_CONTINUE ||
status == ZPOOL_STATUS_IO_FAILURE_MMP) {
str = gettext("SUSPENDED");
} else {

View File

@ -156,8 +156,8 @@ taskq_init_ent(taskq_ent_t *t)
t->tqent_flags = 0;
}
void
taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
static void
taskq_dispatch_ent_impl(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
taskq_ent_t *t)
{
ASSERT(func != NULL);
@ -170,7 +170,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
/*
* Enqueue the task to the underlying queue.
*/
mutex_enter(&tq->tq_lock);
if (flags & TQ_FRONT) {
t->tqent_next = tq->tq_task.tqent_next;
@ -184,9 +183,28 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
t->tqent_func = func;
t->tqent_arg = arg;
cv_signal(&tq->tq_dispatch_cv);
}
void
taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
taskq_ent_t *t)
{
mutex_enter(&tq->tq_lock);
taskq_dispatch_ent_impl(tq, func, arg, flags, t);
mutex_exit(&tq->tq_lock);
}
boolean_t
taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
taskq_ent_t *t)
{
if (!mutex_tryenter(&tq->tq_lock))
return (B_FALSE);
taskq_dispatch_ent_impl(tq, func, arg, flags, t);
mutex_exit(&tq->tq_lock);
return (B_TRUE);
}
void
taskq_wait(taskq_t *tq)
{

View File

@ -2,6 +2,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@ -15,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd January 10, 2023
.Dd January 9, 2024
.Dt ZFS 4
.Os
.
@ -1305,6 +1306,29 @@ as fuller devices will tend to be slower than empty devices.
Also see
.Sy zio_dva_throttle_enabled .
.
.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
Maximum number of segments to add to a BIO (min 4).
If this is higher than the maximum allowed by the device queue or the kernel
itself, it will be clamped.
Setting it to zero will cause the kernel's ideal size to be used.
This parameter only applies on Linux.
This parameter is ignored if
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
.
.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
and earlier.
This "classic" method has known issues with highly fragmented IO requests and
is slower on many workloads, but it has been in use for many years and is known
to be very stable.
If you set this parameter, please also open a bug report describing why you did so,
including the workload involved and any error messages.
.Pp
This parameter and the classic submission method will be removed once we have
total confidence in the new method.
.Pp
This parameter only applies on Linux, and can only be set at module load time.
.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
.Pa .zfs/snapshot .
@ -2167,6 +2191,16 @@ If
.Sy 0 ,
generate a system-dependent value close to 6 threads per taskq.
.
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
Set the queue and thread configuration for the IO read queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
Set the queue and thread configuration for the IO write queues.
This is an advanced debugging parameter.
Don't change this unless you understand what it does.
.
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Do not create zvol device nodes.
This may slightly improve startup time on

View File

@ -127,6 +127,14 @@ Force a vdev error.
.
.It Xo
.Nm zinject
.Fl i Ar seconds
.Ar pool
.Xc
Add an artificial delay during the future import of a pool.
This injector is automatically cleared after the import is finished.
.
.It Xo
.Nm zinject
.Fl I
.Op Fl s Ar seconds Ns | Ns Fl g Ar txgs
.Ar pool

View File

@ -49,9 +49,10 @@ If the pool was suspended it will be brought back online provided the
devices can be accessed.
Pools with
.Sy multihost
enabled which have been suspended cannot be resumed.
While the pool was suspended, it may have been imported on
another host, and resuming I/O could result in pool damage.
enabled which have been suspended cannot be resumed when there is evidence
that the pool was imported by another host.
The same checks performed during an import will be applied before the clear
proceeds.
.
.Sh SEE ALSO
.Xr zdb 8 ,

View File

@ -411,6 +411,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
}
boolean_t
taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
taskq_ent_t *task)
{
/* XXX: implement me -- robn, 2023-10-23 */
taskq_dispatch_ent(tq, func, arg, flags, task);
return (B_TRUE);
}
void
taskq_wait(taskq_t *tq)
{

View File

@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_pos = 0;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
/*

View File

@ -673,17 +673,13 @@ out:
}
EXPORT_SYMBOL(taskq_dispatch_delay);
void
taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
static void
taskq_dispatch_ent_impl(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
taskq_ent_t *t)
{
unsigned long irqflags;
ASSERT(tq);
ASSERT(func);
spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
tq->tq_lock_class);
/* Taskq being destroyed and all tasks drained */
if (!(tq->tq_flags & TASKQ_ACTIVE)) {
t->tqent_id = TASKQID_INVALID;
@ -694,7 +690,7 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
/* Dynamic taskq may be able to spawn another thread */
if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
taskq_thread_spawn(tq) == 0)
goto out2;
return;
flags |= TQ_FRONT;
}
@ -734,11 +730,45 @@ out:
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
out2:
}
void
taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
taskq_ent_t *t)
{
unsigned long irqflags;
spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
tq->tq_lock_class);
taskq_dispatch_ent_impl(tq, func, arg, flags, t);
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
}
EXPORT_SYMBOL(taskq_dispatch_ent);
boolean_t
taskq_try_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
taskq_ent_t *t)
{
unsigned long irqflags;
/*
* XXX I don't _think_ losing _nested matters, because I think it's
* only related to lockdep, and we don't have access to that anyway
* -- robn, 2023-10-23
*/
if (!spin_trylock_irqsave(&tq->tq_lock, irqflags))
return (B_FALSE);
taskq_dispatch_ent_impl(tq, func, arg, flags, t);
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
return (B_TRUE);
}
EXPORT_SYMBOL(taskq_try_dispatch_ent);
int
taskq_empty_ent(taskq_ent_t *t)
{

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
/*
@ -59,7 +60,9 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#else
#define MAX_ORDER 1
#endif
@ -884,14 +887,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
aiter->iter_pos = 0;
if (abd_is_linear(abd)) {
aiter->iter_offset = 0;
aiter->iter_sg = NULL;
} else {
if (!abd_is_linear(abd)) {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
@ -904,6 +902,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
@ -915,8 +914,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
/*
* Ensure that last chunk is not in use. abd_iterate_*() must clear
* this state (directly or abd_iter_unmap()) before advancing.
*/
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
ASSERT3P(aiter->iter_page, ==, NULL);
ASSERT0(aiter->iter_page_doff);
ASSERT0(aiter->iter_page_dsize);
/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
@ -998,6 +1004,106 @@ abd_cache_reap_now(void)
}
#if defined(_KERNEL)
/*
* Yield the next page struct and data offset and size within it, without
* mapping it into the address space.
*/
void
abd_iter_page(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter)) {
aiter->iter_page = NULL;
aiter->iter_page_doff = 0;
aiter->iter_page_dsize = 0;
return;
}
struct page *page;
size_t doff, dsize;
if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
/* memory address at iter_pos */
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
/* struct page for address */
page = is_vmalloc_addr(paddr) ?
vmalloc_to_page(paddr) : virt_to_page(paddr);
/* offset of address within the page */
doff = offset_in_page(paddr);
/* total data remaining in abd from this position */
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
} else {
ASSERT(!abd_is_gang(aiter->iter_abd));
/* current scatter page */
page = sg_page(aiter->iter_sg);
/* position within page */
doff = aiter->iter_offset;
/* remaining data in scatterlist */
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
}
ASSERT(page);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
if (PageTail(page)) {
/*
* This page is part of a "compound page", which is a group of
* pages that can be referenced from a single struct page *.
* It's organised as a "head" page, followed by a series of
* "tail" pages.
*
* In OpenZFS, compound pages are allocated using the
* __GFP_COMP flag, which we get from scatter ABDs and SPL
* vmalloc slabs (ie >16K allocations). So a great many of the
* IO buffers we get are going to be of this type.
*
* The tail pages are just regular PAGE_SIZE pages, and can be
* safely used as-is. However, the head page has length
* covering itself and all the tail pages. If this ABD chunk
* spans multiple pages, then we can use the head page and a
* >PAGE_SIZE length, which is far more efficient.
*
* To do this, we need to adjust the offset to be counted from
* the head page. struct page for compound pages are stored
* contiguously, so we can just adjust by a simple offset.
*
* Before kernel 4.5, compound page heads were refcounted
* separately, such that moving back to the head page would
* require us to take a reference to it and releasing it once
* we're completely finished with it. In practice, that means
* when our caller is done with the ABD, which we have no
* insight into from here. Rather than contort this API to
* track head page references on such ancient kernels, we just
* compile this block out and use the tail pages directly. This
* is slightly less efficient, but makes everything far
* simpler.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
page = head;
}
#endif
/* final page and position within it */
aiter->iter_page = page;
aiter->iter_page_doff = doff;
/* amount of data in the chunk, up to the end of the page */
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}
/*
* Note: ABD BIO functions are only needed to support vdev_classic. See comments in
* vdev_disk.c.
*/
/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
@ -1220,4 +1326,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif
#endif /* _KERNEL */

View File

@ -47,6 +47,7 @@
#include <sys/trace_dnode.h>
#include <sys/trace_multilist.h>
#include <sys/trace_rrwlock.h>
#include <sys/trace_spa_taskqs.h>
#include <sys/trace_txg.h>
#include <sys/trace_vdev.h>
#include <sys/trace_zil.h>

View File

@ -24,6 +24,7 @@
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@ -49,11 +50,11 @@ typedef struct vdev_disk {
int zio_suppress_zero_writes = B_TRUE;
/*
* Maximum number of segments to add to a bio. If this is higher than the
* maximum allowed by the device queue or the kernel itself, it will be
* Maximum number of segments to add to a bio (min 4). If this is higher than
* the maximum allowed by the device queue or the kernel itself, it will be
* clamped. Setting it to zero will cause the kernel's ideal size to be used.
*/
unsigned long vdev_disk_max_segs = 0;
uint_t zfs_vdev_disk_max_segs = 0;
/*
* Unique identifier for the exclusive vdev holder.
@ -72,20 +73,22 @@ static unsigned zfs_vdev_open_timeout_ms = 1000;
*/
#define EFI_MIN_RESV_SIZE (16 * 1024)
/*
* Virtual device vector for disks.
*/
typedef struct dio_request {
zio_t *dr_zio; /* Parent ZIO */
atomic_t dr_ref; /* References */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[0]; /* Attached bio's */
} dio_request_t;
#ifdef HAVE_BLK_MODE_T
static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode)
{
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = 0;
if (spa_mode & SPA_MODE_READ)
mode |= BLK_OPEN_READ;
if (spa_mode & SPA_MODE_WRITE)
mode |= BLK_OPEN_WRITE;
#else
fmode_t mode = 0;
if (spa_mode & SPA_MODE_READ)
@ -93,6 +96,7 @@ vdev_bdev_mode(spa_mode_t spa_mode)
if (spa_mode & SPA_MODE_WRITE)
mode |= FMODE_WRITE;
#endif
return (mode);
}
@ -355,98 +359,15 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
if (vd->vd_bdev != NULL) {
if (vd->vd_bdev != NULL)
blkdev_put(vd->vd_bdev,
vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
}
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
v->vdev_tsd = NULL;
}
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
sizeof (struct bio *) * bio_count, KM_SLEEP);
atomic_set(&dr->dr_ref, 0);
dr->dr_bio_count = bio_count;
dr->dr_error = 0;
for (int i = 0; i < dr->dr_bio_count; i++)
dr->dr_bio[i] = NULL;
return (dr);
}
static void
vdev_disk_dio_free(dio_request_t *dr)
{
int i;
for (i = 0; i < dr->dr_bio_count; i++)
if (dr->dr_bio[i])
bio_put(dr->dr_bio[i]);
kmem_free(dr, sizeof (dio_request_t) +
sizeof (struct bio *) * dr->dr_bio_count);
}
static void
vdev_disk_dio_get(dio_request_t *dr)
{
atomic_inc(&dr->dr_ref);
}
static int
vdev_disk_dio_put(dio_request_t *dr)
{
int rc = atomic_dec_return(&dr->dr_ref);
/*
* Free the dio_request when the last reference is dropped and
* ensure zio_interpret is called only once with the correct zio
*/
if (rc == 0) {
zio_t *zio = dr->dr_zio;
int error = dr->dr_error;
vdev_disk_dio_free(dr);
if (zio) {
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
zio_delay_interrupt(zio);
}
}
return (rc);
}
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
dio_request_t *dr = bio->bi_private;
int rc;
if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
dr->dr_error = BIO_END_IO_ERROR(bio);
#else
if (error)
dr->dr_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
dr->dr_error = EIO;
#endif
}
/* Drop reference acquired by __vdev_disk_physio */
rc = vdev_disk_dio_put(dr);
}
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
@ -598,13 +519,17 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
return (bio);
}
static inline unsigned int
vdev_bio_max_segs(struct block_device *bdev) {
const unsigned long tune_max_segs =
vdev_disk_max_segs > 0 ? vdev_disk_max_segs : ULONG_MAX;
const unsigned long dev_max_segs =
queue_max_segments(bdev_get_queue(bdev));
const unsigned long max_segs = MIN(tune_max_segs, dev_max_segs);
static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
/*
* Smallest of the device max segs and the tuneable max segs. Minimum
* 4, so there's room to finish split pages if they come up.
*/
const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
#ifdef HAVE_BIO_MAX_SEGS
return (bio_max_segs(max_segs));
@ -613,10 +538,461 @@ vdev_bio_max_segs(struct block_device *bdev) {
#endif
}
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
size_t io_size, uint64_t io_offset, int rw, int flags)
static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}
/*
* Virtual block IO object (VBIO)
*
* Linux block IO (BIO) objects have a limit on how many data segments (pages)
* they can hold. Depending on how they're allocated and structured, a large
* ZIO can require more than one BIO to be submitted to the kernel, which then
* all have to complete before we can return the completed ZIO back to ZFS.
*
* A VBIO is a wrapper around multiple BIOs, carrying everything needed to
* translate a ZIO down into the kernel block layer and back again.
*
* Note that these are only used for data ZIOs (read/write). Meta-operations
* (flush/trim) don't need multiple BIOs and so can just make the call
* directly.
*/
typedef struct {
zio_t *vbio_zio; /* parent zio */
struct block_device *vbio_bdev; /* blockdev to submit bios to */
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
uint_t vbio_max_segs; /* max segs per bio */
uint_t vbio_max_bytes; /* max bytes per bio */
uint_t vbio_lbs_mask; /* logical block size mask */
uint64_t vbio_offset; /* start offset of next bio */
struct bio *vbio_bio; /* pointer to the current bio */
int vbio_flags; /* bio flags */
} vbio_t;
static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
vbio->vbio_zio = zio;
vbio->vbio_bdev = bdev;
vbio->vbio_abd = NULL;
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
vbio->vbio_lbs_mask = bdev_logical_block_size(bdev)-1;
vbio->vbio_offset = zio->io_offset;
vbio->vbio_bio = NULL;
vbio->vbio_flags = flags;
return (vbio);
}
BIO_END_IO_PROTO(vbio_completion, bio, error);
static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
struct bio *bio = vbio->vbio_bio;
uint_t ssize;
while (size > 0) {
if (bio == NULL) {
/* New BIO, allocate and set up */
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
vbio->vbio_max_segs);
VERIFY(bio);
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
bio_set_op_attrs(bio,
vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
WRITE : READ, vbio->vbio_flags);
if (vbio->vbio_bio) {
bio_chain(vbio->vbio_bio, bio);
vdev_submit_bio(vbio->vbio_bio);
}
vbio->vbio_bio = bio;
}
/*
* Only load as much of the current page data as will fit in
* the space left in the BIO, respecting lbs alignment. Older
* kernels will error if we try to overfill the BIO, while
* newer ones will accept it and split the BIO. This ensures
* everything works on older kernels, and avoids an additional
* overhead on the new.
*/
ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
~(vbio->vbio_lbs_mask));
if (ssize > 0 &&
bio_add_page(bio, page, ssize, offset) == ssize) {
/* Accepted, adjust and load any remaining. */
size -= ssize;
offset += ssize;
continue;
}
/* No room, set up for a new BIO and loop */
vbio->vbio_offset += BIO_BI_SIZE(bio);
/* Signal new BIO allocation wanted */
bio = NULL;
}
return (0);
}
/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
vbio_t *vbio = priv;
return (vbio_add_page(vbio, page, len, off));
}
/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
/*
* We plug so we can submit the BIOs as we go and only unplug them when
* they are fully created and submitted. This is important; if we don't
* plug, then the kernel may start executing earlier BIOs while we're
* still creating and executing later ones, and if the device goes
* away while that's happening, older kernels can get confused and
* trample memory.
*/
struct blk_plug plug;
blk_start_plug(&plug);
(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
ASSERT(vbio->vbio_bio);
vbio->vbio_bio->bi_end_io = vbio_completion;
vbio->vbio_bio->bi_private = vbio;
/*
* Once submitted, vbio_bio now owns vbio (through bi_private) and we
* can't touch it again. The bio may complete and vbio_completion() be
* called and free the vbio before this task is run again, so we must
* consider it invalid from this point.
*/
vdev_submit_bio(vbio->vbio_bio);
blk_finish_plug(&plug);
}
/* IO completion callback */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
vbio_t *vbio = bio->bi_private;
zio_t *zio = vbio->vbio_zio;
ASSERT(zio);
/* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
zio->io_error = BIO_END_IO_ERROR(bio);
#else
zio->io_error = 0;
if (error)
zio->io_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
zio->io_error = EIO;
#endif
ASSERT3U(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
/* Return the BIO to the kernel */
bio_put(bio);
/*
* If we copied the ABD before issuing it, clean up and return the copy
* to the ABD, with changes if appropriate.
*/
if (vbio->vbio_abd != NULL) {
void *buf = abd_to_buf(vbio->vbio_abd);
abd_free(vbio->vbio_abd);
vbio->vbio_abd = NULL;
if (zio->io_type == ZIO_TYPE_READ)
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
else
abd_return_buf(zio->io_abd, buf, zio->io_size);
}
/* Final cleanup */
kmem_free(vbio, sizeof (vbio_t));
/* All done, submit for processing */
zio_delay_interrupt(zio);
}
/*
* Iterator callback to count ABD pages and check their size & alignment.
*
* On Linux, each BIO segment can take a page pointer, and an offset+length of
* the data within that page. A page can be arbitrarily large ("compound"
* pages) but we still have to ensure the data portion is correctly sized and
* aligned to the logical block size, to ensure that if the kernel wants to
* split the BIO, the two halves will still be properly aligned.
*/
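/*
 * Illustrative example (assuming 4K logical blocks): an ABD whose first
 * fragment is 512 bytes followed by a full page cannot be submitted
 * as-is; the first callback below leaves s->end == 512, so the second
 * call aborts and the caller falls back to a linear copy instead.
 */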
typedef struct {
uint_t bmask;
uint_t npages;
uint_t end;
} vdev_disk_check_pages_t;
static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
vdev_disk_check_pages_t *s = priv;
/*
* If we didn't finish on a block size boundary last time, then there
* would be a gap if we tried to use this ABD as-is, so abort.
*/
if (s->end != 0)
return (1);
/*
* Note if we're taking less than a full block, so we can check it
* above on the next call.
*/
s->end = (off+len) & s->bmask;
/* All blocks after the first must start on a block size boundary. */
if (s->npages != 0 && (off & s->bmask) != 0)
return (1);
s->npages++;
return (0);
}
/*
 * Check if we can submit the pages in this ABD to the kernel as-is.
 * Returns B_TRUE if they can be submitted directly, B_FALSE if not
 * (in which case the caller must take a linear copy first).
 */
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
vdev_disk_check_pages_t s = {
.bmask = bdev_logical_block_size(bdev)-1,
.npages = 0,
.end = 0,
};
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
return (B_FALSE);
return (B_TRUE);
}
static int
vdev_disk_io_rw(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
struct block_device *bdev = vd->vd_bdev;
int flags = 0;
/*
* Accessing outside the block device is never allowed.
*/
if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
vdev_dbgmsg(zio->io_vd,
"Illegal access %llu size %llu, device size %llu",
(u_longlong_t)zio->io_offset,
(u_longlong_t)zio->io_size,
(u_longlong_t)i_size_read(bdev->bd_inode));
return (SET_ERROR(EIO));
}
if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bio_set_flags_failfast(bdev, &flags);
/*
* Check alignment of the incoming ABD. If any part of it would require
* submitting a page that is not aligned to the logical block size,
* then we take a copy into a linear buffer and submit that instead.
* This should be impossible on a 512b LBS, and fairly rare on 4K,
* usually requiring abnormally-small data blocks (eg gang blocks)
* mixed into the same ABD as larger ones (eg aggregated).
*/
abd_t *abd = zio->io_abd;
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
void *buf;
if (zio->io_type == ZIO_TYPE_READ)
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
else
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
/*
* Wrap the copy in an abd_t, so we can use the same iterators
* to count and fill the vbio later.
*/
abd = abd_get_from_buf(buf, zio->io_size);
/*
 * False here would mean the borrowed copy has an invalid
 * alignment too, which would mean we've somehow been passed a
 * linear ABD with an interior page that has a non-zero offset
 * or a size not a multiple of PAGE_SIZE. This is not possible.
 * It would mean either zio_buf_alloc() or its underlying
 * allocators have done something extremely strange, or our
 * math in vdev_disk_check_pages() is wrong. In either case,
 * something is seriously wrong and it's not safe to continue.
 */
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
}
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
vbio_t *vbio = vbio_alloc(zio, bdev, flags);
if (abd != zio->io_abd)
vbio->vbio_abd = abd;
/* Fill it with data pages and submit it to the kernel */
vbio_submit(vbio, abd, zio->io_size);
return (0);
}
/* ========== */
/*
* This is the classic, battle-tested BIO submission code. Until we're totally
* sure that the new code is safe and correct in all cases, this will remain
* available and can be enabled by setting zfs_vdev_disk_classic=1 at module
* load time.
*
* These functions have been renamed to vdev_classic_* to make it clear what
* they belong to, but their implementations are unchanged.
*/
/*
* Virtual device vector for disks.
*/
typedef struct dio_request {
zio_t *dr_zio; /* Parent ZIO */
atomic_t dr_ref; /* References */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[]; /* Attached bio's */
} dio_request_t;
static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
sizeof (struct bio *) * bio_count, KM_SLEEP);
atomic_set(&dr->dr_ref, 0);
dr->dr_bio_count = bio_count;
dr->dr_error = 0;
for (int i = 0; i < dr->dr_bio_count; i++)
dr->dr_bio[i] = NULL;
return (dr);
}
static void
vdev_classic_dio_free(dio_request_t *dr)
{
int i;
for (i = 0; i < dr->dr_bio_count; i++)
if (dr->dr_bio[i])
bio_put(dr->dr_bio[i]);
kmem_free(dr, sizeof (dio_request_t) +
sizeof (struct bio *) * dr->dr_bio_count);
}
static void
vdev_classic_dio_get(dio_request_t *dr)
{
atomic_inc(&dr->dr_ref);
}
static void
vdev_classic_dio_put(dio_request_t *dr)
{
int rc = atomic_dec_return(&dr->dr_ref);
/*
* Free the dio_request when the last reference is dropped and
* ensure zio_interpret is called only once with the correct zio
*/
if (rc == 0) {
zio_t *zio = dr->dr_zio;
int error = dr->dr_error;
vdev_classic_dio_free(dr);
if (zio) {
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
zio_delay_interrupt(zio);
}
}
}
BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
dio_request_t *dr = bio->bi_private;
if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
dr->dr_error = BIO_END_IO_ERROR(bio);
#else
if (error)
dr->dr_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
dr->dr_error = EIO;
#endif
}
/* Drop reference acquired by vdev_classic_physio */
vdev_classic_dio_put(dr);
}
static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
bio_size, abd_offset);
#ifdef HAVE_BIO_MAX_SEGS
return (bio_max_segs(nr_segs));
#else
return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}
static int
vdev_classic_physio(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
struct block_device *bdev = vd->vd_bdev;
size_t io_size = zio->io_size;
uint64_t io_offset = zio->io_offset;
int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
int flags = 0;
dio_request_t *dr;
uint64_t abd_offset;
uint64_t bio_offset;
@ -637,7 +1013,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
}
retry:
dr = vdev_disk_dio_alloc(bio_count);
dr = vdev_classic_dio_alloc(bio_count);
if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bio_set_flags_failfast(bdev, &flags);
@ -669,23 +1045,23 @@ retry:
* this should be rare - see the comment above.
*/
if (dr->dr_bio_count == i) {
vdev_disk_dio_free(dr);
vdev_classic_dio_free(dr);
bio_count *= 2;
goto retry;
}
nr_vecs = vdev_bio_max_segs(bdev);
nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
if (unlikely(dr->dr_bio[i] == NULL)) {
vdev_disk_dio_free(dr);
vdev_classic_dio_free(dr);
return (SET_ERROR(ENOMEM));
}
/* Matching put called by vdev_disk_physio_completion */
vdev_disk_dio_get(dr);
/* Matching put called by vdev_classic_physio_completion */
vdev_classic_dio_get(dr);
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
dr->dr_bio[i]->bi_private = dr;
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
@ -707,7 +1083,7 @@ retry:
}
/* Extra reference to protect dio_request during vdev_submit_bio */
vdev_disk_dio_get(dr);
vdev_classic_dio_get(dr);
if (dr->dr_bio_count > 1)
blk_start_plug(&plug);
@ -721,11 +1097,13 @@ retry:
if (dr->dr_bio_count > 1)
blk_finish_plug(&plug);
(void) vdev_disk_dio_put(dr);
(void) vdev_classic_dio_put(dr);
return (error);
}
/* ========== */
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
zio_t *zio = bio->bi_private;
@ -795,12 +1173,14 @@ vdev_disk_io_trim(zio_t *zio)
#endif
}
int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
int rw, error;
int error;
/*
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
@ -879,13 +1259,6 @@ vdev_disk_io_start(zio_t *zio)
rw_exit(&vd->vd_lock);
zio_execute(zio);
return;
case ZIO_TYPE_WRITE:
rw = WRITE;
break;
case ZIO_TYPE_READ:
rw = READ;
break;
case ZIO_TYPE_TRIM:
zio->io_error = vdev_disk_io_trim(zio);
@ -893,23 +1266,34 @@ vdev_disk_io_start(zio_t *zio)
zio_interrupt(zio);
return;
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = vdev_disk_io_rw_fn(zio);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
zio_interrupt(zio);
}
return;
default:
/*
* Getting here means our parent vdev has made a very strange
* request of us, and shouldn't happen. Assert here to force a
* crash in dev builds, but in production return the IO
* unhandled. The pool will likely suspend anyway but that's
* nicer than crashing the kernel.
*/
ASSERT3S(zio->io_type, ==, -1);
rw_exit(&vd->vd_lock);
zio->io_error = SET_ERROR(ENOTSUP);
zio_interrupt(zio);
return;
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = __vdev_disk_physio(vd->vd_bdev, zio,
zio->io_size, zio->io_offset, rw, 0);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
zio_interrupt(zio);
return;
}
__builtin_unreachable();
}
static void
@ -958,8 +1342,49 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
/*
* BIO submission method. See comment above about vdev_classic.
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
*/
static uint_t zfs_vdev_disk_classic = 0; /* default new */
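/*
 * Hypothetical load-time selection of the classic path (the parameter
 * is registered ZMOD_RD below, so it cannot be flipped at runtime):
 *
 *	modprobe zfs zfs_vdev_disk_classic=1
 */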
/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
int err = param_set_uint(buf, kp);
if (err < 0)
return (SET_ERROR(err));
vdev_disk_io_rw_fn =
zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
zfs_vdev_disk_classic ? "classic" : "new");
return (0);
}
/*
 * At first vdev use, set the submission function from the default value
 * if it hasn't been set already.
 */
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
(void) spa;
(void) nv;
(void) tsd;
if (vdev_disk_io_rw_fn == NULL)
vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
vdev_classic_physio : vdev_disk_io_rw;
return (0);
}
vdev_ops_t vdev_disk_ops = {
.vdev_op_init = NULL,
.vdev_op_init = vdev_disk_init,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
@ -1049,5 +1474,12 @@ param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
ZFS_MODULE_PARAM(zfs_zio, zio_, suppress_zero_writes, INT, ZMOD_RW,
"Do not send zero byte writes to hardware");
ZFS_MODULE_PARAM(zfs_vdev_disk, vdev_disk_, max_segs, ULONG, ZMOD_RW,
"Maximum number of data segments to add to an IO request");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
"Timeout before determining that a device is missing");
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
"Maximum number of data segments to add to an IO request (min 4)");
ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
"Use classic BIO submission method");

View File

@ -816,6 +816,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
return (ret);
}
#if defined(__linux__) && defined(_KERNEL)
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
abd_iter_page_func_t *func, void *private)
{
struct abd_iter aiter;
int ret = 0;
if (size == 0)
return (0);
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
IMPLY(abd_is_gang(abd), c_abd != NULL);
abd_iter_page(&aiter);
size_t len = MIN(aiter.iter_page_dsize, size);
ASSERT3U(len, >, 0);
ret = func(aiter.iter_page, aiter.iter_page_doff,
len, private);
aiter.iter_page = NULL;
aiter.iter_page_doff = 0;
aiter.iter_page_dsize = 0;
if (ret != 0)
break;
size -= len;
c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
}
return (ret);
}
#endif
struct buf_arg {
void *arg_buf;
};

View File

@ -8491,11 +8491,11 @@ l2arc_dev_get_next(void)
break;
} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all);
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting);
/* if we were unable to find any usable vdevs, return NULL */
if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
next->l2ad_trim_all)
next->l2ad_trim_all || next->l2ad_spa->spa_is_exporting)
next = NULL;
l2arc_dev_last = next;
@ -10145,7 +10145,8 @@ l2arc_spa_rebuild_start(spa_t *spa)
void
l2arc_spa_rebuild_stop(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
/*
* Locate the spa's l2arc devices and kick off rebuild threads.

View File

@ -554,6 +554,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif
if (db->db_caching_status != DB_NO_CACHE) {
/*
 * This is a cached dbuf, so the size of the user data is
 * included in its cached amount. We adjust it here because the
 * user data has already been detached from the dbuf, and the
 * sync functions are not supposed to touch it (the dbuf might
 * not exist anymore by the time the sync functions run).
 */
uint64_t size = dbu->dbu_size;
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_CACHE)
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
@ -693,12 +708,12 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
&dbuf_caches[DB_DBUF_CACHE].size, size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
@ -2808,6 +2823,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
@ -3540,17 +3557,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
&dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
db->db.db_size);
DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
db->db_caching_status = DB_NO_CACHE;
}
@ -3782,7 +3799,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
db->db_caching_status = dcs;
multilist_insert(&dbuf_caches[dcs].cache, db);
uint64_t db_size = db->db.db_size;
uint64_t db_size = db->db.db_size +
dmu_buf_user_size(&db->db);
size = zfs_refcount_add_many(
&dbuf_caches[dcs].size, db_size, db);
uint8_t db_level = db->db_level;
@ -3885,6 +3903,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
return (db->db_user);
}
uint64_t
dmu_buf_user_size(dmu_buf_t *db_fake)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
if (db->db_user == NULL)
return (0);
return (atomic_load_64(&db->db_user->dbu_size));
}
void
dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT3P(db->db_user, !=, NULL);
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
atomic_add_64(&db->db_user->dbu_size, nadd);
}
void
dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT3P(db->db_user, !=, NULL);
ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
atomic_sub_64(&db->db_user->dbu_size, nsub);
}
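/*
 * Usage sketch, based on the dnode.c hunks later in this commit:
 * dnode_hold_impl() adds sizeof (dnode_t) to the containing dnode dbuf
 * each time a dnode_t is created under it, and subtracts the same amount
 * when slots are reclaimed, so the dbuf cache charges track the attached
 * dnodes.
 */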
void
dmu_buf_user_evict_wait(void)
{

View File

@ -46,14 +46,14 @@ static int
dbuf_stats_hash_table_headers(char *buf, size_t size)
{
(void) snprintf(buf, size,
"%-96s | %-119s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
"%-105s | %-119s | %s\n"
"%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-8s %-5s %-5s %-7s %3s | "
"%-5s %-5s %-9s %-6s %-8s %-12s "
"%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
"%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
"dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
"blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
"list", "atype", "flags", "count", "asize", "access",
"blkid", "offset", "dbsize", "usize", "meta", "state", "dbholds",
"dbc", "list", "atype", "flags", "count", "asize", "access",
"mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
"l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
"bsize", "lvls", "dholds", "blocks", "dsize");
@ -75,8 +75,8 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
__dmu_object_info_from_dnode(dn, &doi);
nwritten = snprintf(buf, size,
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
"%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-8llu "
"%-5d %-5d %-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
"%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
"%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
/* dmu_buf_impl_t */
@ -87,6 +87,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
(longlong_t)db->db_blkid,
(u_longlong_t)db->db.db_offset,
(u_longlong_t)db->db.db_size,
(u_longlong_t)dmu_buf_user_size(&db->db),
!!dbuf_is_metadata(db),
db->db_state,
(ulong_t)zfs_refcount_count(&db->db_holds),

View File

@ -1120,9 +1120,11 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
return (B_TRUE);
}
static void
static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
uint_t reclaimed = 0;
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
@ -1134,8 +1136,11 @@ dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
reclaimed++;
}
}
return (reclaimed);
}
void
@ -1448,6 +1453,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db,
sizeof (dnode_t));
}
}
@ -1505,8 +1512,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
if (slots > 1)
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
if (slots > 1) {
uint_t reclaimed =
dnode_reclaim_slots(dnc, idx + 1, slots - 1);
if (reclaimed > 0)
dmu_buf_sub_user_size(&db->db,
reclaimed * sizeof (dnode_t));
}
dnh = &dnc->dnc_children[idx];
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
@ -1514,6 +1526,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}
mutex_enter(&dn->dn_mtx);

View File

@ -662,12 +662,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
"mmp_fail_intervals %llu mmp_fail_ns %llu",
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
(u_longlong_t)mmp_fail_ns);
(u_longlong_t)mmp_fail_ns,
(u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",

View File

@ -54,3 +54,11 @@ slack_compress(void *src, void *dst, size_t s_len, size_t d_len, int level)
memcpy(dst, src, c_len);
return (c_len);
}
int
slack_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level)
{
ASSERT3U(d_len, >=, s_len);
memcpy(dst, src, s_len);
return (0);
}

View File

@ -33,6 +33,7 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2024, Klara Inc.
*/
/*
@ -87,6 +88,7 @@
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
#include <sys/zvol.h>
#include <sys/trace_zfs.h>
#ifdef _KERNEL
#include <sys/fm/protocol.h>
@ -150,7 +152,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */
@ -172,6 +174,14 @@ uint_t zio_taskq_batch_tpq; /* threads per taskq */
boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
uint_t zio_taskq_basedc = 80; /* base duty cycle */
/*
* If enabled, try to find an unlocked IO taskq to dispatch an IO onto before
* falling back to waiting on a lock. This should only be enabled in
* conjunction with careful performance testing, and will likely require
* zio_taskq_read/zio_taskq_write to be adjusted as well.
*/
boolean_t zio_taskq_trylock = B_FALSE;
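/*
 * Hypothetical way to enable the trylock behaviour at module load time
 * (the parameter is registered ZMOD_RD at the bottom of this file):
 *
 *	modprobe zfs zio_taskq_trylock=1
 */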
boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
/*
@ -982,6 +992,9 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
uint_t cpus, flags = TASKQ_DYNAMIC;
boolean_t batch = B_FALSE;
tqs->stqs_type = q;
tqs->stqs_zio_type = t;
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >, 0);
@ -1114,29 +1127,313 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
tqs->stqs_taskq = NULL;
}
#ifdef _KERNEL
/*
 * The READ and WRITE rows of zio_taskqs are configurable at module load time
 * by setting zio_taskq_read or zio_taskq_write.
 *
 * Example (the defaults for READ and WRITE):
 *	zio_taskq_read='fixed,1,8 null scale null'
 *	zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
 *
 * Each sets the entire row at a time.
 *
 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
 * of threads per taskq.
 *
 * 'null' can only be set on the high-priority queues (queue selection for
 * high-priority queues will fall back to the regular queue if the high-pri
 * one is NULL).
 */
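/*
 * Hypothetical load-time invocations on Linux (both parameters are
 * registered ZMOD_RD at the bottom of this file, so they are load-time
 * only):
 *
 *	modprobe zfs zio_taskq_read='fixed,1,8 null scale null'
 *	modprobe zfs zio_taskq_write='batch fixed,1,5 scale fixed,1,5'
 */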
static const char *const modes[ZTI_NMODES] = {
"fixed", "batch", "scale", "null"
};
/* Parse the incoming config string. Modifies cfg */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
int err = 0;
zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
char *next = cfg, *tok, *c;
/*
* Parse out each element from the string and fill `row`. The entire
* row has to be set at once, so any errors are flagged by just
* breaking out of this loop early.
*/
uint_t q;
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
/* `next` is the start of the config */
if (next == NULL)
break;
/* Eat up leading space */
while (isspace(*next))
next++;
if (*next == '\0')
break;
/* Mode ends at space or end of string */
tok = next;
next = strchr(tok, ' ');
if (next != NULL) *next++ = '\0';
/* Parameters start after a comma */
c = strchr(tok, ',');
if (c != NULL) *c++ = '\0';
/* Match mode string */
uint_t mode;
for (mode = 0; mode < ZTI_NMODES; mode++)
if (strcmp(tok, modes[mode]) == 0)
break;
if (mode == ZTI_NMODES)
break;
/* Invalid canary */
row[q].zti_mode = ZTI_NMODES;
/* Per-mode setup */
switch (mode) {
/*
* FIXED is parameterised: number of queues, and number of
* threads per queue.
*/
case ZTI_MODE_FIXED: {
/* No parameters? */
if (c == NULL || *c == '\0')
break;
/* Find next parameter */
tok = c;
c = strchr(tok, ',');
if (c == NULL)
break;
/* Take digits and convert */
unsigned long long nq;
if (!(isdigit(*tok)))
break;
err = ddi_strtoull(tok, &tok, 10, &nq);
/* Must succeed and also end at the next param sep */
if (err != 0 || tok != c)
break;
/* Move past the comma */
tok++;
/* Need another number */
if (!(isdigit(*tok)))
break;
/* Remember start to make sure we moved */
c = tok;
/* Take digits */
unsigned long long ntpq;
err = ddi_strtoull(tok, &tok, 10, &ntpq);
/* Must succeed, and moved forward */
if (err != 0 || tok == c || *tok != '\0')
break;
/*
* sanity; zero queues/threads make no sense, and
* 16K is almost certainly more than anyone will ever
* need and avoids silly numbers like UINT32_MAX
*/
if (nq == 0 || nq >= 16384 ||
ntpq == 0 || ntpq >= 16384)
break;
const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
row[q] = zti;
break;
}
case ZTI_MODE_BATCH: {
const zio_taskq_info_t zti = ZTI_BATCH;
row[q] = zti;
break;
}
case ZTI_MODE_SCALE: {
const zio_taskq_info_t zti = ZTI_SCALE;
row[q] = zti;
break;
}
case ZTI_MODE_NULL: {
/*
* Can only null the high-priority queues; the general-
* purpose ones have to exist.
*/
if (q != ZIO_TASKQ_ISSUE_HIGH &&
q != ZIO_TASKQ_INTERRUPT_HIGH)
break;
const zio_taskq_info_t zti = ZTI_NULL;
row[q] = zti;
break;
}
default:
break;
}
/* Ensure we set a mode */
if (row[q].zti_mode == ZTI_NMODES)
break;
}
/* Didn't get a full row, fail */
if (q < ZIO_TASKQ_TYPES)
return (SET_ERROR(EINVAL));
/* Eat trailing space */
if (next != NULL)
while (isspace(*next))
next++;
/* If there's anything left over then fail */
if (next != NULL && *next != '\0')
return (SET_ERROR(EINVAL));
/* Success! Copy it into the real config */
for (q = 0; q < ZIO_TASKQ_TYPES; q++)
zio_taskqs[t][q] = row[q];
return (0);
}
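/*
 * Parsing sketch (hypothetical inputs): "fixed,8,4 null scale null"
 * fills a complete row and is accepted; "fixed,0,4 null scale null" is
 * rejected by the sanity check above (nq == 0); "scale scale scale"
 * fails because only three of the four queue types are supplied.
 */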
static int
spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
{
int pos = 0;
/* Build parameter string from live config */
const char *sep = "";
for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *zti = &zio_taskqs[t][q];
if (zti->zti_mode == ZTI_MODE_FIXED)
pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
modes[zti->zti_mode], zti->zti_count,
zti->zti_value);
else
pos += sprintf(&buf[pos], "%s%s", sep,
modes[zti->zti_mode]);
sep = " ";
}
if (add_newline)
buf[pos++] = '\n';
buf[pos] = '\0';
return (pos);
}
#ifdef __linux__
static int
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
}
static int
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
{
char *cfg = kmem_strdup(val);
int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
kmem_free(cfg, strlen(val)+1);
return (-err);
}
static int
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
{
return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
}
#else
/*
* On FreeBSD load-time parameters can be set up before malloc() is available,
* so we have to do all the parsing work on the stack.
*/
#define SPA_TASKQ_PARAM_MAX (128)
static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err;
(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err || req->newptr == NULL)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
}
static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
{
char buf[SPA_TASKQ_PARAM_MAX];
int err;
(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
if (err || req->newptr == NULL)
return (err);
return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
}
#endif
#endif /* _KERNEL */
/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
* on the taskq itself. In that case we choose which taskq at random by using
* the low bits of gethrtime().
* on the taskq itself. In that case we try each one until it goes in, before
* falling back to waiting on a lock.
*/
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
DTRACE_PROBE2(spa_taskqs_ent__dispatch,
spa_taskqs_t *, tqs, taskq_ent_t *, ent);
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
taskq_dispatch_ent(tqs->stqs_taskq[0], func, arg, flags, ent);
goto out;
}
taskq_dispatch_ent(tq, func, arg, flags, ent);
int select = ((uint64_t)gethrtime()) % tqs->stqs_count;
if (zio_taskq_trylock) {
for (int i = 0; i < tqs->stqs_count; i++) {
if (taskq_try_dispatch_ent(
tqs->stqs_taskq[select], func, arg, flags, ent))
goto out;
select = (select+1) % tqs->stqs_count;
}
}
taskq_dispatch_ent(tqs->stqs_taskq[select], func, arg, flags, ent);
out:
DTRACE_PROBE2(spa_taskqs_ent__dispatched,
spa_taskqs_t *, tqs, taskq_ent_t *, ent);
}
/*
@ -1619,7 +1916,8 @@ spa_unload(spa_t *spa, txg_wait_flag_t txg_how)
vdev_t *vd;
uint64_t t, txg;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
spa_import_progress_remove(spa_guid(spa));
@ -2931,8 +3229,6 @@ spa_spawn_aux_threads(spa_t *spa)
{
ASSERT(spa_writeable(spa));
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_start_indirect_condensing_thread(spa);
spa_start_livelist_destroy_thread(spa);
spa_start_livelist_condensing_thread(spa);
@ -3035,6 +3331,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
spa->spa_load_state = state;
(void) spa_import_progress_set_state(spa_guid(spa),
spa_load_state(spa));
spa_import_progress_set_notes(spa, "spa_load()");
gethrestime(&spa->spa_loaded_ts);
error = spa_load_impl(spa, type, &ereport);
@ -3244,18 +3541,23 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
* Perform the import activity check. If the user canceled the import or
* we detected activity then fail.
* Remote host activity check.
*
* error results:
* 0 - no activity detected
* EREMOTEIO - remote activity detected
* EINTR - user canceled the operation
*/
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
uint64_t mmp_config = ub->ub_mmp_config;
uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
uint64_t import_delay;
hrtime_t import_expire;
hrtime_t import_expire, now;
nvlist_t *mmp_label = NULL;
vdev_t *rvd = spa->spa_root_vdev;
kcondvar_t cv;
@ -3293,9 +3595,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
while (gethrtime() < import_expire) {
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
if (importing) {
spa_import_progress_set_notes(spa, "Checking MMP activity, "
"waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
}
int iterations = 0;
while ((now = gethrtime()) < import_expire) {
if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
if (importing) {
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
}
vdev_uberblock_load(rvd, ub, &mmp_label);
@ -3377,6 +3693,61 @@ out:
return (error);
}
/*
* Called from zfs_ioc_clear for a pool that was suspended
* after failing mmp write checks.
*/
boolean_t
spa_mmp_remote_host_activity(spa_t *spa)
{
ASSERT(spa_multihost(spa) && spa_suspended(spa));
nvlist_t *best_label;
uberblock_t best_ub;
/*
* Locate the best uberblock on disk
*/
vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
if (best_label) {
/*
* confirm that the best hostid matches our hostid
*/
if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
spa_get_hostid(spa) !=
fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
nvlist_free(best_label);
return (B_TRUE);
}
nvlist_free(best_label);
} else {
return (B_TRUE);
}
if (!MMP_VALID(&best_ub) ||
!MMP_FAIL_INT_VALID(&best_ub) ||
MMP_FAIL_INT(&best_ub) == 0) {
return (B_TRUE);
}
if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
zfs_dbgmsg("txg mismatch detected during pool clear "
"txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
(u_longlong_t)spa->spa_uberblock.ub_txg,
(u_longlong_t)best_ub.ub_txg,
(u_longlong_t)spa->spa_uberblock.ub_timestamp,
(u_longlong_t)best_ub.ub_timestamp);
return (B_TRUE);
}
/*
* Perform an activity check looking for any remote writer
*/
return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
B_FALSE) != 0);
}
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
@ -3697,7 +4068,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
int error = spa_activity_check(spa, ub, spa->spa_config);
int error =
spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@ -3904,6 +4276,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
rvd = mrvd;
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* If 'zpool import' used a cached config, then the on-disk hostid and
* hostname may be different to the cached config in ways that should
* prevent import. Userspace can't discover this without a scan, but
* we know, so we add these values to LOAD_INFO so the caller can know
* the difference.
*
* Note that we have to do this before the config is regenerated,
* because the new config will have the hostid and hostname for this
* host, in readiness for import.
*/
if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
/*
* We will use spa_config if we decide to reload the spa or if spa_load
* fails and we rewind. We must thus regenerate the config using the
@ -4580,7 +4970,8 @@ spa_ld_read_checkpoint_txg(spa_t *spa)
int error = 0;
ASSERT0(spa->spa_checkpoint_txg);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
@ -4827,6 +5218,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
boolean_t checkpoint_rewind =
(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
boolean_t update_config_cache = B_FALSE;
hrtime_t load_start = gethrtime();
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
@ -4871,12 +5263,19 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
return (error);
}
/*
* Drop the namespace lock for the rest of the function.
*/
spa->spa_load_thread = curthread;
mutex_exit(&spa_namespace_lock);
/*
* Retrieve the checkpoint txg if the pool has a checkpoint.
*/
spa_import_progress_set_notes(spa, "Loading checkpoint txg");
error = spa_ld_read_checkpoint_txg(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve the mapping of indirect vdevs. Those vdevs were removed
@ -4886,60 +5285,68 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* initiated. Otherwise we could be reading from indirect vdevs before
* we have loaded their mappings.
*/
spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
error = spa_ld_open_indirect_vdev_metadata(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve the full list of active features from the MOS and check if
* they are all supported.
*/
spa_import_progress_set_notes(spa, "Checking feature flags");
error = spa_ld_check_features(spa, &missing_feat_write);
if (error != 0)
return (error);
goto fail;
/*
* Load several special directories from the MOS needed by the dsl_pool
* layer.
*/
spa_import_progress_set_notes(spa, "Loading special MOS directories");
error = spa_ld_load_special_directories(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve pool properties from the MOS.
*/
spa_import_progress_set_notes(spa, "Loading properties");
error = spa_ld_get_props(spa);
if (error != 0)
return (error);
goto fail;
/*
* Retrieve the list of auxiliary devices - cache devices and spares -
* and open them.
*/
spa_import_progress_set_notes(spa, "Loading AUX vdevs");
error = spa_ld_open_aux_vdevs(spa, type);
if (error != 0)
return (error);
goto fail;
/*
* Load the metadata for all vdevs. Also check if unopenable devices
* should be autoreplaced.
*/
spa_import_progress_set_notes(spa, "Loading vdev metadata");
error = spa_ld_load_vdev_metadata(spa);
if (error != 0)
return (error);
goto fail;
spa_import_progress_set_notes(spa, "Loading dedup tables");
error = spa_ld_load_dedup_tables(spa);
if (error != 0)
return (error);
goto fail;
/*
* Verify the logs now to make sure we don't have any unexpected errors
* when we claim log blocks later.
*/
spa_import_progress_set_notes(spa, "Verifying Log Devices");
error = spa_ld_verify_logs(spa, type, ereport);
if (error != 0)
return (error);
goto fail;
if (missing_feat_write) {
ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
@ -4949,8 +5356,9 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* read-only mode but not read-write mode. We now have enough
* information and can return to userland.
*/
return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
ENOTSUP));
error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
ENOTSUP);
goto fail;
}
/*
@ -4958,15 +5366,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* state. When performing an extreme rewind, we verify the whole pool,
* which can take a very long time.
*/
spa_import_progress_set_notes(spa, "Verifying pool data");
error = spa_ld_verify_pool_data(spa);
if (error != 0)
return (error);
goto fail;
/*
* Calculate the deflated space for the pool. This must be done before
* we write anything to the pool because we'd need to update the space
* accounting using the deflated sizes.
*/
spa_import_progress_set_notes(spa, "Calculating deflated space");
spa_update_dspace(spa);
/*
@ -4974,6 +5384,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* pool. If we are importing the pool in read-write mode, a few
* additional steps must be performed to finish the import.
*/
spa_import_progress_set_notes(spa, "Starting import");
if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
spa->spa_load_max_txg == UINT64_MAX)) {
uint64_t config_cache_txg = spa->spa_config_txg;
@ -4990,6 +5401,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
(u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
}
spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
/*
* Traverse the ZIL and claim all blocks.
*/
@ -5009,6 +5421,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* will have been set for us by ZIL traversal operations
* performed above.
*/
spa_import_progress_set_notes(spa, "Syncing ZIL claims");
txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
/*
@ -5016,6 +5429,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* next sync, we would update the config stored in vdev labels
* and the cachefile (by default /etc/zfs/zpool.cache).
*/
spa_import_progress_set_notes(spa, "Updating configs");
spa_ld_check_for_config_update(spa, config_cache_txg,
update_config_cache);
@ -5024,6 +5438,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* Then check all DTLs to see if anything needs resilvering.
* The resilver will be deferred if a rebuild was started.
*/
spa_import_progress_set_notes(spa, "Starting resilvers");
if (vdev_rebuild_active(spa->spa_root_vdev)) {
vdev_rebuild_restart(spa);
} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
@ -5037,6 +5452,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
*/
spa_history_log_version(spa, "open", NULL);
spa_import_progress_set_notes(spa,
"Restarting device removals");
spa_restart_removal(spa);
spa_spawn_aux_threads(spa);
@ -5049,27 +5466,40 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
* auxiliary threads above (from which the livelist
* deletion zthr is part of).
*/
spa_import_progress_set_notes(spa,
"Cleaning up inconsistent objsets");
(void) dmu_objset_find(spa_name(spa),
dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
/*
* Clean up any stale temporary dataset userrefs.
*/
spa_import_progress_set_notes(spa,
"Cleaning up temporary userrefs");
dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_import_progress_set_notes(spa, "Restarting initialize");
vdev_initialize_restart(spa->spa_root_vdev);
spa_import_progress_set_notes(spa, "Restarting TRIM");
vdev_trim_restart(spa->spa_root_vdev);
vdev_autotrim_restart(spa);
spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_import_progress_set_notes(spa, "Finished importing");
}
zio_handle_import_delay(spa, gethrtime() - load_start);
spa_import_progress_remove(spa_guid(spa));
spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
spa_load_note(spa, "LOADED");
fail:
mutex_enter(&spa_namespace_lock);
spa->spa_load_thread = NULL;
cv_broadcast(&spa_namespace_cv);
return (error);
return (0);
}
static int
@ -6337,9 +6767,14 @@ spa_tryimport(nvlist_t *tryconfig)
/*
* Create and initialize the spa structure.
*/
char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
TRYIMPORT_NAME, (u_longlong_t)curthread, poolname);
mutex_enter(&spa_namespace_lock);
spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
spa = spa_add(name, tryconfig, NULL);
spa_activate(spa, SPA_MODE_READ);
kmem_free(name, MAXPATHLEN);
/*
* Rewind pool if a max txg was provided.
@ -6476,9 +6911,10 @@ static int
spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
boolean_t force, boolean_t hardforce)
{
int error;
int error = 0;
spa_t *spa;
boolean_t force_removal, modifying;
hrtime_t export_start = gethrtime();
if (oldconfig)
*oldconfig = NULL;
@ -6509,8 +6945,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
new_state == POOL_STATE_EXPORTED);
/*
* Put a hold on the pool, drop the namespace lock, stop async tasks,
* reacquire the namespace lock, and see if we can export.
* Put a hold on the pool, drop the namespace lock, stop async tasks
* and see if we can export.
*/
spa_open_ref(spa, FTAG);
@ -6547,10 +6983,13 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
spa->spa_export_thread = curthread;
spa_close(spa, FTAG);
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
mutex_exit(&spa_namespace_lock);
goto export_spa;
}
/*
* The pool will be in core if it's openable, in which case we can
@ -6594,6 +7033,14 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
goto fail;
}
mutex_exit(&spa_namespace_lock);
/*
 * At this point we no longer hold the spa_namespace_lock and
 * there are no references on the spa. Future spa_lookups will
 * notice the spa->spa_export_thread and wait until we signal
 * that we are finished.
 */
if (spa->spa_sync_on) {
/*
* A pool cannot be exported if it has an active shared spare.
@ -6604,7 +7051,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
if (!force && new_state == POOL_STATE_EXPORTED &&
spa_has_active_shared_spare(spa)) {
error = SET_ERROR(EXDEV);
goto fail;
goto fail_unlocked;
}
/*
@ -6670,13 +7117,20 @@ export_spa:
error = spa_unload(spa, hardforce ?
TXG_WAIT_F_FORCE_EXPORT : TXG_WAIT_F_NOSUSPEND);
if (error != 0)
goto fail;
goto fail_unlocked;
spa_deactivate(spa);
}
if (oldconfig && spa->spa_config)
VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
if (new_state == POOL_STATE_EXPORTED)
zio_handle_export_delay(spa, gethrtime() - export_start);
/*
 * Take the namespace lock for the actual spa_t removal
 */
mutex_enter(&spa_namespace_lock);
if (new_state != POOL_STATE_UNINITIALIZED) {
if (!force_removal)
spa_write_cachefile(spa, B_TRUE, B_TRUE);
@ -6688,16 +7142,29 @@ export_spa:
* we make sure to reset the exporting flag.
*/
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
}
/*
* Wake up any waiters in spa_lookup()
*/
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (0);
fail_unlocked:
mutex_enter(&spa_namespace_lock);
fail:
if (force_removal)
spa_set_export_initiator(spa, NULL);
spa->spa_is_exporting = B_FALSE;
spa->spa_export_thread = NULL;
spa_async_resume(spa);
/*
* Wake up any waiters in spa_lookup()
*/
cv_broadcast(&spa_namespace_cv);
mutex_exit(&spa_namespace_lock);
return (error);
}
@ -8311,15 +8778,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
if (vd->vdev_fault_wanted) {
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
spa_async_probe(spa, vd->vdev_child[c]);
spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@ -8408,11 +8876,11 @@ spa_async_thread(void *arg)
}
/*
* See if any devices need to be probed.
* See if any devices need to be marked faulted.
*/
if (tasks & SPA_ASYNC_PROBE) {
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@ -10199,6 +10667,9 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
"Number of threads per IO worker taskqueue");
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_trylock, UINT, ZMOD_RD,
"Try to dispatch IO to an unlocked IO taskqueue before sleeping");
ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
"Allow importing pool with up to this number of missing top-level "
"vdevs (in read-only mode)");
@ -10218,4 +10689,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT
ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */

View File

@ -1155,6 +1155,7 @@ spa_ld_log_sm_data(spa_t *spa)
uint_t pn = 0;
uint64_t ps = 0;
uint64_t nsm = 0;
psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
while (sls != NULL) {
/* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
@ -1187,6 +1188,10 @@ spa_ld_log_sm_data(spa_t *spa)
summary_add_data(spa, sls->sls_txg,
sls->sls_mscount, 0, sls->sls_nblocks);
spa_import_progress_set_notes_nolog(spa,
"Read %llu of %lu log space maps", (u_longlong_t)nsm,
avl_numnodes(&spa->spa_sm_logs_by_txg));
struct spa_ld_log_sm_arg vla = {
.slls_spa = spa,
.slls_txg = sls->sls_txg
@ -1202,6 +1207,7 @@ spa_ld_log_sm_data(spa_t *spa)
pn--;
ps -= space_map_length(sls->sls_sm);
nsm++;
space_map_close(sls->sls_sm);
sls->sls_sm = NULL;
sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
@ -1212,11 +1218,11 @@ spa_ld_log_sm_data(spa_t *spa)
hrtime_t read_logs_endtime = gethrtime();
spa_load_note(spa,
"read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
"in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
"Read %lu log space maps (%llu total blocks - blksz = %llu bytes) "
"in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg),
(u_longlong_t)spa_log_sm_nblocks(spa),
(u_longlong_t)zfs_log_sm_blksz,
(longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
(longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime));
out:
if (error != 0) {

View File

@ -20,13 +20,14 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2019 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@ -79,7 +80,8 @@
* - Check if spa_refcount is zero
* - Rename a spa_t
* - add/remove/attach/detach devices
* - Held for the duration of create/destroy/import/export
* - Held for the duration of create/destroy
* - Held at the start and end of import and export
*
* It does not need to handle recursion. A create or destroy may
* reference objects (files or zvols) in other pools, but by
@ -232,9 +234,9 @@
* locking is, always, based on spa_namespace_lock and spa_config_lock[].
*/
static avl_tree_t spa_namespace_avl;
avl_tree_t spa_namespace_avl;
kmutex_t spa_namespace_lock;
static kcondvar_t spa_namespace_cv;
kcondvar_t spa_namespace_cv;
int spa_max_replication_override = SPA_DVAS_PER_BP;
static kmutex_t spa_spare_lock;
@ -417,6 +419,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
spa->spa_trust_config ? "trusted" : "untrusted", buf);
spa_import_progress_set_notes_nolog(spa, "%s", buf);
}
/*
@ -604,6 +608,7 @@ spa_lookup(const char *name)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
retry:
(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
/*
@ -615,6 +620,20 @@ spa_lookup(const char *name)
*cp = '\0';
spa = avl_find(&spa_namespace_avl, &search, &where);
if (spa == NULL)
return (NULL);
/*
* Avoid racing with import/export, which don't hold the namespace
* lock for their entire duration.
*/
if ((spa->spa_load_thread != NULL &&
spa->spa_load_thread != curthread) ||
(spa->spa_export_thread != NULL &&
spa->spa_export_thread != curthread)) {
cv_wait(&spa_namespace_cv, &spa_namespace_lock);
goto retry;
}
return (spa);
}
@ -712,6 +731,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_config_lock_init(spa);
spa_stats_init(spa);
ASSERT(MUTEX_HELD(&spa_namespace_lock));
avl_add(&spa_namespace_avl, spa);
/*
@ -806,7 +826,6 @@ spa_remove(spa_t *spa)
nvlist_free(spa->spa_config_splitting);
avl_remove(&spa_namespace_avl, spa);
cv_broadcast(&spa_namespace_cv);
if (spa->spa_root)
spa_strfree(spa->spa_root);
@ -901,7 +920,8 @@ void
spa_open_ref(spa_t *spa, void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock));
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
(void) zfs_refcount_add(&spa->spa_refcount, tag);
}
@ -921,13 +941,15 @@ spa_close_common(spa_t *spa, const void *tag)
/*
* Remove a reference to the given spa_t. Must have at least one reference, or
* have the namespace lock held.
* have the namespace lock held or be part of a pool import/export.
*/
void
spa_close(spa_t *spa, void *tag)
{
ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
MUTEX_HELD(&spa_namespace_lock));
MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread ||
spa->spa_export_thread == curthread);
spa_close_common(spa, tag);
}
@ -947,13 +969,15 @@ spa_async_close(spa_t *spa, void *tag)
/*
* Check to see if the spa refcount is zero. Must be called with
* spa_namespace_lock held. We really compare against spa_minref, which is the
* number of references acquired when opening a pool
* spa_namespace_lock held or be the spa export thread. We really
* compare against spa_minref, which is the number of references
* acquired when opening a pool
*/
boolean_t
spa_refcount_zero(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
}
@ -1201,6 +1225,8 @@ spa_vdev_enter(spa_t *spa)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
ASSERT0(spa->spa_export_thread);
vdev_autotrim_stop_all(spa);
return (spa_vdev_config_enter(spa));
@ -1218,6 +1244,8 @@ spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
ASSERT0(spa->spa_export_thread);
vdev_autotrim_stop_all(spa);
if (guid != 0) {
@ -2215,6 +2243,7 @@ typedef struct spa_import_progress {
uint64_t pool_guid; /* unique id for updates */
char *pool_name;
spa_load_state_t spa_load_state;
char *spa_load_notes;
uint64_t mmp_sec_remaining; /* MMP activity check */
uint64_t spa_load_max_txg; /* rewind txg */
procfs_list_node_t smh_node;
@ -2225,9 +2254,9 @@ spa_history_list_t *spa_import_progress_list = NULL;
static int
spa_import_progress_show_header(struct seq_file *f)
{
seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid",
seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid",
"load_state", "multihost_secs", "max_txg",
"pool_name");
"pool_name", "notes");
return (0);
}
@ -2236,11 +2265,12 @@ spa_import_progress_show(struct seq_file *f, void *data)
{
spa_import_progress_t *sip = (spa_import_progress_t *)data;
seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n",
(u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
(u_longlong_t)sip->mmp_sec_remaining,
(u_longlong_t)sip->spa_load_max_txg,
(sip->pool_name ? sip->pool_name : "-"));
(sip->pool_name ? sip->pool_name : "-"),
(sip->spa_load_notes ? sip->spa_load_notes : "-"));
return (0);
}
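/*
 * Illustrative output with the new "notes" column (hypothetical values;
 * on Linux this procfs list is typically exposed as
 * /proc/spl/kstat/zfs/import_progress):
 *
 * pool_guid            load_state     multihost_secs max_txg      pool_name        notes
 * 11216530269261772578 3              0              0            tank             Loading vdev metadata
 */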
@ -2254,6 +2284,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
sip = list_remove_head(&shl->procfs_list.pl_list);
if (sip->pool_name)
spa_strfree(sip->pool_name);
if (sip->spa_load_notes)
kmem_strfree(sip->spa_load_notes);
kmem_free(sip, sizeof (spa_import_progress_t));
shl->size--;
}
@ -2309,6 +2341,10 @@ spa_import_progress_set_state(uint64_t pool_guid,
sip = list_prev(&shl->procfs_list.pl_list, sip)) {
if (sip->pool_guid == pool_guid) {
sip->spa_load_state = load_state;
if (sip->spa_load_notes != NULL) {
kmem_strfree(sip->spa_load_notes);
sip->spa_load_notes = NULL;
}
error = 0;
break;
}
@ -2318,6 +2354,59 @@ spa_import_progress_set_state(uint64_t pool_guid,
return (error);
}
static void
spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg,
const char *fmt, va_list adx)
{
spa_history_list_t *shl = spa_import_progress_list;
spa_import_progress_t *sip;
uint64_t pool_guid = spa_guid(spa);
if (shl->size == 0)
return;
char *notes = kmem_vasprintf(fmt, adx);
mutex_enter(&shl->procfs_list.pl_lock);
for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
sip = list_prev(&shl->procfs_list.pl_list, sip)) {
if (sip->pool_guid == pool_guid) {
if (sip->spa_load_notes != NULL) {
kmem_strfree(sip->spa_load_notes);
sip->spa_load_notes = NULL;
}
sip->spa_load_notes = notes;
if (log_dbgmsg)
zfs_dbgmsg("'%s' %s", sip->pool_name, notes);
notes = NULL;
break;
}
}
mutex_exit(&shl->procfs_list.pl_lock);
if (notes != NULL)
kmem_strfree(notes);
}
void
spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...)
{
va_list adx;
va_start(adx, fmt);
spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx);
va_end(adx);
}
void
spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...)
{
va_list adx;
va_start(adx, fmt);
spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx);
va_end(adx);
}
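For context, a minimal userspace sketch of how these notes surface, assuming the usual Linux procfs location for the import_progress kstat (the pool name is illustrative and not part of this change):
# poll the kstat while a background import runs; the new "notes" column
# shows the text most recently set by the load path
zpool import tank &
while kill -0 $! 2>/dev/null; do
	cat /proc/spl/kstat/zfs/import_progress
	sleep 1
done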
int
spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
{
@ -2386,6 +2475,7 @@ spa_import_progress_add(spa_t *spa)
poolname = spa_name(spa);
sip->pool_name = spa_strdup(poolname);
sip->spa_load_state = spa_load_state(spa);
sip->spa_load_notes = NULL;
mutex_enter(&shl->procfs_list.pl_lock);
procfs_list_add(&shl->procfs_list, sip);
@ -2405,6 +2495,8 @@ spa_import_progress_remove(uint64_t pool_guid)
if (sip->pool_guid == pool_guid) {
if (sip->pool_name)
spa_strfree(sip->pool_name);
if (sip->spa_load_notes)
spa_strfree(sip->spa_load_notes);
list_remove(&shl->procfs_list.pl_list, sip);
shl->size--;
kmem_free(sip, sizeof (spa_import_progress_t));
@ -2801,8 +2893,7 @@ spa_state_to_name(spa_t *spa)
vdev_state_t state = rvd->vdev_state;
vdev_aux_t aux = rvd->vdev_stat.vs_aux;
if (spa_suspended(spa) &&
(spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
if (spa_suspended(spa))
return ("SUSPENDED");
switch (state) {

View File

@ -585,6 +585,15 @@ txg_sync_thread(void *arg)
timer = (delta > timeout ? 0 : timeout - delta);
}
/*
* When we're suspended, nothing should be changing and for
* MMP we don't want to bump anything that would make it
* harder to detect if another host is changing it when
* resuming after a MMP suspend.
*/
if (spa_suspended(spa))
continue;
/*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.

View File

@ -1584,6 +1584,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;
@ -1627,6 +1628,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
/*
* If this probe was initiated from zio pipeline, then
* change the state in a spa_async_request. Probes that
* were initiated from a vdev_open can change the state
* as part of the open call.
*/
if (vps->vps_zio_done_probe) {
vd->vdev_fault_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
}
}
mutex_enter(&vd->vdev_probe_lock);
@ -1678,6 +1690,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_TRYHARD;
vps->vps_zio_done_probe = (zio != NULL);
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@ -1704,15 +1717,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}
if (zio != NULL)

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2016, 2024 by Delphix. All rights reserved.
*/
#include <sys/spa.h>
@ -636,7 +636,8 @@ vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_initialize_lock);
@ -678,7 +679,8 @@ vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
if (vd_list == NULL) {
vdev_initialize_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@ -710,7 +712,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
spa_t *spa = vd->vdev_spa;
list_t vd_list;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_initialize_node));
@ -729,7 +732,8 @@ vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
void
vdev_initialize_restart(vdev_t *vd)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_load_thread == curthread);
ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_leaf_zap != 0) {

View File

@ -1894,6 +1894,7 @@ retry:
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
* and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
@ -1901,7 +1902,8 @@ retry:
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);
if (!changed && list_is_empty(&spa->spa_config_dirty_list))
if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
!spa_multihost(spa))
return (0);
}

View File

@ -22,6 +22,7 @@
*
* Copyright (c) 2018, Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2024 by Delphix. All rights reserved.
*/
#include <sys/vdev_impl.h>
@ -1067,7 +1068,8 @@ vdev_rebuild_restart_impl(vdev_t *vd)
void
vdev_rebuild_restart(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
vdev_rebuild_restart_impl(spa->spa_root_vdev);
}
@ -1081,7 +1083,8 @@ vdev_rebuild_stop_wait(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
if (vd == spa->spa_root_vdev) {
for (uint64_t i = 0; i < vd->vdev_children; i++)

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright (c) 2016, 2024 by Delphix. All rights reserved.
* Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
* Copyright (c) 2021 Hewlett Packard Enterprise Development LP
*/
@ -1021,7 +1021,8 @@ vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
(void) spa;
vdev_t *vd;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
while ((vd = list_remove_head(vd_list)) != NULL) {
mutex_enter(&vd->vdev_trim_lock);
@ -1060,7 +1061,8 @@ vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
if (vd_list == NULL) {
vdev_trim_stop_wait_impl(vd);
} else {
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_export_thread == curthread);
list_insert_tail(vd_list, vd);
}
}
@ -1096,7 +1098,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
list_t vd_list;
vdev_t *vd_l2cache;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_export_thread == curthread);
list_create(&vd_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_trim_node));
@ -1129,7 +1132,8 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
void
vdev_trim_restart(vdev_t *vd)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
vd->vdev_spa->spa_load_thread == curthread);
ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
if (vd->vdev_leaf_zap != 0) {
@ -1523,8 +1527,8 @@ vdev_autotrim_stop_all(spa_t *spa)
void
vdev_autotrim_restart(spa_t *spa)
{
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
spa->spa_load_thread == curthread);
if (spa->spa_autotrim)
vdev_autotrim(spa);
}

View File

@ -241,6 +241,11 @@ unsigned long zfs_max_nvlist_src_size = 0;
*/
unsigned long zfs_history_output_max = 1024 * 1024;
/*
* Whether or not to allow compression=slack to be set on a dataset.
*/
int zfs_slack_compress_enabled = 0;
uint_t zfs_fsyncer_key;
uint_t zfs_allow_log_key;
@ -4573,6 +4578,9 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
if (compval == ZIO_COMPRESS_SLACK) {
spa_t *spa;
if (!zfs_slack_compress_enabled)
return (SET_ERROR(ENOTSUP));
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
@ -5715,10 +5723,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* If multihost is enabled, resuming I/O is unsafe as another
* host may have imported the pool.
* host may have imported the pool. Check for remote activity.
*/
if (spa_multihost(spa) && spa_suspended(spa))
return (SET_ERROR(EINVAL));
if (spa_multihost(spa) && spa_suspended(spa) &&
spa_mmp_remote_host_activity(spa)) {
spa_close(spa, FTAG);
return (SET_ERROR(EREMOTEIO));
}
spa_vdev_state_enter(spa, SCL_NONE);
@ -7770,4 +7781,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW,
"Maximum size in bytes of ZFS ioctl output that will be logged");
ZFS_MODULE_PARAM(zfs, zfs_, slack_compress_enabled, INT, ZMOD_RW,
"Allow slack compression feature to be set on a dataset");
/* END CSTYLED */
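A brief usage sketch (not part of the change itself): the new tunable gates whether the slack compression value may be set at all. The dataset name and the Linux module-parameter path below are illustrative assumptions.
# allow compression=slack, then enable it on a dataset
echo 1 > /sys/module/zfs/parameters/zfs_slack_compress_enabled
zfs set compression=slack tank/data
# with the tunable left at 0, the same 'zfs set' is rejected (ENOTSUP)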

View File

@ -3607,7 +3607,7 @@ zil_commit(zilog_t *zilog, uint64_t foid)
int
zil_commit_impl(zilog_t *zilog, uint64_t foid)
{
ASSERT0(zil_failed(zilog) || zilog->zl_suspend > 0);
ASSERT0(zil_failed(zilog));
ZIL_STAT_BUMP(zil_commit_count);

View File

@ -2535,8 +2535,11 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"is set to panic.", spa_name(spa));
if (!spa_suspended(spa)) {
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
"I/O failure and has been suspended.\n", spa_name(spa));
if (reason != ZIO_SUSPEND_MMP) {
cmn_err(CE_WARN, "Pool '%s' has encountered an "
"uncorrectable I/O failure and has been "
"suspended.\n", spa_name(spa));
}
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);

View File

@ -68,7 +68,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL},
{"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress,
zfs_zstd_decompress, zfs_zstd_decompress_level},
{"slack", 0, slack_compress, NULL, NULL },
{"slack", 0, slack_compress, slack_decompress, NULL },
};
uint8_t

View File

@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2024, Klara Inc.
*/
/*
@ -59,6 +60,7 @@ uint32_t zio_injection_enabled = 0;
typedef struct inject_handler {
int zi_id;
spa_t *zi_spa;
char *zi_spa_name; /* ZINJECT_DELAY_IMPORT only */
zinject_record_t zi_record;
uint64_t *zi_lanes;
int zi_next_lane;
@ -699,6 +701,63 @@ zio_handle_io_delay(zio_t *zio)
return (min_target);
}
static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
inject_handler_t *handler;
hrtime_t delay = 0;
int id = 0;
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers);
handler != NULL && handler->zi_record.zi_cmd == command;
handler = list_next(&inject_handlers, handler)) {
ASSERT3P(handler->zi_spa_name, !=, NULL);
if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
uint64_t pause =
SEC2NSEC(handler->zi_record.zi_duration);
if (pause > elapsed) {
delay = pause - elapsed;
}
id = handler->zi_id;
break;
}
}
rw_exit(&inject_lock);
if (delay) {
if (command == ZINJECT_DELAY_IMPORT) {
spa_import_progress_set_notes(spa, "injecting %llu "
"sec delay", (u_longlong_t)NSEC2SEC(delay));
}
zfs_sleep_until(gethrtime() + delay);
}
if (id) {
/* all done with this one-shot handler */
zio_clear_fault(id);
}
}
/*
* For testing, inject a delay during an import
*/
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}
/*
* For testing, inject a delay during an export
*/
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}
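A quick usage sketch for the new one-shot pool delays, matching how the test scripts below drive them (pool name illustrative):
# arm a delay, then run the operation it targets
zinject -P import -s 10 tank    # while 'tank' is exported
zpool import tank               # import now takes at least ~10 seconds
zinject -P export -s 10 tank    # while 'tank' is imported
zpool export tank               # export now takes at least ~10 seconds
# handlers are one-shot and clear themselves; 'zinject -c all' removes leftovers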
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
@ -756,6 +815,28 @@ zio_calculate_range(const char *pool, zinject_record_t *record)
return (0);
}
static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
boolean_t exists = B_FALSE;
rw_enter(&inject_lock, RW_READER);
for (inject_handler_t *handler = list_head(&inject_handlers);
handler != NULL; handler = list_next(&inject_handlers, handler)) {
if (command != handler->zi_record.zi_cmd)
continue;
const char *pool = (handler->zi_spa_name != NULL) ?
handler->zi_spa_name : spa_name(handler->zi_spa);
if (strcmp(name, pool) == 0) {
exists = B_TRUE;
break;
}
}
rw_exit(&inject_lock);
return (exists);
}
/*
* Create a new handler for the given record. We add it to the list, adding
* a reference to the spa_t in the process. We increment zio_injection_enabled,
@ -806,16 +887,42 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
if (!(flags & ZINJECT_NULL)) {
/*
* spa_inject_ref() will add an injection reference, which will
* prevent the pool from being removed from the namespace while
* still allowing it to be unloaded.
* Pool delays for import or export don't take an
* injection reference on the spa. Instead they
* rely on matching by name.
*/
if ((spa = spa_inject_addref(name)) == NULL)
return (SET_ERROR(ENOENT));
if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
record->zi_cmd == ZINJECT_DELAY_EXPORT) {
if (record->zi_duration <= 0)
return (SET_ERROR(EINVAL));
/*
* Only one import | export delay handler per pool.
*/
if (zio_pool_handler_exists(name, record->zi_cmd))
return (SET_ERROR(EEXIST));
mutex_enter(&spa_namespace_lock);
boolean_t has_spa = spa_lookup(name) != NULL;
mutex_exit(&spa_namespace_lock);
if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
return (SET_ERROR(EEXIST));
if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
return (SET_ERROR(ENOENT));
spa = NULL;
} else {
/*
* spa_inject_ref() will add an injection reference,
* which will prevent the pool from being removed
* from the namespace while still allowing it to be
* unloaded.
*/
if ((spa = spa_inject_addref(name)) == NULL)
return (SET_ERROR(ENOENT));
}
handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
handler->zi_spa = spa;
handler->zi_spa = spa; /* note: can be NULL */
handler->zi_record = *record;
if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
@ -828,6 +935,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
handler->zi_next_lane = 0;
}
if (handler->zi_spa == NULL)
handler->zi_spa_name = spa_strdup(name);
else
handler->zi_spa_name = NULL;
rw_enter(&inject_lock, RW_WRITER);
/*
@ -887,7 +999,11 @@ zio_inject_list_next(int *id, char *name, size_t buflen,
if (handler) {
*record = handler->zi_record;
*id = handler->zi_id;
(void) strncpy(name, spa_name(handler->zi_spa), buflen);
ASSERT(handler->zi_spa || handler->zi_spa_name);
if (handler->zi_spa != NULL)
(void) strncpy(name, spa_name(handler->zi_spa), buflen);
else
(void) strncpy(name, handler->zi_spa_name, buflen);
ret = 0;
} else {
ret = SET_ERROR(ENOENT);
@ -937,7 +1053,11 @@ zio_clear_fault(int id)
ASSERT3P(handler->zi_lanes, ==, NULL);
}
spa_inject_delref(handler->zi_spa);
if (handler->zi_spa_name != NULL)
spa_strfree(handler->zi_spa_name);
if (handler->zi_spa != NULL)
spa_inject_delref(handler->zi_spa);
kmem_free(handler, sizeof (inject_handler_t));
atomic_dec_32(&zio_injection_enabled);

View File

@ -376,7 +376,8 @@ tags = ['functional', 'cli_root', 'zpool_events']
[tests/functional/cli_root/zpool_export]
tests = ['zpool_export_001_pos', 'zpool_export_002_pos',
'zpool_export_003_neg', 'zpool_export_004_pos', 'zpool_export_005_pos',
'zpool_export_006_pos', 'zpool_export_007_pos']
'zpool_export_006_pos', 'zpool_export_007_pos',
'zpool_export_parallel_pos', 'zpool_export_parallel_admin']
tags = ['functional', 'cli_root', 'zpool_export']
[tests/functional/cli_root/zpool_get]
@ -401,6 +402,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
'zpool_import_encrypted', 'zpool_import_encrypted_load',
'zpool_import_errata3', 'zpool_import_errata4',
'zpool_import_hostid_changed',
'zpool_import_hostid_changed_unclean_export',
'zpool_import_hostid_changed_cachefile',
'zpool_import_hostid_changed_cachefile_unclean_export',
'import_cachefile_device_added',
'import_cachefile_device_removed',
'import_cachefile_device_replaced',
@ -411,7 +416,9 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
'import_devices_missing',
'import_paths_changed',
'import_rewind_config_changed',
'import_rewind_device_replaced']
'import_rewind_device_replaced',
'zpool_import_status', 'zpool_import_parallel_pos',
'zpool_import_parallel_neg', 'zpool_import_parallel_admin']
tags = ['functional', 'cli_root', 'zpool_import']
timeout = 1200

View File

@ -127,7 +127,7 @@ tags = ['functional', 'mmap']
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
tags = ['functional', 'mmp']
[tests/functional/mount:Linux]

View File

@ -8,7 +8,9 @@ dist_pkgdata_SCRIPTS = \
zpool_export_004_pos.ksh \
zpool_export_005_pos.ksh \
zpool_export_006_pos.ksh \
zpool_export_007_pos.ksh
zpool_export_007_pos.ksh \
zpool_export_parallel_admin.ksh \
zpool_export_parallel_pos.ksh
dist_pkgdata_DATA = \
zpool_export.cfg \

View File

@ -0,0 +1,72 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2024 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Verify that admin commands cannot race a pool export
#
# STRATEGY:
# 1. Create a pool
# 2. Export the pool in the background with an injected delay
# 3. Execute some admin commands against the pool
#
verify_runnable "global"
DEVICE_DIR=$TEST_BASE_DIR/dev_export-test
function cleanup
{
zinject -c all
poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1
[[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR
}
log_assert "admin commands cannot race a pool export"
log_onexit cleanup
[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR
log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1
log_must zpool create -f $TESTPOOL1 mirror ${DEVICE_DIR}/disk0 ${DEVICE_DIR}/disk1
log_must zinject -P export -s 10 $TESTPOOL1
log_must zpool export $TESTPOOL1 &
zpool set comment=hello $TESTPOOL1
zpool reguid $TESTPOOL1 &
zpool split $TESTPOOL1 &
log_pass "admin commands cannot race a pool export"

View File

@ -0,0 +1,129 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2024 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
# test uses 8 vdevs
MAX_NUM=8
DEVICE_DIR=$TEST_BASE_DIR/dev_import-test
#
# DESCRIPTION:
# Verify that pool exports can occur in parallel
#
# STRATEGY:
# 1. Create 8 pools
# 2. Inject an export delay using zinject
# 3. Export half of the pools synchronously to baseline sequential cost
# 4. Export the other half asynchronously to demonstrate parallel savings
# 5. Import 4 pools
# 6. Test zpool export -a
#
verify_runnable "global"
#
# pool name prefix used for all test pools
#
POOLNAME="test_pool"
function cleanup
{
zinject -c all
for i in {0..$(($MAX_NUM - 1))}; do
poolexists $POOLNAME-$i && destroy_pool $POOLNAME-$i
done
[[ -d $DEVICE_DIR ]] && log_must rm -rf $DEVICE_DIR
}
log_assert "Pool exports can occur in parallel"
log_onexit cleanup
[[ ! -d $DEVICE_DIR ]] && log_must mkdir -p $DEVICE_DIR
#
# Create some pools with export delay injectors
#
for i in {0..$(($MAX_NUM - 1))}; do
log_must truncate -s $MINVDEVSIZE ${DEVICE_DIR}/disk$i
log_must zpool create $POOLNAME-$i $DEVICE_DIR/disk$i
log_must zinject -P export -s 8 $POOLNAME-$i
done
#
# Export half of the pools synchronously
#
SECONDS=0
for i in {0..3}; do
log_must zpool export $POOLNAME-$i
done
sequential_time=$SECONDS
log_note "sequentially exported 4 pools in $sequential_time seconds"
#
# Export half of the pools in parallel
#
SECONDS=0
for i in {4..7}; do
log_must zpool export $POOLNAME-$i &
done
wait
parallel_time=$SECONDS
log_note "asyncronously exported 4 pools in $parallel_time seconds"
log_must test $parallel_time -lt $(($sequential_time / 3))
#
# import 4 pools with export delay injectors
#
for i in {4..7}; do
log_must zpool import -d $DEVICE_DIR/disk$i $POOLNAME-$i
log_must zinject -P export -s 8 $POOLNAME-$i
done
#
# now test zpool export -a
#
SECONDS=0
log_must zpool export -a
parallel_time=$SECONDS
log_note "asyncronously exported 4 pools, using '-a', in $parallel_time seconds"
log_must test $parallel_time -lt $(($sequential_time / 3))
log_pass "Pool exports occur in parallel"

View File

@ -36,10 +36,17 @@ dist_pkgdata_SCRIPTS = \
zpool_import_features_001_pos.ksh \
zpool_import_features_002_neg.ksh \
zpool_import_features_003_pos.ksh \
zpool_import_hostid_changed.ksh \
zpool_import_hostid_changed_unclean_export.ksh \
zpool_import_hostid_changed_cachefile.ksh \
zpool_import_hostid_changed_cachefile_unclean_export.ksh \
zpool_import_missing_001_pos.ksh \
zpool_import_missing_002_pos.ksh \
zpool_import_missing_003_pos.ksh \
zpool_import_rename_001_pos.ksh \
zpool_import_parallel_admin.ksh \
zpool_import_parallel_neg.ksh \
zpool_import_parallel_pos.ksh \
zpool_import_encrypted.ksh \
zpool_import_encrypted_load.ksh \
zpool_import_errata3.ksh \

View File

@ -26,6 +26,7 @@
#
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4
export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5
export ALTER_ROOT=/alter_import-test
export HOSTID_FILE="/etc/hostid"
export HOSTID1=01234567
export HOSTID2=89abcdef

View File

@ -11,6 +11,7 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib

View File

@ -0,0 +1,59 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that was cleanly exported should be importable without force even if
# the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Export the pool.
# 4. Change the hostid.
# 5. Verify that importing the pool without force succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create $TESTPOOL1 $VDEV0
# 3. Export the pool.
log_must zpool export $TESTPOOL1
# 4. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 5. Verify that importing the pool without force succeeds.
log_must zpool import -d $DEVICE_DIR $TESTPOOL1
log_pass "zpool import can import cleanly exported pool when hostid changes."

View File

@ -0,0 +1,65 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that was cleanly exported should be importable from a cachefile
# without force even if the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool with a cachefile.
# 3. Backup the cachefile.
# 4. Export the pool.
# 5. Change the hostid.
# 6. Verify that importing the pool from the cachefile succeeds
# without force.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $CPATH $CPATHBKP
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0
# 3. Backup the cachefile.
log_must cp $CPATH $CPATHBKP
# 4. Export the pool.
log_must zpool export $TESTPOOL1
# 5. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 6. Verify that importing the pool from the cachefile succeeds without force.
log_must zpool import -c $CPATHBKP $TESTPOOL1
log_pass "zpool import can import cleanly exported pool from cachefile " \
"when hostid changes."

View File

@ -0,0 +1,75 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that wasn't cleanly exported should not be importable from a cachefile
# without force if the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Backup the cachefile.
# 4. Simulate the pool being torn down without export:
# 4.1. Copy the underlying device state.
# 4.2. Export the pool.
# 4.3. Restore the device state from the copy.
# 5. Change the hostid.
# 6. Verify that importing the pool from the cachefile fails.
# 7. Verify that importing the pool from the cachefile with force
# succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0
# 3. Backup the cachefile.
log_must cp $CPATH $CPATHBKP
# 4. Simulate the pool being torn down without export.
log_must cp $VDEV0 $VDEV0.bak
log_must zpool export $TESTPOOL1
log_must cp -f $VDEV0.bak $VDEV0
log_must rm -f $VDEV0.bak
# 5. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 6. Verify that importing the pool from the cachefile fails.
log_mustnot zpool import -c $CPATHBKP $TESTPOOL1
# 7. Verify that importing the pool from the cachefile with force succeeds.
log_must zpool import -f -c $CPATHBKP $TESTPOOL1
log_pass "zpool import from cachefile requires force if not cleanly " \
"exported and hostid changes."

View File

@ -0,0 +1,70 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that wasn't cleanly exported should not be importable without force if
# the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Simulate the pool being torn down without export:
# 3.1. Copy the underlying device state.
# 3.2. Export the pool.
# 3.3. Restore the device state from the copy.
# 4. Change the hostid.
# 5. Verify that importing the pool fails.
# 6. Verify that importing the pool with force succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $VDEV0.bak
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create $TESTPOOL1 $VDEV0
# 3. Simulate the pool being torn down without export.
log_must cp $VDEV0 $VDEV0.bak
log_must zpool export $TESTPOOL1
log_must cp -f $VDEV0.bak $VDEV0
log_must rm -f $VDEV0.bak
# 4. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 5. Verify that importing the pool fails.
log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1
# 6. Verify that importing the pool with force succeeds.
log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1
log_pass "zpool import requires force if not cleanly exported " \
"and hostid changed."

View File

@ -0,0 +1,165 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2023 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# Verify that admin commands to a different pool are not blocked by import
#
# STRATEGY:
# 1. Create 2 pools
# 2. Export one of the pools
# 3. Import the exported pool with an injected delay
# 4. Execute some admin commands against both pools
# 5. Verify that admin commands to the pool not being imported don't stall
#
verify_runnable "global"
function cleanup
{
zinject -c all
destroy_pool $TESTPOOL1
destroy_pool $TESTPOOL2
}
function pool_import
{
typeset dir=$1
typeset pool=$2
SECONDS=0
errmsg=$(zpool import -d $dir -f $pool 2>&1 > /dev/null)
if [[ $? -eq 0 ]]; then
echo ${pool}: imported in $SECONDS secs
echo $SECONDS > ${DEVICE_DIR}/${pool}-import
else
echo ${pool}: import failed $errmsg in $SECONDS secs
fi
}
function pool_add_device
{
typeset pool=$1
typeset device=$2
typeset devtype=$3
SECONDS=0
errmsg=$(zpool add $pool $devtype $device 2>&1 > /dev/null)
if [[ $? -eq 0 ]]; then
echo ${pool}: added $devtype vdev in $SECONDS secs
echo $SECONDS > ${DEVICE_DIR}/${pool}-add
else
echo ${pool}: add $devtype vdev failed ${errmsg}, in $SECONDS secs
fi
}
function pool_stats
{
typeset stats=$1
typeset pool=$2
SECONDS=0
errmsg=$(zpool $stats $pool 2>&1 > /dev/null)
if [[ $? -eq 0 ]]; then
echo ${pool}: $stats in $SECONDS secs
echo $SECONDS > ${DEVICE_DIR}/${pool}-${stats}
else
echo ${pool}: $stats failed ${errmsg}, in $SECONDS secs
fi
}
function pool_create
{
typeset pool=$1
typeset device=$2
SECONDS=0
errmsg=$(zpool create $pool $device 2>&1 > /dev/null)
if [[ $? -eq 0 ]]; then
echo ${pool}: created in $SECONDS secs
echo $SECONDS > ${DEVICE_DIR}/${pool}-create
else
echo ${pool}: create failed ${errmsg}, in $SECONDS secs
fi
}
log_assert "Simple admin commands to different pool not blocked by import"
log_onexit cleanup
#
# create two pools and export one
#
log_must zpool create $TESTPOOL1 $VDEV0
log_must zpool export $TESTPOOL1
log_must zpool create $TESTPOOL2 $VDEV1
#
# import the pool asynchronously with an injected 10 second delay
#
log_must zinject -P import -s 10 $TESTPOOL1
pool_import $DEVICE_DIR $TESTPOOL1 &
sleep 2
#
# run some admin commands on the pools while the import is in progress
#
pool_add_device $TESTPOOL1 $VDEV2 "log" &
pool_add_device $TESTPOOL2 $VDEV3 "cache" &
pool_stats "status" $TESTPOOL1 &
pool_stats "status" $TESTPOOL2 &
pool_stats "list" $TESTPOOL1 &
pool_stats "list" $TESTPOOL2 &
pool_create $TESTPOOL1 $VDEV4 &
wait
log_must zpool sync $TESTPOOL1 $TESTPOOL2
zpool history $TESTPOOL1
zpool history $TESTPOOL2
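# verify the delayed import of $TESTPOOL1 reflects the injected 10 second delay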
log_must test "5" -lt $(<${DEVICE_DIR}/${TESTPOOL1}-import)
#
# verify that commands to second pool did not wait for import to finish
#
log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-status)
log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-list)
log_must test "2" -gt $(<${DEVICE_DIR}/${TESTPOOL2}-add)
[[ -e ${DEVICE_DIR}/${TESTPOOL1}-create ]] && log_fail "unexpected pool create"
log_pass "Simple admin commands to different pool not blocked by import"

View File

@ -0,0 +1,130 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2023 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# Verify that pool imports by same name only have one winner
#
# STRATEGY:
# 1. Create 4 single disk pools with the same name
# 2. Generate some ZIL records (for a longer import)
# 3. Export the pools
# 4. Import the pools in parallel
# 5. Repeat using matching guids
#
verify_runnable "global"
POOLNAME="import_pool"
DEV_DIR_PREFIX="$DEVICE_DIR/$POOLNAME"
VDEVSIZE=$((512 * 1024 * 1024))
log_assert "parallel pool imports by same name only have one winner"
# each pool has its own device directory
for i in {0..3}; do
log_must mkdir -p ${DEV_DIR_PREFIX}$i
log_must truncate -s $VDEVSIZE ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i
done
function cleanup
{
zinject -c all
log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
log_must set_tunable64 METASLAB_DEBUG_LOAD 0
destroy_pool $POOLNAME
log_must rm -rf $DEV_DIR_PREFIX*
}
log_onexit cleanup
log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
log_must set_tunable64 METASLAB_DEBUG_LOAD 1
function import_pool
{
typeset dir=$1
typeset pool=$2
typeset newname=$3
SECONDS=0
errmsg=$(zpool import -N -d $dir -f $pool $newname 2>&1 > /dev/null)
if [[ $? -eq 0 ]]; then
touch $dir/imported
echo "imported $pool in $SECONDS secs"
elif [[ $errmsg == *"cannot import"* ]]; then
echo "pool import failed: $errmsg, waited $SECONDS secs"
touch $dir/failed
fi
}
#
# create four exported pools with the same name
#
for i in {0..3}; do
log_must zpool create $POOLNAME ${DEV_DIR_PREFIX}$i/${DEVICE_FILE}$i
log_must zpool export $POOLNAME
done
log_must zinject -P import -s 10 $POOLNAME
#
# import the pools in parallel, expecting only one winner
#
for i in {0..3}; do
import_pool ${DEV_DIR_PREFIX}$i $POOLNAME &
done
wait
# check the result of background imports
typeset num_imports=0
typeset num_cannot=0
for i in {0..3}; do
if [[ -f ${DEV_DIR_PREFIX}$i/imported ]]; then
((num_imports += 1))
fi
if [[ -f ${DEV_DIR_PREFIX}$i/failed ]]; then
((num_cannot += 1))
loser=$i
fi
done
[[ $num_imports -eq "1" ]] || log_fail "expecting an import"
[[ $num_cannot -eq "3" ]] || \
log_fail "expecting 3 pool exists errors, found $num_cannot"
log_note "$num_imports imported and $num_cannot failed (expected)"
log_pass "parallel pool imports by same name only have one winner"

View File

@ -0,0 +1,137 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2023 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
# test uses 8 vdevs
export MAX_NUM=8
#
# DESCRIPTION:
# Verify that pool imports can occur in parallel
#
# STRATEGY:
# 1. Create 8 pools
# 2. Generate some ZIL records
# 3. Export the pools
# 4. Import half of the pools synchronously to baseline sequential cost
# 5. Import the other half asynchronously to demonstrate parallel savings
# 6. Export 4 pools
# 7. Test zpool import -a
#
verify_runnable "global"
#
# override the minimum sized vdevs
#
VDEVSIZE=$((512 * 1024 * 1024))
increase_device_sizes $VDEVSIZE
POOLNAME="import_pool"
function cleanup
{
zinject -c all
log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
log_must set_tunable64 METASLAB_DEBUG_LOAD 0
for i in {0..$(($MAX_NUM - 1))}; do
destroy_pool $POOLNAME-$i
done
# reset the devices
increase_device_sizes 0
increase_device_sizes $FILE_SIZE
}
log_assert "Pool imports can occur in parallel"
log_onexit cleanup
log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
log_must set_tunable64 METASLAB_DEBUG_LOAD 1
#
# create some exported pools with import delay injectors
#
for i in {0..$(($MAX_NUM - 1))}; do
log_must zpool create $POOLNAME-$i $DEVICE_DIR/${DEVICE_FILE}$i
log_must zpool export $POOLNAME-$i
log_must zinject -P import -s 12 $POOLNAME-$i
done
wait
#
# import half of the pools synchronously
#
SECONDS=0
for i in {0..3}; do
log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i
done
sequential_time=$SECONDS
log_note "sequentially imported 4 pools in $sequential_time seconds"
#
# import half of the pools in parallel
#
SECONDS=0
for i in {4..7}; do
log_must zpool import -d $DEVICE_DIR -f $POOLNAME-$i &
done
wait
parallel_time=$SECONDS
log_note "asyncronously imported 4 pools in $parallel_time seconds"
log_must test $parallel_time -lt $(($sequential_time / 3))
#
# export pools with import delay injectors
#
for i in {4..7}; do
log_must zpool export $POOLNAME-$i
log_must zinject -P import -s 12 $POOLNAME-$i
done
wait
#
# now test zpool import -a
#
SECONDS=0
log_must zpool import -a -d $DEVICE_DIR -f
parallel_time=$SECONDS
log_note "asyncronously imported 4 pools in $parallel_time seconds"
log_must test $parallel_time -lt $(($sequential_time / 3))
log_pass "Pool imports occur in parallel"

View File

@ -0,0 +1,132 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2023 Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg
#
# DESCRIPTION:
# During a pool import, the 'import_progress' kstat contains details
# on the import progress.
#
# STRATEGY:
# 1. Create test pool with several devices
# 2. Generate some ZIL records and spacemap logs
# 3. Export the pool
# 4. Import the pool in the background and monitor the kstat content
# 5. Check the zfs debug messages for import progress
#
verify_runnable "global"
function cleanup
{
log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0
log_must set_tunable64 METASLAB_DEBUG_LOAD 0
destroy_pool $TESTPOOL1
}
log_assert "During a pool import, the 'import_progress' kstat contains " \
"notes on the progress"
log_onexit cleanup
log_must zpool create $TESTPOOL1 $VDEV0 $VDEV1 $VDEV2
typeset guid=$(zpool get -H -o value guid $TESTPOOL1)
log_must zfs create -o recordsize=8k $TESTPOOL1/fs
#
# This dd command works around an issue where ZIL records aren't created
# after freezing the pool unless a ZIL header already exists. Create a file
# synchronously to force ZFS to write one out.
#
log_must dd if=/dev/zero of=/$TESTPOOL1/fs/sync conv=fsync bs=1 count=1
#
# Overwrite some blocks to populate spacemap logs
#
log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200
sync_all_pools
log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200
sync_all_pools
#
# Freeze the pool to retain intent log records
#
log_must zpool freeze $TESTPOOL1
# fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data]
log_must fill_fs /$TESTPOOL1/fs 1 2000 100 1024 R
log_must zpool list -v $TESTPOOL1
#
# Unmount filesystem and export the pool
#
# At this stage the zfs intent log contains
# a set of records to replay.
#
log_must zfs unmount /$TESTPOOL1/fs
log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1
log_must zpool export $TESTPOOL1
log_must set_tunable64 METASLAB_DEBUG_LOAD 1
log_note "Starting zpool import in background at" $(date +'%H:%M:%S')
zpool import -d $DEVICE_DIR -f $guid &
pid=$!
#
# capture progress until import is finished
#
log_note waiting for pid $pid to exit
kstat import_progress
while [[ -d /proc/"$pid" ]]; do
line=$(kstat import_progress | grep -v pool_guid)
if [[ -n $line ]]; then
echo $line
fi
if [[ -f /$TESTPOOL1/fs/00 ]]; then
break;
fi
sleep 0.0001
done
log_note "zpool import completed at" $(date +'%H:%M:%S')
entries=$(kstat dbgmsg | grep "spa_import_progress_set_notes_impl(): 'testpool1'" | wc -l)
log_note "found $entries progress notes in dbgmsg"
log_must test $entries -gt 20
log_must zpool status $TESTPOOL1
log_pass "During a pool import, the 'import_progress' kstat contains " \
"notes on the progress"

View File

@ -67,7 +67,15 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
log_must fio $FIO_SCRIPTS/mkfiles.fio
log_must fio $FIO_SCRIPTS/random_reads.fio
timeout_handler() {
log_fail "${TIMEOUT_MESSAGE}"
}
TIMEOUT_MESSAGE="Time out arcstat_quiescence_noecho l2_size before zpool offline"
trap timeout_handler USR1
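# watchdog: signal USR1 to this shell if quiescence takes longer than 600 seconds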
ppid="$$" && (sleep 600 && kill -USR1 "$ppid") & timeout_pid="$!"
arcstat_quiescence_noecho l2_size
trap - USR1
log_must zpool offline $TESTPOOL $VDEV_CACHE
arcstat_quiescence_noecho l2_size

View File

@ -8,6 +8,7 @@ dist_pkgdata_SCRIPTS = \
mmp_active_import.ksh \
mmp_inactive_import.ksh \
mmp_exported_import.ksh \
mmp_write_slow_disk.ksh \
mmp_write_uberblocks.ksh \
mmp_reset_interval.ksh \
mmp_on_zdb.ksh \

View File

@ -0,0 +1,97 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2024, Klara Inc
#
# DESCRIPTION:
# Verify that long VDEV probes do not cause MMP checks to suspend the pool
# Note: without the PR-15839 fix, this test will suspend the pool.
#
# A device that is returning unexpected errors will trigger a vdev_probe.
# When the device additionally has slow response times, the probe can hold
# the spa config lock as a writer for a long period of time such that the
# mmp uberblock updates stall when trying to acquire the spa config lock.
#
# STRATEGY:
# 1. Create a pool with multiple leaf vdevs
# 2. Enable multihost and multihost_history
# 3. Delay for MMP writes to occur
# 4. Verify that a long VDEV probe didn't cause the MMP check to suspend the pool
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/mmp/mmp.cfg
. $STF_SUITE/tests/functional/mmp/mmp.kshlib
verify_runnable "both"
function cleanup
{
log_must zinject -c all
if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
log_must zpool clear $MMP_POOL
zpool get state $MMP_POOL $MMP_DIR/file.3
zpool events | grep ".fs.zfs." | grep -v "history_event"
fi
poolexists $MMP_POOL && destroy_pool $MMP_POOL
log_must rm -r $MMP_DIR
log_must mmp_clear_hostid
}
log_assert "A long VDEV probe doesn't cause a MMP check suspend"
log_onexit cleanup
MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
# Create a multiple drive pool
log_must zpool events -c
log_must mkdir -p $MMP_DIR
log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
log_must zpool create -f $MMP_POOL \
mirror $MMP_DIR/file.{0,1,2} \
mirror $MMP_DIR/file.{3,4,5}
# Enable MMP
log_must mmp_set_hostid $HOSTID1
log_must zpool set multihost=on $MMP_POOL
clear_mmp_history
# Inject vdev write error along with a delay
log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
sleep 10
sync_pool $MMP_POOL
# Confirm mmp writes to the non-slow disks have taken place
for x in {0,1,2,4}; do
write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
done
# Expect that the pool was not suspended
log_must check_state $MMP_POOL "" "ONLINE"
health=$(zpool list -H -o health $MMP_POOL)
log_note "$MMP_POOL health is $health"
[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
log_pass "A long VDEV probe doesn't cause a MMP check suspend"