Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-13 21:51:51 +00:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Copyright (c) 2018 Intel Corporation.
|
|
|
|
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <zlib.h>
|
|
|
|
#include <zfs_fletcher.h>
|
|
|
|
#include <sys/vdev_draid.h>
|
|
|
|
#include <sys/nvpair.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
|
|
|
|
/*
 * Row count requested whenever a brand new permutation map is generated.
 */
#define	MAP_ROWS_DEFAULT	256

/*
 * nvlist key names used for the individual fields of a dRAID map when
 * the map is stored as an nvlist.
 */
#define	MAP_SEED		"seed"
#define	MAP_CHECKSUM		"checksum"
#define	MAP_WORST_RATIO		"worst_ratio"
#define	MAP_AVG_RATIO		"avg_ratio"
#define	MAP_CHILDREN		"children"
#define	MAP_NPERMS		"nperms"
#define	MAP_PERMS		"perms"
|
|
|
|
|
|
|
|
/*
 * Write the command synopsis to stderr and terminate the process with
 * exit status 1.  This function does not return.
 */
static void
draid_usage(void)
{
	static const char synopsis[] =
	    "usage: draid command args ...\n"
	    "Available commands are:\n"
	    "\n"
	    "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n"
	    "\tdraid verify [-rv] FILE\n"
	    "\tdraid dump [-v] [-m min] [-n max] FILE\n"
	    "\tdraid table FILE\n"
	    "\tdraid merge FILE SRC SRC...\n";

	/* The text contains no conversion specifiers, emit it verbatim. */
	(void) fputs(synopsis, stderr);
	exit(1);
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
read_map(const char *filename, nvlist_t **allcfgs)
|
|
|
|
{
|
|
|
|
int block_size = 131072;
|
|
|
|
int buf_size = 131072;
|
|
|
|
int tmp_size, error;
|
|
|
|
char *tmp_buf;
|
|
|
|
|
|
|
|
struct stat64 stat;
|
|
|
|
if (lstat64(filename, &stat) != 0)
|
|
|
|
return (errno);
|
|
|
|
|
|
|
|
if (stat.st_size == 0 ||
|
|
|
|
!(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) {
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
gzFile fp = gzopen(filename, "rb");
|
|
|
|
if (fp == Z_NULL)
|
|
|
|
return (errno);
|
|
|
|
|
|
|
|
char *buf = malloc(buf_size);
|
|
|
|
if (buf == NULL) {
|
|
|
|
(void) gzclose(fp);
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
ssize_t rc, bytes = 0;
|
|
|
|
while (!gzeof(fp)) {
|
|
|
|
rc = gzread(fp, buf + bytes, block_size);
|
|
|
|
if ((rc < 0) || (rc == 0 && !gzeof(fp))) {
|
|
|
|
free(buf);
|
|
|
|
(void) gzclose(fp);
|
|
|
|
(void) gzerror(fp, &error);
|
|
|
|
return (error);
|
|
|
|
} else {
|
|
|
|
bytes += rc;
|
|
|
|
|
|
|
|
if (bytes + block_size >= buf_size) {
|
|
|
|
tmp_size = 2 * buf_size;
|
|
|
|
tmp_buf = malloc(tmp_size);
|
|
|
|
if (tmp_buf == NULL) {
|
|
|
|
free(buf);
|
|
|
|
(void) gzclose(fp);
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
memcpy(tmp_buf, buf, bytes);
|
|
|
|
free(buf);
|
|
|
|
buf = tmp_buf;
|
|
|
|
buf_size = tmp_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) gzclose(fp);
|
|
|
|
|
|
|
|
error = nvlist_unpack(buf, bytes, allcfgs, 0);
|
|
|
|
free(buf);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read a map from the specified filename. A file contains multiple maps
|
|
|
|
* which are indexed by the number of children. The caller is responsible
|
|
|
|
* for freeing the configuration returned.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
read_map_key(const char *filename, char *key, nvlist_t **cfg)
|
|
|
|
{
|
|
|
|
nvlist_t *allcfgs, *foundcfg = NULL;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = read_map(filename, &allcfgs);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
nvlist_lookup_nvlist(allcfgs, key, &foundcfg);
|
|
|
|
if (foundcfg != NULL) {
|
|
|
|
nvlist_dup(foundcfg, cfg, KM_SLEEP);
|
|
|
|
error = 0;
|
|
|
|
} else {
|
|
|
|
error = ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
nvlist_free(allcfgs);
|
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write all mappings to the map file.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
write_map(const char *filename, nvlist_t *allcfgs)
|
|
|
|
{
|
|
|
|
size_t buflen = 0;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
char *buf = malloc(buflen);
|
|
|
|
if (buf == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
|
|
|
|
error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
|
|
|
|
if (error) {
|
|
|
|
free(buf);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Atomically update the file using a temporary file and the
|
|
|
|
* traditional unlink then rename steps. This code provides
|
|
|
|
* no locking, it only guarantees the packed nvlist on disk
|
|
|
|
* is updated atomically and is internally consistent.
|
|
|
|
*/
|
2022-02-02 19:27:35 +00:00
|
|
|
char *tmpname = calloc(1, MAXPATHLEN);
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-13 21:51:51 +00:00
|
|
|
if (tmpname == NULL) {
|
|
|
|
free(buf);
|
|
|
|
return (ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename);
|
|
|
|
|
|
|
|
int fd = mkstemp(tmpname);
|
|
|
|
if (fd < 0) {
|
|
|
|
error = errno;
|
|
|
|
free(buf);
|
|
|
|
free(tmpname);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
(void) close(fd);
|
|
|
|
|
|
|
|
gzFile fp = gzopen(tmpname, "w9b");
|
|
|
|
if (fp == Z_NULL) {
|
|
|
|
error = errno;
|
|
|
|
free(buf);
|
|
|
|
free(tmpname);
|
|
|
|
return (errno);
|
|
|
|
}
|
|
|
|
|
|
|
|
ssize_t rc, bytes = 0;
|
|
|
|
while (bytes < buflen) {
|
|
|
|
size_t size = MIN(buflen - bytes, 131072);
|
|
|
|
rc = gzwrite(fp, buf + bytes, size);
|
|
|
|
if (rc < 0) {
|
|
|
|
free(buf);
|
|
|
|
(void) gzerror(fp, &error);
|
|
|
|
(void) gzclose(fp);
|
|
|
|
(void) unlink(tmpname);
|
|
|
|
free(tmpname);
|
|
|
|
return (error);
|
|
|
|
} else if (rc == 0) {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
bytes += rc;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
free(buf);
|
|
|
|
(void) gzclose(fp);
|
|
|
|
|
|
|
|
if (bytes != buflen) {
|
|
|
|
(void) unlink(tmpname);
|
|
|
|
free(tmpname);
|
|
|
|
return (EIO);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unlink the previous config file and replace it with the updated
|
|
|
|
* version. If we're able to unlink the file then directory is
|
|
|
|
* writable by us and the subsequent rename should never fail.
|
|
|
|
*/
|
|
|
|
error = unlink(filename);
|
|
|
|
if (error != 0 && errno != ENOENT) {
|
|
|
|
error = errno;
|
|
|
|
(void) unlink(tmpname);
|
|
|
|
free(tmpname);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
error = rename(tmpname, filename);
|
|
|
|
if (error != 0) {
|
|
|
|
error = errno;
|
|
|
|
(void) unlink(tmpname);
|
|
|
|
free(tmpname);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
free(tmpname);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the dRAID map to the file and write it out.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
write_map_key(const char *filename, char *key, draid_map_t *map,
|
|
|
|
double worst_ratio, double avg_ratio)
|
|
|
|
{
|
|
|
|
nvlist_t *nv_cfg, *allcfgs;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the configuration to an existing or new file. The new
|
|
|
|
* configuration will replace an existing configuration with the
|
|
|
|
* same key if it has a lower ratio and is therefore better.
|
|
|
|
*/
|
|
|
|
error = read_map(filename, &allcfgs);
|
|
|
|
if (error == ENOENT) {
|
|
|
|
allcfgs = fnvlist_alloc();
|
|
|
|
} else if (error != 0) {
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg);
|
|
|
|
if (error == 0) {
|
|
|
|
uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg,
|
|
|
|
MAP_WORST_RATIO);
|
|
|
|
double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0;
|
|
|
|
|
|
|
|
if (worst_ratio < nv_worst_ratio) {
|
|
|
|
/* Replace old map with the more balanced new map. */
|
|
|
|
fnvlist_remove(allcfgs, key);
|
|
|
|
} else {
|
|
|
|
/* The old map is preferable, keep it. */
|
|
|
|
nvlist_free(allcfgs);
|
|
|
|
return (EEXIST);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nvlist_t *cfg = fnvlist_alloc();
|
|
|
|
fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed);
|
|
|
|
fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum);
|
|
|
|
fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children);
|
|
|
|
fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms);
|
|
|
|
fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms,
|
|
|
|
map->dm_children * map->dm_nperms * sizeof (uint8_t));
|
|
|
|
|
|
|
|
fnvlist_add_uint64(cfg, MAP_WORST_RATIO,
|
|
|
|
(uint64_t)(worst_ratio * 1000.0));
|
|
|
|
fnvlist_add_uint64(cfg, MAP_AVG_RATIO,
|
|
|
|
(uint64_t)(avg_ratio * 1000.0));
|
|
|
|
|
|
|
|
error = nvlist_add_nvlist(allcfgs, key, cfg);
|
|
|
|
if (error == 0)
|
|
|
|
error = write_map(filename, allcfgs);
|
|
|
|
|
|
|
|
nvlist_free(cfg);
|
|
|
|
nvlist_free(allcfgs);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_map(draid_map_t *map, char *key, double worst_ratio, double avg_ratio,
|
|
|
|
int verbose)
|
|
|
|
{
|
|
|
|
if (verbose == 0) {
|
|
|
|
return;
|
|
|
|
} else if (verbose == 1) {
|
|
|
|
printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f "
|
|
|
|
"avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed,
|
|
|
|
worst_ratio, avg_ratio);
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
printf(" \"%s\":\n"
|
|
|
|
" seed: 0x%016llx\n"
|
|
|
|
" checksum: 0x%016llx\n"
|
|
|
|
" worst_ratio: %2.03f\n"
|
|
|
|
" avg_ratio: %2.03f\n"
|
|
|
|
" children: %llu\n"
|
|
|
|
" nperms: %llu\n",
|
|
|
|
key, (u_longlong_t)map->dm_seed,
|
|
|
|
(u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio,
|
|
|
|
(u_longlong_t)map->dm_children,
|
|
|
|
(u_longlong_t)map->dm_nperms);
|
|
|
|
|
|
|
|
if (verbose > 2) {
|
|
|
|
printf(" perms = {\n");
|
|
|
|
for (int i = 0; i < map->dm_nperms; i++) {
|
|
|
|
printf(" { ");
|
|
|
|
for (int j = 0; j < map->dm_children; j++) {
|
|
|
|
printf("%3d%s ", map->dm_perms[
|
|
|
|
i * map->dm_children + j],
|
|
|
|
j < map->dm_children - 1 ?
|
|
|
|
"," : "");
|
|
|
|
}
|
|
|
|
printf(" },\n");
|
|
|
|
}
|
|
|
|
printf(" }\n");
|
|
|
|
} else if (verbose == 2) {
|
|
|
|
printf(" draid_perms = <omitted>\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
dump_map_nv(char *key, nvlist_t *cfg, int verbose)
|
|
|
|
{
|
|
|
|
draid_map_t map;
|
|
|
|
uint_t c;
|
|
|
|
|
|
|
|
uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO);
|
|
|
|
uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
|
|
|
|
|
|
|
|
map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
|
|
|
|
map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
|
|
|
|
map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
|
|
|
|
map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
|
|
|
|
nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c);
|
|
|
|
|
|
|
|
dump_map(&map, key, (double)worst_ratio / 1000.0,
|
|
|
|
avg_ratio / 1000.0, verbose);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Print a summary of the mapping.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
dump_map_key(const char *filename, char *key, int verbose)
|
|
|
|
{
|
|
|
|
nvlist_t *cfg;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = read_map_key(filename, key, &cfg);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
dump_map_nv(key, cfg, verbose);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a new permutation map for evaluation.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed,
|
|
|
|
draid_map_t **mapp)
|
|
|
|
{
|
|
|
|
draid_map_t *map;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
map = malloc(sizeof (draid_map_t));
|
|
|
|
if (map == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
|
|
|
|
map->dm_children = children;
|
|
|
|
map->dm_nperms = nperms;
|
|
|
|
map->dm_seed = seed;
|
|
|
|
map->dm_checksum = 0;
|
|
|
|
|
|
|
|
error = vdev_draid_generate_perms(map, &map->dm_perms);
|
|
|
|
if (error) {
|
|
|
|
free(map);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
*mapp = map;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate the fixed permutation map for N children.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
alloc_fixed_map(uint64_t children, draid_map_t **mapp)
|
|
|
|
{
|
|
|
|
const draid_map_t *fixed_map;
|
|
|
|
draid_map_t *map;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = vdev_draid_lookup_map(children, &fixed_map);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
map = malloc(sizeof (draid_map_t));
|
|
|
|
if (map == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
|
|
|
|
memcpy(map, fixed_map, sizeof (draid_map_t));
|
|
|
|
VERIFY3U(map->dm_checksum, !=, 0);
|
|
|
|
|
|
|
|
error = vdev_draid_generate_perms(map, &map->dm_perms);
|
|
|
|
if (error) {
|
|
|
|
free(map);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
*mapp = map;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a permutation map.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
free_map(draid_map_t *map)
|
|
|
|
{
|
|
|
|
free(map->dm_perms);
|
|
|
|
free(map);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if dev is in the provided list of faulted devices.
|
|
|
|
*/
|
|
|
|
static inline boolean_t
|
|
|
|
is_faulted(int *faulted_devs, int nfaulted, int dev)
|
|
|
|
{
|
|
|
|
for (int i = 0; i < nfaulted; i++)
|
|
|
|
if (faulted_devs[i] == dev)
|
|
|
|
return (B_TRUE);
|
|
|
|
|
|
|
|
return (B_FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Evaluate how resilvering I/O will be distributed given a list of faulted
|
|
|
|
* vdevs. As a simplification we assume one IO is sufficient to repair each
|
|
|
|
* damaged device in a group.
|
|
|
|
*/
|
|
|
|
static double
|
|
|
|
eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares,
|
|
|
|
int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios)
|
|
|
|
{
|
|
|
|
uint64_t children = map->dm_children;
|
|
|
|
uint64_t ngroups = 1;
|
|
|
|
uint64_t ndisks = children - nspares;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the minimum number of groups required to fill a slice.
|
|
|
|
*/
|
|
|
|
while (ngroups * (groupwidth) % (children - nspares) != 0)
|
|
|
|
ngroups++;
|
|
|
|
|
|
|
|
int *ios = calloc(map->dm_children, sizeof (uint64_t));
|
|
|
|
|
|
|
|
/* Resilver all rows */
|
|
|
|
for (int i = 0; i < map->dm_nperms; i++) {
|
|
|
|
uint8_t *row = &map->dm_perms[i * map->dm_children];
|
|
|
|
|
|
|
|
/* Resilver all groups with faulted drives */
|
|
|
|
for (int j = 0; j < ngroups; j++) {
|
|
|
|
uint64_t spareidx = map->dm_children - nspares;
|
|
|
|
boolean_t repair_needed = B_FALSE;
|
|
|
|
|
|
|
|
/* See if any devices in this group are faulted */
|
|
|
|
uint64_t groupstart = (j * groupwidth) % ndisks;
|
|
|
|
|
|
|
|
for (int k = 0; k < groupwidth; k++) {
|
|
|
|
uint64_t groupidx = (groupstart + k) % ndisks;
|
|
|
|
|
|
|
|
repair_needed = is_faulted(faulted_devs,
|
|
|
|
nfaulted, row[groupidx]);
|
|
|
|
if (repair_needed)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (repair_needed == B_FALSE)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This group is degraded. Calculate the number of
|
|
|
|
* reads the non-faulted drives require and the number
|
|
|
|
* of writes to the distributed hot spare for this row.
|
|
|
|
*/
|
|
|
|
for (int k = 0; k < groupwidth; k++) {
|
|
|
|
uint64_t groupidx = (groupstart + k) % ndisks;
|
|
|
|
|
|
|
|
if (!is_faulted(faulted_devs, nfaulted,
|
|
|
|
row[groupidx])) {
|
|
|
|
ios[row[groupidx]]++;
|
|
|
|
} else if (nspares > 0) {
|
|
|
|
while (is_faulted(faulted_devs,
|
|
|
|
nfaulted, row[spareidx])) {
|
|
|
|
spareidx++;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT3U(spareidx, <, map->dm_children);
|
|
|
|
ios[row[spareidx]]++;
|
|
|
|
spareidx++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*min_child_ios = INT_MAX;
|
|
|
|
*max_child_ios = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the drives with fewest and most required I/O. These values
|
|
|
|
* are used to calculate the imbalance ratio. To avoid returning an
|
|
|
|
* infinite value for permutations which have children that perform
|
|
|
|
* no IO a floor of 1 IO per child is set. This ensures a meaningful
|
|
|
|
* ratio is returned for comparison and it is not an uncommon when
|
|
|
|
* there are a large number of children.
|
|
|
|
*/
|
|
|
|
for (int i = 0; i < map->dm_children; i++) {
|
|
|
|
|
|
|
|
if (is_faulted(faulted_devs, nfaulted, i)) {
|
|
|
|
ASSERT0(ios[i]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ios[i] == 0)
|
|
|
|
ios[i] = 1;
|
|
|
|
|
|
|
|
if (ios[i] < *min_child_ios)
|
|
|
|
*min_child_ios = ios[i];
|
|
|
|
|
|
|
|
if (ios[i] > *max_child_ios)
|
|
|
|
*max_child_ios = ios[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT3S(*min_child_ios, !=, INT_MAX);
|
|
|
|
ASSERT3S(*max_child_ios, !=, 0);
|
|
|
|
|
|
|
|
double ratio = (double)(*max_child_ios) / (double)(*min_child_ios);
|
|
|
|
|
|
|
|
free(ios);
|
|
|
|
|
|
|
|
return (ratio);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Evaluate the quality of the permutation mapping by considering possible
|
|
|
|
* device failures. Returns the imbalance ratio for the worst mapping which
|
|
|
|
* is defined to be the largest number of child IOs over the fewest number
|
|
|
|
* child IOs. A value of 1.0 indicates the mapping is perfectly balance and
|
|
|
|
* all children perform an equal amount of work during reconstruction.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop)
|
|
|
|
{
|
|
|
|
uint64_t children = map->dm_children;
|
|
|
|
double worst_ratio = 1.0;
|
|
|
|
double sum = 0;
|
|
|
|
int worst_min_ios = 0, worst_max_ios = 0;
|
|
|
|
int n = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When there are only 2 children there can be no distributed
|
|
|
|
* spare and no resilver to evaluate. Default to a ratio of 1.0
|
|
|
|
* for this degenerate case.
|
|
|
|
*/
|
|
|
|
if (children == VDEV_DRAID_MIN_CHILDREN) {
|
|
|
|
*worst_ratiop = 1.0;
|
|
|
|
*avg_ratiop = 1.0;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Score the mapping as if it had either 1 or 2 distributed spares.
|
|
|
|
*/
|
|
|
|
for (int nspares = 1; nspares <= 2; nspares++) {
|
|
|
|
uint64_t faults = nspares;
|
|
|
|
|
|
|
|
/*
|
2021-04-03 01:38:53 +00:00
|
|
|
* Score groupwidths up to 19. This value was chosen as the
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-13 21:51:51 +00:00
|
|
|
* largest reasonable width (16d+3p). dRAID pools may be still
|
|
|
|
* be created with wider stripes but they are not considered in
|
|
|
|
* this analysis in order to optimize for the most common cases.
|
|
|
|
*/
|
|
|
|
for (uint64_t groupwidth = 2;
|
|
|
|
groupwidth <= MIN(children - nspares, 19);
|
|
|
|
groupwidth++) {
|
|
|
|
int faulted_devs[2];
|
|
|
|
int min_ios, max_ios;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Score possible devices faults. This is limited
|
|
|
|
* to exactly one fault per distributed spare for
|
|
|
|
* the purposes of this similation.
|
|
|
|
*/
|
|
|
|
for (int f1 = 0; f1 < children; f1++) {
|
|
|
|
faulted_devs[0] = f1;
|
|
|
|
double ratio;
|
|
|
|
|
|
|
|
if (faults == 1) {
|
|
|
|
ratio = eval_resilver(map, groupwidth,
|
|
|
|
nspares, faulted_devs, faults,
|
|
|
|
&min_ios, &max_ios);
|
|
|
|
|
|
|
|
if (ratio > worst_ratio) {
|
|
|
|
worst_ratio = ratio;
|
|
|
|
worst_min_ios = min_ios;
|
|
|
|
worst_max_ios = max_ios;
|
|
|
|
}
|
|
|
|
|
|
|
|
sum += ratio;
|
|
|
|
n++;
|
|
|
|
} else if (faults == 2) {
|
|
|
|
for (int f2 = f1 + 1; f2 < children;
|
|
|
|
f2++) {
|
|
|
|
faulted_devs[1] = f2;
|
|
|
|
|
|
|
|
ratio = eval_resilver(map,
|
|
|
|
groupwidth, nspares,
|
|
|
|
faulted_devs, faults,
|
|
|
|
&min_ios, &max_ios);
|
|
|
|
|
|
|
|
if (ratio > worst_ratio) {
|
|
|
|
worst_ratio = ratio;
|
|
|
|
worst_min_ios = min_ios;
|
|
|
|
worst_max_ios = max_ios;
|
|
|
|
}
|
|
|
|
|
|
|
|
sum += ratio;
|
|
|
|
n++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*worst_ratiop = worst_ratio;
|
|
|
|
*avg_ratiop = sum / n;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Log the min/max io values for particularly unbalanced maps.
|
|
|
|
* Since the maps are generated entirely randomly these are possible
|
|
|
|
* be exceedingly unlikely. We log it for possible investigation.
|
|
|
|
*/
|
|
|
|
if (worst_ratio > 100.0) {
|
|
|
|
dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2);
|
|
|
|
printf("worst_min_ios=%d worst_max_ios=%d\n",
|
|
|
|
worst_min_ios, worst_max_ios);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
eval_maps(uint64_t children, int passes, uint64_t *map_seed,
|
|
|
|
draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
|
|
|
|
{
|
|
|
|
draid_map_t *best_map = NULL;
|
|
|
|
double best_worst_ratio = 1000.0;
|
|
|
|
double best_avg_ratio = 1000.0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform the requested number of passes evaluating randomly
|
|
|
|
* generated permutation maps. Only the best version is kept.
|
|
|
|
*/
|
|
|
|
for (int i = 0; i < passes; i++) {
|
|
|
|
double worst_ratio, avg_ratio;
|
|
|
|
draid_map_t *map;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the next seed and generate a new candidate map.
|
|
|
|
*/
|
|
|
|
error = alloc_new_map(children, MAP_ROWS_DEFAULT,
|
|
|
|
vdev_draid_rand(map_seed), &map);
|
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Consider maps with a lower worst_ratio to be of higher
|
|
|
|
* quality. Some maps may have a lower avg_ratio but they
|
|
|
|
* are discarded since they might include some particularly
|
2021-04-03 01:38:53 +00:00
|
|
|
* imbalanced permutations. The average is tracked to in
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-13 21:51:51 +00:00
|
|
|
* order to get a sense of the average permutation quality.
|
|
|
|
*/
|
|
|
|
eval_decluster(map, &worst_ratio, &avg_ratio);
|
|
|
|
|
|
|
|
if (best_map == NULL || worst_ratio < best_worst_ratio) {
|
|
|
|
|
|
|
|
if (best_map != NULL)
|
|
|
|
free_map(best_map);
|
|
|
|
|
|
|
|
best_map = map;
|
|
|
|
best_worst_ratio = worst_ratio;
|
|
|
|
best_avg_ratio = avg_ratio;
|
|
|
|
} else {
|
|
|
|
free_map(map);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* After determining the best map generate a checksum over the full
|
|
|
|
* permutation array. This checksum is verified when opening a dRAID
|
|
|
|
* pool to ensure the generated in memory permutations are correct.
|
|
|
|
*/
|
|
|
|
zio_cksum_t cksum;
|
|
|
|
fletcher_4_native_varsize(best_map->dm_perms,
|
|
|
|
sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
|
|
|
|
&cksum);
|
|
|
|
best_map->dm_checksum = cksum.zc_word[0];
|
|
|
|
|
|
|
|
*best_mapp = best_map;
|
|
|
|
*best_ratiop = best_worst_ratio;
|
|
|
|
*avg_ratiop = best_avg_ratio;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
draid_generate(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
char filename[MAXPATHLEN];
|
|
|
|
uint64_t map_seed;
|
|
|
|
int c, fd, error, verbose = 0, passes = 1, continuous = 0;
|
|
|
|
int min_children = VDEV_DRAID_MIN_CHILDREN;
|
|
|
|
int max_children = VDEV_DRAID_MAX_CHILDREN;
|
|
|
|
int restarts = 0;
|
|
|
|
|
|
|
|
while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
|
|
|
|
switch (c) {
|
|
|
|
case 'c':
|
|
|
|
continuous++;
|
|
|
|
break;
|
|
|
|
case 'm':
|
|
|
|
min_children = (int)strtol(optarg, NULL, 0);
|
|
|
|
if (min_children < VDEV_DRAID_MIN_CHILDREN) {
|
|
|
|
(void) fprintf(stderr, "A minimum of 2 "
|
|
|
|
"children are required.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
max_children = (int)strtol(optarg, NULL, 0);
|
|
|
|
if (max_children > VDEV_DRAID_MAX_CHILDREN) {
|
|
|
|
(void) fprintf(stderr, "A maximum of %d "
|
|
|
|
"children are allowed.\n",
|
|
|
|
VDEV_DRAID_MAX_CHILDREN);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 'p':
|
|
|
|
passes = (int)strtol(optarg, NULL, 0);
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
/*
|
|
|
|
* 0 - Only log when a better map is added to the file.
|
|
|
|
* 1 - Log the current best map for each child count.
|
|
|
|
* Minimal output on a single summary line.
|
|
|
|
* 2 - Log the current best map for each child count.
|
|
|
|
* More verbose includes most map fields.
|
|
|
|
* 3 - Log the current best map for each child count.
|
|
|
|
* Very verbose all fields including the full map.
|
|
|
|
*/
|
|
|
|
verbose++;
|
|
|
|
break;
|
|
|
|
case ':':
|
|
|
|
(void) fprintf(stderr,
|
|
|
|
"missing argument for '%c' option\n", optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
case '?':
|
|
|
|
(void) fprintf(stderr, "invalid option '%c'\n",
|
|
|
|
optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (argc > optind) {
|
|
|
|
bzero(filename, MAXPATHLEN);
|
|
|
|
strncpy(filename, argv[optind], MAXPATHLEN - 1);
|
|
|
|
} else {
|
|
|
|
(void) fprintf(stderr, "A FILE must be specified.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
restart:
|
|
|
|
/*
|
|
|
|
* Start with a fresh seed from /dev/urandom.
|
|
|
|
*/
|
|
|
|
fd = open("/dev/urandom", O_RDONLY);
|
|
|
|
if (fd < 0) {
|
|
|
|
printf("Unable to open /dev/urandom: %s\n:", strerror(errno));
|
|
|
|
return (1);
|
|
|
|
} else {
|
|
|
|
ssize_t bytes = sizeof (map_seed);
|
|
|
|
ssize_t bytes_read = 0;
|
|
|
|
|
|
|
|
while (bytes_read < bytes) {
|
|
|
|
ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read,
|
|
|
|
bytes - bytes_read);
|
|
|
|
if (rc < 0) {
|
|
|
|
printf("Unable to read /dev/urandom: %s\n:",
|
|
|
|
strerror(errno));
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
bytes_read += rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
(void) close(fd);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (restarts == 0)
|
|
|
|
printf("Writing generated mappings to '%s':\n", filename);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Generate maps for all requested child counts. The best map for
|
|
|
|
* each child count is written out to the specified file. If the file
|
|
|
|
* already contains a better mapping this map will not be added.
|
|
|
|
*/
|
|
|
|
for (uint64_t children = min_children;
|
|
|
|
children <= max_children; children++) {
|
|
|
|
char key[8] = { 0 };
|
|
|
|
draid_map_t *map;
|
|
|
|
double worst_ratio = 1000.0;
|
|
|
|
double avg_ratio = 1000.0;
|
|
|
|
|
|
|
|
error = eval_maps(children, passes, &map_seed, &map,
|
|
|
|
&worst_ratio, &avg_ratio);
|
|
|
|
if (error) {
|
|
|
|
printf("Error eval_maps(): %s\n", strerror(error));
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (worst_ratio < 1.0 || avg_ratio < 1.0) {
|
|
|
|
printf("Error ratio < 1.0: worst_ratio = %2.03f "
|
|
|
|
"avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(key, 7, "%llu", (u_longlong_t)children);
|
|
|
|
error = write_map_key(filename, key, map, worst_ratio,
|
|
|
|
avg_ratio);
|
|
|
|
if (error == 0) {
|
|
|
|
/* The new map was added to the file. */
|
|
|
|
dump_map(map, key, worst_ratio, avg_ratio,
|
|
|
|
MAX(verbose, 1));
|
|
|
|
} else if (error == EEXIST) {
|
|
|
|
/* The existing map was preferable and kept. */
|
|
|
|
if (verbose > 0)
|
|
|
|
dump_map_key(filename, key, verbose);
|
|
|
|
} else {
|
|
|
|
printf("Error write_map_key(): %s\n", strerror(error));
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
free_map(map);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When the continuous option is set restart at the minimum number of
|
|
|
|
* children instead of exiting. This option is useful as a mechanism
|
|
|
|
* to continuous try and refine the discovered permutations.
|
|
|
|
*/
|
|
|
|
if (continuous) {
|
|
|
|
restarts++;
|
|
|
|
printf("Restarting by request (-c): %d\n", restarts);
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify each map in the file by generating its in-memory permutation array
|
|
|
|
* and comfirming its checksum is correct.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
draid_verify(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
char filename[MAXPATHLEN];
|
|
|
|
int n = 0, c, error, verbose = 1;
|
|
|
|
int check_ratios = 0;
|
|
|
|
|
|
|
|
while ((c = getopt(argc, argv, ":rv")) != -1) {
|
|
|
|
switch (c) {
|
|
|
|
case 'r':
|
|
|
|
check_ratios++;
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
verbose++;
|
|
|
|
break;
|
|
|
|
case ':':
|
|
|
|
(void) fprintf(stderr,
|
|
|
|
"missing argument for '%c' option\n", optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
case '?':
|
|
|
|
(void) fprintf(stderr, "invalid option '%c'\n",
|
|
|
|
optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (argc > optind) {
|
|
|
|
char *abspath = malloc(MAXPATHLEN);
|
|
|
|
if (abspath == NULL)
|
|
|
|
return (ENOMEM);
|
|
|
|
|
|
|
|
bzero(filename, MAXPATHLEN);
|
|
|
|
if (realpath(argv[optind], abspath) != NULL)
|
|
|
|
strncpy(filename, abspath, MAXPATHLEN - 1);
|
|
|
|
else
|
|
|
|
strncpy(filename, argv[optind], MAXPATHLEN - 1);
|
|
|
|
|
|
|
|
free(abspath);
|
|
|
|
} else {
|
|
|
|
(void) fprintf(stderr, "A FILE must be specified.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("Verifying permutation maps: '%s'\n", filename);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lookup hardcoded permutation map for each valid number of children
|
|
|
|
* and verify a generated map has the correct checksum. Then compare
|
|
|
|
* the generated map values with the nvlist map values read from the
|
|
|
|
* reference file to cross-check the permutation.
|
|
|
|
*/
|
|
|
|
for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
|
|
|
|
children <= VDEV_DRAID_MAX_CHILDREN;
|
|
|
|
children++) {
|
|
|
|
draid_map_t *map;
|
|
|
|
char key[8];
|
|
|
|
|
|
|
|
bzero(key, 8);
|
|
|
|
snprintf(key, 8, "%llu", (u_longlong_t)children);
|
|
|
|
|
|
|
|
error = alloc_fixed_map(children, &map);
|
|
|
|
if (error) {
|
|
|
|
printf("Error alloc_fixed_map() failed: %s\n",
|
|
|
|
error == ECKSUM ? "Invalid checksum" :
|
|
|
|
strerror(error));
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
|
|
|
|
uint8_t *nv_perms;
|
|
|
|
nvlist_t *cfg;
|
|
|
|
uint_t c;
|
|
|
|
|
|
|
|
error = read_map_key(filename, key, &cfg);
|
|
|
|
if (error != 0) {
|
|
|
|
printf("Error read_map_key() failed: %s\n",
|
|
|
|
strerror(error));
|
|
|
|
free_map(map);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
|
|
|
|
nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
|
|
|
|
nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
|
|
|
|
nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
|
|
|
|
nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms, &c);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compare draid_map_t and nvlist reference values.
|
|
|
|
*/
|
|
|
|
if (map->dm_seed != nv_seed) {
|
|
|
|
printf("Error different seeds: 0x%016llx != "
|
|
|
|
"0x%016llx\n", (u_longlong_t)map->dm_seed,
|
|
|
|
(u_longlong_t)nv_seed);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (map->dm_checksum != nv_checksum) {
|
|
|
|
printf("Error different checksums: 0x%016llx "
|
|
|
|
"!= 0x%016llx\n",
|
|
|
|
(u_longlong_t)map->dm_checksum,
|
|
|
|
(u_longlong_t)nv_checksum);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (map->dm_children != nv_children) {
|
|
|
|
printf("Error different children: %llu "
|
|
|
|
"!= %llu\n", (u_longlong_t)map->dm_children,
|
|
|
|
(u_longlong_t)nv_children);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (map->dm_nperms != nv_nperms) {
|
|
|
|
printf("Error different nperms: %llu "
|
|
|
|
"!= %llu\n", (u_longlong_t)map->dm_nperms,
|
|
|
|
(u_longlong_t)nv_nperms);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (uint64_t i = 0; i < nv_children * nv_nperms; i++) {
|
|
|
|
if (map->dm_perms[i] != nv_perms[i]) {
|
|
|
|
printf("Error different perms[%llu]: "
|
|
|
|
"%d != %d\n", (u_longlong_t)i,
|
|
|
|
(int)map->dm_perms[i],
|
|
|
|
(int)nv_perms[i]);
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For good measure recalculate the worst and average
|
|
|
|
* ratios and confirm they match the nvlist values.
|
|
|
|
*/
|
|
|
|
if (check_ratios) {
|
|
|
|
uint64_t nv_worst_ratio, nv_avg_ratio;
|
|
|
|
double worst_ratio, avg_ratio;
|
|
|
|
|
|
|
|
eval_decluster(map, &worst_ratio, &avg_ratio);
|
|
|
|
|
|
|
|
nv_worst_ratio = fnvlist_lookup_uint64(cfg,
|
|
|
|
MAP_WORST_RATIO);
|
|
|
|
nv_avg_ratio = fnvlist_lookup_uint64(cfg,
|
|
|
|
MAP_AVG_RATIO);
|
|
|
|
|
|
|
|
if (worst_ratio < 1.0 || avg_ratio < 1.0) {
|
|
|
|
printf("Error ratio out of range %2.03f, "
|
|
|
|
"%2.03f\n", worst_ratio, avg_ratio);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((uint64_t)(worst_ratio * 1000.0) !=
|
|
|
|
nv_worst_ratio) {
|
|
|
|
printf("Error different worst_ratio %2.03f "
|
|
|
|
"!= %2.03f\n", (double)nv_worst_ratio /
|
|
|
|
1000.0, worst_ratio);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
|
|
|
|
printf("Error different average_ratio %2.03f "
|
|
|
|
"!= %2.03f\n", (double)nv_avg_ratio /
|
|
|
|
1000.0, avg_ratio);
|
|
|
|
error = EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (error) {
|
|
|
|
free_map(map);
|
|
|
|
nvlist_free(cfg);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (verbose > 0) {
|
|
|
|
printf("- %llu children: good\n",
|
|
|
|
(u_longlong_t)children);
|
|
|
|
}
|
|
|
|
n++;
|
|
|
|
|
|
|
|
free_map(map);
|
|
|
|
nvlist_free(cfg);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (n != (VDEV_DRAID_MAX_CHILDREN - 1)) {
|
|
|
|
printf("Error permutation maps missing: %d / %d checked\n",
|
|
|
|
n, VDEV_DRAID_MAX_CHILDREN - 1);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("Successfully verified %d / %d permutation maps\n",
|
|
|
|
n, VDEV_DRAID_MAX_CHILDREN - 1);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Dump the contents of the specified mapping(s) for inspection.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
draid_dump(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
char filename[MAXPATHLEN];
|
|
|
|
int c, error, verbose = 1;
|
|
|
|
int min_children = VDEV_DRAID_MIN_CHILDREN;
|
|
|
|
int max_children = VDEV_DRAID_MAX_CHILDREN;
|
|
|
|
|
|
|
|
while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
|
|
|
|
switch (c) {
|
|
|
|
case 'm':
|
|
|
|
min_children = (int)strtol(optarg, NULL, 0);
|
|
|
|
if (min_children < 2) {
|
|
|
|
(void) fprintf(stderr, "A minimum of 2 "
|
|
|
|
"children are required.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
case 'n':
|
|
|
|
max_children = (int)strtol(optarg, NULL, 0);
|
|
|
|
if (max_children > VDEV_DRAID_MAX_CHILDREN) {
|
|
|
|
(void) fprintf(stderr, "A maximum of %d "
|
|
|
|
"children are allowed.\n",
|
|
|
|
VDEV_DRAID_MAX_CHILDREN);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 'v':
|
|
|
|
verbose++;
|
|
|
|
break;
|
|
|
|
case ':':
|
|
|
|
(void) fprintf(stderr,
|
|
|
|
"missing argument for '%c' option\n", optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
case '?':
|
|
|
|
(void) fprintf(stderr, "invalid option '%c'\n",
|
|
|
|
optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (argc > optind) {
|
|
|
|
bzero(filename, MAXPATHLEN);
|
|
|
|
strncpy(filename, argv[optind], MAXPATHLEN - 1);
|
|
|
|
} else {
|
|
|
|
(void) fprintf(stderr, "A FILE must be specified.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Dump maps for the requested child counts.
|
|
|
|
*/
|
|
|
|
for (uint64_t children = min_children;
|
|
|
|
children <= max_children; children++) {
|
|
|
|
char key[8] = { 0 };
|
|
|
|
|
|
|
|
snprintf(key, 7, "%llu", (u_longlong_t)children);
|
|
|
|
error = dump_map_key(filename, key, verbose);
|
|
|
|
if (error) {
|
|
|
|
printf("Error dump_map_key(): %s\n", strerror(error));
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2021-04-03 01:38:53 +00:00
|
|
|
* Print all of the mappings as a C formatted draid_map_t array. This table
|
|
|
|
* is found in the module/zcommon/zfs_draid.c file and is the definitive
|
Distributed Spare (dRAID) Feature
This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID. This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to pool with a failed device.
A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`. No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.
zpool create <pool> draid[1,2,3] <vdevs...>
Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons. The supported options include:
zpool create <pool> \
draid[<parity>][:<data>d][:<children>c][:<spares>s] \
<vdevs...>
- draid[parity] - Parity level (default 1)
- draid[:<data>d] - Data devices per group (default 8)
- draid[:<children>c] - Expected number of child vdevs
- draid[:<spares>s] - Distributed hot spares (default 0)
Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.
```
pool: tank
state: ONLINE
config:
NAME STATE READ WRITE CKSUM
slag7 ONLINE 0 0 0
draid2:8d:68c:2s-0 ONLINE 0 0 0
L0 ONLINE 0 0 0
L1 ONLINE 0 0 0
...
U25 ONLINE 0 0 0
U26 ONLINE 0 0 0
spare-53 ONLINE 0 0 0
U27 ONLINE 0 0 0
draid2-0-0 ONLINE 0 0 0
U28 ONLINE 0 0 0
U29 ONLINE 0 0 0
...
U42 ONLINE 0 0 0
U43 ONLINE 0 0 0
special
mirror-1 ONLINE 0 0 0
L5 ONLINE 0 0 0
U5 ONLINE 0 0 0
mirror-2 ONLINE 0 0 0
L6 ONLINE 0 0 0
U6 ONLINE 0 0 0
spares
draid2-0-0 INUSE currently in use
draid2-0-1 AVAIL
```
When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command. These options are leverages
by zloop.sh to test a wide range of dRAID configurations.
-K draid|raidz|random - kind of RAID to test
-D <value> - dRAID data drives per group
-S <value> - dRAID distributed hot spares
-R <value> - RAID parity (raidz or dRAID)
The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated provide test coverage for the
dRAID feature.
Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
2020-11-13 21:51:51 +00:00
|
|
|
* source for all mapping used by dRAID. It cannot be updated without
|
|
|
|
* changing the dRAID on disk format.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
draid_table(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
char filename[MAXPATHLEN];
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (argc > optind) {
|
|
|
|
bzero(filename, MAXPATHLEN);
|
|
|
|
strncpy(filename, argv[optind], MAXPATHLEN - 1);
|
|
|
|
} else {
|
|
|
|
(void) fprintf(stderr, "A FILE must be specified.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("static const draid_map_t "
|
|
|
|
"draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
|
|
|
|
|
|
|
|
for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
|
|
|
|
children <= VDEV_DRAID_MAX_CHILDREN;
|
|
|
|
children++) {
|
|
|
|
uint64_t seed, checksum, nperms, avg_ratio;
|
|
|
|
nvlist_t *cfg;
|
|
|
|
char key[8];
|
|
|
|
|
|
|
|
bzero(key, 8);
|
|
|
|
snprintf(key, 8, "%llu", (u_longlong_t)children);
|
|
|
|
|
|
|
|
error = read_map_key(filename, key, &cfg);
|
|
|
|
if (error != 0) {
|
|
|
|
printf("Error read_map_key() failed: %s\n",
|
|
|
|
strerror(error));
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
|
|
|
|
checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
|
|
|
|
children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
|
|
|
|
nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
|
|
|
|
avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
|
|
|
|
|
|
|
|
printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
|
|
|
|
"/* %2.03f */\n", (u_longlong_t)children,
|
|
|
|
(u_longlong_t)nperms, (u_longlong_t)seed,
|
|
|
|
(u_longlong_t)checksum, (double)avg_ratio / 1000.0);
|
|
|
|
|
|
|
|
nvlist_free(cfg);
|
|
|
|
}
|
|
|
|
|
|
|
|
printf("};\n");
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
|
|
|
|
{
|
|
|
|
nvlist_t *srccfgs;
|
|
|
|
nvpair_t *elem = NULL;
|
|
|
|
int error, merged = 0;
|
|
|
|
|
|
|
|
error = read_map(srcfilename, &srccfgs);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
|
|
|
|
uint64_t nv_worst_ratio;
|
|
|
|
uint64_t allcfg_worst_ratio;
|
|
|
|
nvlist_t *cfg, *allcfg;
|
|
|
|
char *key;
|
|
|
|
|
|
|
|
switch (nvpair_type(elem)) {
|
|
|
|
case DATA_TYPE_NVLIST:
|
|
|
|
|
|
|
|
(void) nvpair_value_nvlist(elem, &cfg);
|
|
|
|
key = nvpair_name(elem);
|
|
|
|
|
|
|
|
nv_worst_ratio = fnvlist_lookup_uint64(cfg,
|
|
|
|
MAP_WORST_RATIO);
|
|
|
|
|
|
|
|
error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
|
|
|
|
if (error == 0) {
|
|
|
|
allcfg_worst_ratio = fnvlist_lookup_uint64(
|
|
|
|
allcfg, MAP_WORST_RATIO);
|
|
|
|
|
|
|
|
if (nv_worst_ratio < allcfg_worst_ratio) {
|
|
|
|
fnvlist_remove(allcfgs, key);
|
|
|
|
error = nvlist_add_nvlist(allcfgs,
|
|
|
|
key, cfg);
|
|
|
|
merged++;
|
|
|
|
}
|
|
|
|
} else if (error == ENOENT) {
|
|
|
|
error = nvlist_add_nvlist(allcfgs, key, cfg);
|
|
|
|
merged++;
|
|
|
|
} else {
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nvlist_free(srccfgs);
|
|
|
|
|
|
|
|
*mergedp = merged;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Merge the best map for each child count found in the listed files into
|
|
|
|
* a new file. This allows 'draid generate' to be run in parallel and for
|
|
|
|
* the results maps to be combined.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
draid_merge(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
char filename[MAXPATHLEN];
|
|
|
|
int c, error, total_merged = 0, verbose = 0;
|
|
|
|
nvlist_t *allcfgs;
|
|
|
|
|
|
|
|
while ((c = getopt(argc, argv, ":v")) != -1) {
|
|
|
|
switch (c) {
|
|
|
|
case 'v':
|
|
|
|
verbose++;
|
|
|
|
break;
|
|
|
|
case ':':
|
|
|
|
(void) fprintf(stderr,
|
|
|
|
"missing argument for '%c' option\n", optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
case '?':
|
|
|
|
(void) fprintf(stderr, "invalid option '%c'\n",
|
|
|
|
optopt);
|
|
|
|
draid_usage();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (argc < 4) {
|
|
|
|
(void) fprintf(stderr,
|
|
|
|
"A FILE and multiple SRCs must be specified.\n");
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
bzero(filename, MAXPATHLEN);
|
|
|
|
strncpy(filename, argv[optind], MAXPATHLEN - 1);
|
|
|
|
optind++;
|
|
|
|
|
|
|
|
error = read_map(filename, &allcfgs);
|
|
|
|
if (error == ENOENT) {
|
|
|
|
allcfgs = fnvlist_alloc();
|
|
|
|
} else if (error != 0) {
|
|
|
|
printf("Error read_map(): %s\n", strerror(error));
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (optind < argc) {
|
|
|
|
char srcfilename[MAXPATHLEN];
|
|
|
|
int merged = 0;
|
|
|
|
|
|
|
|
bzero(srcfilename, MAXPATHLEN);
|
|
|
|
strncpy(srcfilename, argv[optind], MAXPATHLEN - 1);
|
|
|
|
|
|
|
|
error = draid_merge_impl(allcfgs, srcfilename, &merged);
|
|
|
|
if (error) {
|
|
|
|
printf("Error draid_merge_impl(): %s\n",
|
|
|
|
strerror(error));
|
|
|
|
nvlist_free(allcfgs);
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
total_merged += merged;
|
|
|
|
printf("Merged %d key(s) from '%s' into '%s'\n", merged,
|
|
|
|
srcfilename, filename);
|
|
|
|
|
|
|
|
optind++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (total_merged > 0)
|
|
|
|
write_map(filename, allcfgs);
|
|
|
|
|
|
|
|
printf("Merged a total of %d key(s) into '%s'\n", total_merged,
|
|
|
|
filename);
|
|
|
|
|
|
|
|
nvlist_free(allcfgs);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
|
|
|
if (argc < 2)
|
|
|
|
draid_usage();
|
|
|
|
|
|
|
|
char *subcommand = argv[1];
|
|
|
|
|
|
|
|
if (strcmp(subcommand, "generate") == 0) {
|
|
|
|
return (draid_generate(argc - 1, argv + 1));
|
|
|
|
} else if (strcmp(subcommand, "verify") == 0) {
|
|
|
|
return (draid_verify(argc - 1, argv + 1));
|
|
|
|
} else if (strcmp(subcommand, "dump") == 0) {
|
|
|
|
return (draid_dump(argc - 1, argv + 1));
|
|
|
|
} else if (strcmp(subcommand, "table") == 0) {
|
|
|
|
return (draid_table(argc - 1, argv + 1));
|
|
|
|
} else if (strcmp(subcommand, "merge") == 0) {
|
|
|
|
return (draid_merge(argc - 1, argv + 1));
|
|
|
|
} else {
|
|
|
|
draid_usage();
|
|
|
|
}
|
|
|
|
}
|