2008-11-20 20:01:55 +00:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
2012-12-13 23:24:15 +00:00
|
|
|
|
2008-11-20 20:01:55 +00:00
|
|
|
/*
|
2010-05-28 20:45:14 +00:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2013-12-09 18:37:51 +00:00
|
|
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
2008-11-20 20:01:55 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _SYS_VDEV_H
|
|
|
|
#define _SYS_VDEV_H
|
|
|
|
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/zio.h>
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
#include <sys/space_map.h>
|
|
|
|
#include <sys/fs/zfs.h>
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
2009-01-15 21:59:39 +00:00
|
|
|
/*
 * Kinds of DTL selected by the vdev_dtl_type_t argument of the
 * vdev_dtl_*() routines.  DTL_TYPES is not a kind of its own: because it
 * is listed last it evaluates to the number of kinds that precede it.
 */
typedef enum vdev_dtl_type {
	DTL_MISSING,	/* 0% replication: no copies of the data */
	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
	DTL_TYPES
} vdev_dtl_type_t;
|
|
|
|
|
2011-05-03 22:09:28 +00:00
|
|
|
/*
 * Global integer declared here and defined elsewhere.
 * NOTE(review): the name suggests a tunable that disables cache flushes --
 * confirm against the definition.
 */
extern int zfs_nocacheflush;
|
2008-11-20 20:01:55 +00:00
|
|
|
|
|
|
|
extern int vdev_open(vdev_t *);
|
2010-05-28 20:45:14 +00:00
|
|
|
extern void vdev_open_children(vdev_t *);
|
2012-07-11 20:02:44 +00:00
|
|
|
extern int vdev_validate(vdev_t *, boolean_t);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern void vdev_close(vdev_t *);
|
|
|
|
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
|
|
|
|
extern void vdev_reopen(vdev_t *);
|
|
|
|
extern int vdev_validate_aux(vdev_t *vd);
|
2008-12-03 20:09:06 +00:00
|
|
|
extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
2008-12-03 20:09:06 +00:00
|
|
|
extern boolean_t vdev_is_bootable(vdev_t *vd);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
|
|
|
|
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
|
2015-05-06 16:07:55 +00:00
|
|
|
extern int vdev_count_leaves(spa_t *spa);
|
2009-01-15 21:59:39 +00:00
|
|
|
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
|
|
|
|
uint64_t txg, uint64_t size);
|
|
|
|
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
|
|
|
|
uint64_t txg, uint64_t size);
|
|
|
|
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
|
2017-05-13 00:28:03 +00:00
|
|
|
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
|
|
|
|
int scrub_done);
|
2009-01-15 21:59:39 +00:00
|
|
|
extern boolean_t vdev_dtl_required(vdev_t *vd);
|
2008-12-03 20:09:06 +00:00
|
|
|
extern boolean_t vdev_resilver_needed(vdev_t *vd,
|
|
|
|
uint64_t *minp, uint64_t *maxp);
|
2016-04-11 20:16:57 +00:00
|
|
|
extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
|
|
|
|
dmu_tx_t *tx);
|
|
|
|
extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
|
|
|
|
extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
2010-05-28 20:45:14 +00:00
|
|
|
/* NOTE(review): hold/rele presumably pair up -- confirm in the implementation. */
extern void vdev_hold(vdev_t *);
extern void vdev_rele(vdev_t *);
|
|
|
|
|
2008-11-20 20:01:55 +00:00
|
|
|
/* Metaslab setup/teardown and sizing, plus expand, split and deadman hooks. */
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
|
2010-05-28 20:45:14 +00:00
|
|
|
|
2016-02-29 18:05:23 +00:00
|
|
|
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
|
|
|
|
extern void vdev_clear_stats(vdev_t *vd);
|
2008-12-03 20:09:06 +00:00
|
|
|
extern void vdev_stat_update(zio_t *zio, uint64_t psize);
|
2010-05-28 20:45:14 +00:00
|
|
|
extern void vdev_scan_stat_init(vdev_t *vd);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern void vdev_propagate_state(vdev_t *vd);
|
|
|
|
extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
|
|
|
|
vdev_aux_t aux);
|
|
|
|
|
2010-05-28 20:45:14 +00:00
|
|
|
/* The three deltas are signed (int64_t), so each may be negative. */
extern void vdev_space_update(vdev_t *vd,
    int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
|
|
|
/*
 * Takes a vdev and a psize and returns a uint64_t.
 * NOTE(review): presumably the allocated size corresponding to psize on
 * this vdev -- confirm against the implementation.
 */
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
|
|
|
|
|
2010-05-28 20:45:14 +00:00
|
|
|
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
|
|
|
|
extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
|
|
|
|
vdev_state_t *);
|
|
|
|
extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
|
2008-12-03 20:09:06 +00:00
|
|
|
extern void vdev_clear(spa_t *spa, vdev_t *vd);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
2008-12-03 20:09:06 +00:00
|
|
|
extern boolean_t vdev_is_dead(vdev_t *vd);
|
|
|
|
extern boolean_t vdev_readable(vdev_t *vd);
|
|
|
|
extern boolean_t vdev_writeable(vdev_t *vd);
|
|
|
|
extern boolean_t vdev_allocatable(vdev_t *vd);
|
|
|
|
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
|
|
|
extern void vdev_cache_init(vdev_t *vd);
|
|
|
|
extern void vdev_cache_fini(vdev_t *vd);
|
2013-12-09 18:37:51 +00:00
|
|
|
extern boolean_t vdev_cache_read(zio_t *zio);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern void vdev_cache_write(zio_t *zio);
|
|
|
|
extern void vdev_cache_purge(vdev_t *vd);
|
|
|
|
|
|
|
|
extern void vdev_queue_init(vdev_t *vd);
|
|
|
|
extern void vdev_queue_fini(vdev_t *vd);
|
|
|
|
extern zio_t *vdev_queue_io(zio_t *zio);
|
|
|
|
extern void vdev_queue_io_done(zio_t *zio);
|
2017-12-21 17:13:06 +00:00
|
|
|
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
FreeBSD r256956: Improve ZFS N-way mirror read performance by using load and locality information.
The existing algorithm selects a preferred leaf vdev based on offset of the zio
request modulo the number of members in the mirror. It assumes the devices are
of equal performance and that spreading the requests randomly over both drives
will be sufficient to saturate them. In practice this results in the leaf vdevs
being under utilized.
The new algorithm takes into account the following additional factors:
* Load of the vdevs (number outstanding I/O requests)
* The locality of last queued I/O vs the new I/O request.
Within the locality calculation additional knowledge about the underlying vdev
is considered such as; is the device backing the vdev a rotating media device.
This results in performance increases across the board as well as significant
increases for predominantly streaming loads and for configurations which don't
have evenly performing devices.
The following are results from a setup with 3 Way Mirror with 2 x HD's and
1 x SSD from a basic test running multiple parallel dd's.
With pre-fetch disabled (vfs.zfs.prefetch_disable=1):
== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 161 seconds @ 95 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 297 seconds @ 51 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 54 seconds @ 284 MB/s
With pre-fetch enabled (vfs.zfs.prefetch_disable=0):
== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 91 seconds @ 168 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 108 seconds @ 142 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 48 seconds @ 320 MB/s
In addition to the performance changes the code was also restructured, with
the help of Justin Gibbs, to provide a more logical flow which also ensures
vdevs loads are only calculated from the set of valid candidates.
The following additional sysctls were added to allow the administrator
to tune the behaviour of the load algorithm:
* vfs.zfs.vdev.mirror.rotating_inc
* vfs.zfs.vdev.mirror.rotating_seek_inc
* vfs.zfs.vdev.mirror.rotating_seek_offset
* vfs.zfs.vdev.mirror.non_rotating_inc
* vfs.zfs.vdev.mirror.non_rotating_seek_inc
These changes were based on work started by the zfsonlinux developers:
https://github.com/zfsonlinux/zfs/pull/1487
Reviewed by: gibbs, mav, will
MFC after: 2 weeks
Sponsored by: Multiplay
References:
https://github.com/freebsd/freebsd@5c7a6f5d
https://github.com/freebsd/freebsd@31b7f68d
https://github.com/freebsd/freebsd@e186f564
Performance Testing:
https://github.com/zfsonlinux/zfs/pull/4334#issuecomment-189057141
Porting notes:
- The tunables were adjusted to have ZoL-style names.
- The code was modified to use ZoL's vd_nonrot.
- Fixes were done to make cstyle.pl happy
- Merge conflicts were handled manually
- freebsd/freebsd@e186f564bc946f82c76e0b34c2f0370ed9aea022 by my
colleague Andriy Gapon has been included. It applied perfectly, but
added a cstyle regression.
- This replaces 556011dbec2d10579819078559a77630fc559112 entirely.
- A typo "IO'a" has been corrected to say "IO's"
- Descriptions of new tunables were added to man/man5/zfs-module-parameters.5.
Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #4334
2016-02-13 01:47:22 +00:00
|
|
|
/* Queue length and last offset accessors for a vdev's I/O queue. */
extern int vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
|
FreeBSD r256956: Improve ZFS N-way mirror read performance by using load and locality information.
The existing algorithm selects a preferred leaf vdev based on offset of the zio
request modulo the number of members in the mirror. It assumes the devices are
of equal performance and that spreading the requests randomly over both drives
will be sufficient to saturate them. In practice this results in the leaf vdevs
being under utilized.
The new algorithm takes into account the following additional factors:
* Load of the vdevs (number outstanding I/O requests)
* The locality of last queued I/O vs the new I/O request.
Within the locality calculation additional knowledge about the underlying vdev
is considered such as; is the device backing the vdev a rotating media device.
This results in performance increases across the board as well as significant
increases for predominantly streaming loads and for configurations which don't
have evenly performing devices.
The following are results from a setup with 3 Way Mirror with 2 x HD's and
1 x SSD from a basic test running multiple parallel dd's.
With pre-fetch disabled (vfs.zfs.prefetch_disable=1):
== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 161 seconds @ 95 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 297 seconds @ 51 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 54 seconds @ 284 MB/s
With pre-fetch enabled (vfs.zfs.prefetch_disable=0):
== Stripe Balanced (default) ==
Read 15360MB using bs: 1048576, readers: 3, took 91 seconds @ 168 MB/s
== Load Balanced (zfslinux) ==
Read 15360MB using bs: 1048576, readers: 3, took 108 seconds @ 142 MB/s
== Load Balanced (locality freebsd) ==
Read 15360MB using bs: 1048576, readers: 3, took 48 seconds @ 320 MB/s
In addition to the performance changes the code was also restructured, with
the help of Justin Gibbs, to provide a more logical flow which also ensures
vdevs loads are only calculated from the set of valid candidates.
The following additional sysctls were added to allow the administrator
to tune the behaviour of the load algorithm:
* vfs.zfs.vdev.mirror.rotating_inc
* vfs.zfs.vdev.mirror.rotating_seek_inc
* vfs.zfs.vdev.mirror.rotating_seek_offset
* vfs.zfs.vdev.mirror.non_rotating_inc
* vfs.zfs.vdev.mirror.non_rotating_seek_inc
These changes were based on work started by the zfsonlinux developers:
https://github.com/zfsonlinux/zfs/pull/1487
Reviewed by: gibbs, mav, will
MFC after: 2 weeks
Sponsored by: Multiplay
References:
https://github.com/freebsd/freebsd@5c7a6f5d
https://github.com/freebsd/freebsd@31b7f68d
https://github.com/freebsd/freebsd@e186f564
Performance Testing:
https://github.com/zfsonlinux/zfs/pull/4334#issuecomment-189057141
Porting notes:
- The tunables were adjusted to have ZoL-style names.
- The code was modified to use ZoL's vd_nonrot.
- Fixes were done to make cstyle.pl happy
- Merge conflicts were handled manually
- freebsd/freebsd@e186f564bc946f82c76e0b34c2f0370ed9aea022 by my
colleague Andriy Gapon has been included. It applied perfectly, but
added a cstyle regression.
- This replaces 556011dbec2d10579819078559a77630fc559112 entirely.
- A typo "IO'a" has been corrected to say "IO's"
- Descriptions of new tunables were added to man/man5/zfs-module-parameters.5.
Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #4334
2016-02-13 01:47:22 +00:00
|
|
|
|
2008-11-20 20:01:55 +00:00
|
|
|
/* Config dirty/clean marking; vdev_config_sync() takes an array of svdcount vdevs. */
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
2008-12-03 20:09:06 +00:00
|
|
|
/* State dirty/clean marking for a vdev. */
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
|
|
|
|
|
2010-05-28 20:45:14 +00:00
|
|
|
/*
 * Flags accepted by vdev_config_generate() through its flags argument.
 * Each enumerator occupies its own bit, so values can be OR-ed together.
 */
typedef enum vdev_config_flag {
	VDEV_CONFIG_SPARE = 1 << 0,
	VDEV_CONFIG_L2CACHE = 1 << 1,
	VDEV_CONFIG_REMOVING = 1 << 2,
	VDEV_CONFIG_MOS = 1 << 3
} vdev_config_flag_t;
|
|
|
|
|
|
|
|
extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
|
2008-11-20 20:01:55 +00:00
|
|
|
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
|
2010-05-28 20:45:14 +00:00
|
|
|
boolean_t getstats, vdev_config_flag_t flags);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
|
|
|
/*
 * Label routines
 */
struct uberblock;	/* forward declaration; only pointers to it appear here */
|
|
|
|
/*
 * Takes a psize, a label index l and an offset; returns a uint64_t.
 * NOTE(review): presumably the device offset of `offset` within label `l`
 * on a device of size psize -- confirm against the implementation.
 */
extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
|
2008-12-03 20:09:06 +00:00
|
|
|
/*
 * Takes a psize and an offset; returns an int.
 * NOTE(review): presumably the index of the label containing `offset`
 * -- confirm against the implementation.
 * The first parameter was spelled "psise"; it is now "psize", matching
 * vdev_label_offset().  Prototype parameter names do not affect callers.
 */
extern int vdev_label_number(uint64_t psize, uint64_t offset);
|
2012-12-14 20:38:04 +00:00
|
|
|
/* Label config read, uberblock load and config-stats generation. */
extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 03:20:35 +00:00
|
|
|
extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
|
|
|
|
offset, uint64_t size, zio_done_func_t *done, void *private, int flags);
|
2008-11-20 20:01:55 +00:00
|
|
|
|
|
|
|
/*
 * Reason a label is being initialized; passed to vdev_label_init() as
 * its reason argument.
 */
typedef enum {
	VDEV_LABEL_CREATE,	/* create/add a new device */
	VDEV_LABEL_REPLACE,	/* replace an existing device */
	VDEV_LABEL_SPARE,	/* add a new hot spare */
	VDEV_LABEL_REMOVE,	/* remove an existing device */
	VDEV_LABEL_L2CACHE,	/* add an L2ARC cache device */
	VDEV_LABEL_SPLIT	/* generating new label for split-off dev */
} vdev_labeltype_t;
|
|
|
|
|
|
|
|
/*
 * Takes a vdev, a txg and a vdev_labeltype_t reason; returns an int.
 * NOTE(review): presumably 0 on success and an errno value on failure
 * -- confirm against the implementation.
 */
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_VDEV_H */
|