2008-11-20 20:01:55 +00:00
|
|
|
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
|
|
|
|
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */
|
|
|
|
|
|
|
|
#ifndef _SYS_SPA_IMPL_H
|
|
|
|
#define _SYS_SPA_IMPL_H
|
|
|
|
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/vdev.h>
|
|
|
|
#include <sys/metaslab.h>
|
|
|
|
#include <sys/dmu.h>
|
|
|
|
#include <sys/dsl_pool.h>
|
|
|
|
#include <sys/uberblock_impl.h>
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/avl.h>
|
|
|
|
#include <sys/refcount.h>
|
|
|
|
#include <sys/bplist.h>
|
2010-05-28 20:45:14 +00:00
|
|
|
#include <sys/bpobj.h>
|
2014-06-05 21:19:08 +00:00
|
|
|
#include <sys/zfeature.h>
|
2013-12-09 18:37:51 +00:00
|
|
|
#include <zfeature_common.h>
|
2008-11-20 20:01:55 +00:00
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
typedef struct spa_error_entry {
|
2014-06-25 18:37:59 +00:00
|
|
|
zbookmark_phys_t se_bookmark;
|
|
|
|
char *se_name;
|
|
|
|
avl_node_t se_avl;
|
2008-11-20 20:01:55 +00:00
|
|
|
} spa_error_entry_t;
|
|
|
|
|
|
|
|
/*
 * Bookkeeping header for the pool history log: five 64-bit
 * offsets/counters describing the extent of the log.
 */
typedef struct spa_history_phys {
	uint64_t sh_pool_create_len;	/* ending offset of zpool create */
	uint64_t sh_phys_max_off;	/* physical EOF */
	uint64_t sh_bof;		/* logical BOF */
	uint64_t sh_eof;		/* logical EOF */
	uint64_t sh_records_lost;	/* num of records overwritten */
} spa_history_phys_t;
|
|
|
|
|
|
|
|
struct spa_aux_vdev {
|
|
|
|
uint64_t sav_object; /* MOS object for device list */
|
|
|
|
nvlist_t *sav_config; /* cached device config */
|
|
|
|
vdev_t **sav_vdevs; /* devices */
|
|
|
|
int sav_count; /* number devices */
|
|
|
|
boolean_t sav_sync; /* sync the device list */
|
|
|
|
nvlist_t **sav_pending; /* pending device additions */
|
|
|
|
uint_t sav_npending; /* # pending devices */
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct spa_config_lock {
|
|
|
|
kmutex_t scl_lock;
|
|
|
|
kthread_t *scl_writer;
|
2008-12-03 20:09:06 +00:00
|
|
|
int scl_write_wanted;
|
2008-11-20 20:01:55 +00:00
|
|
|
kcondvar_t scl_cv;
|
|
|
|
refcount_t scl_count;
|
|
|
|
} spa_config_lock_t;
|
|
|
|
|
2008-12-03 20:09:06 +00:00
|
|
|
typedef struct spa_config_dirent {
|
|
|
|
list_node_t scd_link;
|
|
|
|
char *scd_path;
|
|
|
|
} spa_config_dirent_t;
|
|
|
|
|
2013-05-06 19:24:30 +00:00
|
|
|
/*
 * Kinds of zio taskq.  ZIO_TASKQ_TYPES is the number of kinds and is
 * used as an array dimension, so it must remain the last enumerator.
 */
typedef enum zio_taskq_type {
	ZIO_TASKQ_ISSUE = 0,
	ZIO_TASKQ_ISSUE_HIGH,
	ZIO_TASKQ_INTERRUPT,
	ZIO_TASKQ_INTERRUPT_HIGH,
	ZIO_TASKQ_TYPES
} zio_taskq_type_t;
|
2008-12-03 20:09:06 +00:00
|
|
|
|
2010-05-28 20:45:14 +00:00
|
|
|
/*
 * State machine for the zpool-poolname process.  The state transitions
 * are done as follows:
 *
 *	From			To			Routine
 *	SPA_PROC_NONE		-> SPA_PROC_CREATED	spa_activate()
 *	SPA_PROC_CREATED	-> SPA_PROC_ACTIVE	spa_thread()
 *	SPA_PROC_ACTIVE		-> SPA_PROC_DEACTIVATE	spa_deactivate()
 *	SPA_PROC_DEACTIVATE	-> SPA_PROC_GONE	spa_thread()
 *	SPA_PROC_GONE		-> SPA_PROC_NONE	spa_deactivate()
 */
typedef enum spa_proc_state {
	SPA_PROC_NONE,		/* spa_proc = &p0, no process created */
	SPA_PROC_CREATED,	/* spa_activate() has proc, is waiting */
	SPA_PROC_ACTIVE,	/* taskqs created, spa_proc set */
	SPA_PROC_DEACTIVATE,	/* spa_deactivate() requests process exit */
	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
} spa_proc_state_t;
|
|
|
|
|
2013-05-06 19:24:30 +00:00
|
|
|
typedef struct spa_taskqs {
|
|
|
|
uint_t stqs_count;
|
|
|
|
taskq_t **stqs_taskq;
|
|
|
|
} spa_taskqs_t;
|
|
|
|
|
2016-04-11 20:16:57 +00:00
|
|
|
/*
 * Action to take on the all-vdev ZAP (AVZ).
 */
typedef enum spa_all_vdev_zap_action {
	AVZ_ACTION_NONE = 0,
	AVZ_ACTION_DESTROY,	/* Destroy all per-vdev ZAPs and the AVZ. */
	AVZ_ACTION_REBUILD,	/* Populate the new AVZ, see spa_avz_rebuild */
	AVZ_ACTION_INITIALIZE
} spa_avz_action_t;
|
|
|
|
|
2008-11-20 20:01:55 +00:00
|
|
|
struct spa {
|
|
|
|
/*
|
|
|
|
* Fields protected by spa_namespace_lock.
|
|
|
|
*/
|
2016-06-15 21:28:36 +00:00
|
|
|
char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
|
2011-11-15 19:01:27 +00:00
|
|
|
char *spa_comment; /* comment */
|
2008-11-20 20:01:55 +00:00
|
|
|
avl_node_t spa_avl; /* node in spa_namespace_avl */
|
|
|
|
nvlist_t *spa_config; /* last synced config */
|
|
|
|
nvlist_t *spa_config_syncing; /* currently syncing config */
|
2010-05-28 20:45:14 +00:00
|
|
|
nvlist_t *spa_config_splitting; /* config for splitting */
|
2010-08-26 21:24:34 +00:00
|
|
|
nvlist_t *spa_load_info; /* info and errors from load */
|
2008-11-20 20:01:55 +00:00
|
|
|
uint64_t spa_config_txg; /* txg of last config change */
|
|
|
|
int spa_sync_pass; /* iterate-to-convergence */
|
2008-12-03 20:09:06 +00:00
|
|
|
pool_state_t spa_state; /* pool state */
|
2008-11-20 20:01:55 +00:00
|
|
|
int spa_inject_ref; /* injection references */
|
|
|
|
uint8_t spa_sync_on; /* sync threads are running */
|
|
|
|
spa_load_state_t spa_load_state; /* current load operation */
|
2010-08-26 21:24:34 +00:00
|
|
|
uint64_t spa_import_flags; /* import specific flags */
|
2013-05-06 19:24:30 +00:00
|
|
|
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
|
2008-11-20 20:01:55 +00:00
|
|
|
dsl_pool_t *spa_dsl_pool;
|
2012-12-13 23:24:15 +00:00
|
|
|
boolean_t spa_is_initializing; /* true while opening pool */
|
2008-11-20 20:01:55 +00:00
|
|
|
metaslab_class_t *spa_normal_class; /* normal data class */
|
|
|
|
metaslab_class_t *spa_log_class; /* intent log data class */
|
|
|
|
uint64_t spa_first_txg; /* first txg after spa_open() */
|
|
|
|
uint64_t spa_final_txg; /* txg of export/destroy */
|
|
|
|
uint64_t spa_freeze_txg; /* freeze pool at this txg */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint64_t spa_load_max_txg; /* best initial ub_txg */
|
|
|
|
uint64_t spa_claim_max_txg; /* highest claimed birth txg */
|
2018-08-12 22:22:03 +00:00
|
|
|
inode_timespec_t spa_loaded_ts; /* 1st successful open time */
|
2008-11-20 20:01:55 +00:00
|
|
|
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
|
2015-04-02 03:44:32 +00:00
|
|
|
kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
|
|
|
|
list_t spa_evicting_os_list; /* Objsets being evicted. */
|
|
|
|
kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
|
2008-11-20 20:01:55 +00:00
|
|
|
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
|
|
|
|
vdev_t *spa_root_vdev; /* top-level vdev container */
|
2015-05-20 04:14:01 +00:00
|
|
|
int spa_min_ashift; /* of vdevs in normal class */
|
|
|
|
int spa_max_ashift; /* of vdevs in normal class */
|
2011-11-11 22:07:54 +00:00
|
|
|
uint64_t spa_config_guid; /* config pool guid */
|
|
|
|
uint64_t spa_load_guid; /* spa_load initialized guid */
|
2012-12-14 20:38:04 +00:00
|
|
|
uint64_t spa_last_synced_guid; /* last synced guid */
|
2008-12-03 20:09:06 +00:00
|
|
|
list_t spa_config_dirty_list; /* vdevs with dirty config */
|
|
|
|
list_t spa_state_dirty_list; /* vdevs with dirty state */
|
2016-10-14 00:59:18 +00:00
|
|
|
kmutex_t spa_alloc_lock;
|
|
|
|
avl_tree_t spa_alloc_tree;
|
2008-11-20 20:01:55 +00:00
|
|
|
spa_aux_vdev_t spa_spares; /* hot spares */
|
|
|
|
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
|
2012-12-13 23:24:15 +00:00
|
|
|
nvlist_t *spa_label_features; /* Features for reading MOS */
|
2008-11-20 20:01:55 +00:00
|
|
|
uint64_t spa_config_object; /* MOS object for pool config */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint64_t spa_config_generation; /* config generation number */
|
2008-11-20 20:01:55 +00:00
|
|
|
uint64_t spa_syncing_txg; /* txg currently syncing */
|
2010-05-28 20:45:14 +00:00
|
|
|
bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
|
|
|
|
bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
|
2016-06-15 22:47:05 +00:00
|
|
|
zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
|
|
|
|
/* checksum context templates */
|
|
|
|
kmutex_t spa_cksum_tmpls_lock;
|
|
|
|
void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
|
2008-11-20 20:01:55 +00:00
|
|
|
uberblock_t spa_ubsync; /* last synced uberblock */
|
|
|
|
uberblock_t spa_uberblock; /* current uberblock */
|
2010-05-28 20:45:14 +00:00
|
|
|
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
|
2010-08-26 21:24:34 +00:00
|
|
|
uint64_t spa_last_io; /* lbolt of last non-scan I/O */
|
2008-11-20 20:01:55 +00:00
|
|
|
kmutex_t spa_scrub_lock; /* resilver/scrub lock */
|
|
|
|
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
|
|
|
|
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
|
|
|
|
uint8_t spa_scrub_active; /* active or suspended? */
|
|
|
|
uint8_t spa_scrub_type; /* type of scrub we're doing */
|
|
|
|
uint8_t spa_scrub_finished; /* indicator to rotate logs */
|
2008-12-03 20:09:06 +00:00
|
|
|
uint8_t spa_scrub_started; /* started since last boot */
|
|
|
|
uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint64_t spa_scan_pass_start; /* start time per pass/reboot */
|
2017-07-07 05:16:13 +00:00
|
|
|
uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */
|
|
|
|
uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
|
2008-11-20 20:01:55 +00:00
|
|
|
kmutex_t spa_async_lock; /* protect async state */
|
|
|
|
kthread_t *spa_async_thread; /* thread doing async task */
|
|
|
|
int spa_async_suspended; /* async tasks suspended */
|
|
|
|
kcondvar_t spa_async_cv; /* wait for thread_exit() */
|
|
|
|
uint16_t spa_async_tasks; /* async task mask */
|
|
|
|
char *spa_root; /* alternate root directory */
|
|
|
|
uint64_t spa_ena; /* spa-wide ereport ENA */
|
2010-05-28 20:45:14 +00:00
|
|
|
int spa_last_open_failed; /* error if last open failed */
|
|
|
|
uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */
|
|
|
|
uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */
|
|
|
|
uint64_t spa_load_txg; /* ub txg that loaded */
|
|
|
|
uint64_t spa_load_txg_ts; /* timestamp from that ub */
|
|
|
|
uint64_t spa_load_meta_errors; /* verify metadata err count */
|
|
|
|
uint64_t spa_load_data_errors; /* verify data err count */
|
|
|
|
uint64_t spa_verify_min_txg; /* start txg of verify scrub */
|
2008-11-20 20:01:55 +00:00
|
|
|
kmutex_t spa_errlog_lock; /* error log lock */
|
|
|
|
uint64_t spa_errlog_last; /* last error log object */
|
|
|
|
uint64_t spa_errlog_scrub; /* scrub error log object */
|
|
|
|
kmutex_t spa_errlist_lock; /* error list/ereport lock */
|
|
|
|
avl_tree_t spa_errlist_last; /* last error list */
|
|
|
|
avl_tree_t spa_errlist_scrub; /* scrub error list */
|
|
|
|
uint64_t spa_deflate; /* should we deflate? */
|
|
|
|
uint64_t spa_history; /* history object */
|
|
|
|
kmutex_t spa_history_lock; /* history lock */
|
|
|
|
vdev_t *spa_pending_vdev; /* pending vdev additions */
|
|
|
|
kmutex_t spa_props_lock; /* property lock */
|
|
|
|
uint64_t spa_pool_props_object; /* object for properties */
|
|
|
|
uint64_t spa_bootfs; /* default boot filesystem */
|
2008-12-03 20:09:06 +00:00
|
|
|
uint64_t spa_failmode; /* failure mode for the pool */
|
|
|
|
uint64_t spa_delegation; /* delegation on/off */
|
|
|
|
list_t spa_config_list; /* previous cache file(s) */
|
2014-09-17 06:59:43 +00:00
|
|
|
/* per-CPU array of root of async I/O: */
|
|
|
|
zio_t **spa_async_zio_root;
|
2008-12-03 20:09:06 +00:00
|
|
|
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
|
|
|
|
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
|
|
|
|
kcondvar_t spa_suspend_cv; /* notification of resume */
|
2018-03-15 17:56:55 +00:00
|
|
|
zio_suspend_reason_t spa_suspended; /* pool is suspended */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint8_t spa_claiming; /* pool is doing zil_claim() */
|
2011-07-26 19:08:52 +00:00
|
|
|
boolean_t spa_debug; /* debug enabled? */
|
2008-12-03 20:09:06 +00:00
|
|
|
boolean_t spa_is_root; /* pool is root */
|
|
|
|
int spa_minref; /* num refs when first opened */
|
2009-01-15 21:59:39 +00:00
|
|
|
int spa_mode; /* FREAD | FWRITE */
|
2008-12-03 20:09:06 +00:00
|
|
|
spa_log_state_t spa_log_state; /* log state */
|
2009-07-02 22:44:48 +00:00
|
|
|
uint64_t spa_autoexpand; /* lun expansion on/off */
|
2010-05-28 20:45:14 +00:00
|
|
|
ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */
|
|
|
|
uint64_t spa_ddt_stat_object; /* DDT statistics */
|
2016-12-02 23:59:35 +00:00
|
|
|
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint64_t spa_dedup_ditto; /* dedup ditto threshold */
|
|
|
|
uint64_t spa_dedup_checksum; /* default dedup checksum */
|
|
|
|
uint64_t spa_dspace; /* dspace in normal class */
|
|
|
|
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
|
|
|
|
kmutex_t spa_proc_lock; /* protects spa_proc* */
|
|
|
|
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
|
|
|
|
spa_proc_state_t spa_proc_state; /* see definition */
|
2010-08-26 18:45:02 +00:00
|
|
|
proc_t *spa_proc; /* "zpool-poolname" process */
|
2010-05-28 20:45:14 +00:00
|
|
|
uint64_t spa_did; /* if procp != p0, did of t1 */
|
|
|
|
boolean_t spa_autoreplace; /* autoreplace set in open */
|
|
|
|
int spa_vdev_locks; /* locks grabbed */
|
|
|
|
uint64_t spa_creation_version; /* version at pool creation */
|
2012-12-13 23:24:15 +00:00
|
|
|
uint64_t spa_prev_software_version; /* See ub_software_version */
|
|
|
|
uint64_t spa_feat_for_write_obj; /* required to write to pool */
|
|
|
|
uint64_t spa_feat_for_read_obj; /* required to read from pool */
|
|
|
|
uint64_t spa_feat_desc_obj; /* Feature descriptions */
|
2013-12-09 18:37:51 +00:00
|
|
|
uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
|
2015-04-23 19:32:59 +00:00
|
|
|
kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */
|
2015-02-26 20:24:11 +00:00
|
|
|
nvlist_t *spa_feat_stats; /* Cache of enabled features */
|
2013-12-09 18:37:51 +00:00
|
|
|
/* cache feature refcounts */
|
|
|
|
uint64_t spa_feat_refcount_cache[SPA_FEATURES];
|
2013-04-29 22:49:23 +00:00
|
|
|
taskqid_t spa_deadman_tqid; /* Task id */
|
|
|
|
uint64_t spa_deadman_calls; /* number of deadman calls */
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 03:01:20 +00:00
|
|
|
hrtime_t spa_sync_starttime; /* starting time of spa_sync */
|
2013-04-29 22:49:23 +00:00
|
|
|
uint64_t spa_deadman_synctime; /* deadman expiration timer */
|
2016-04-11 20:16:57 +00:00
|
|
|
uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */
|
|
|
|
spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */
|
2014-02-21 03:57:17 +00:00
|
|
|
uint64_t spa_errata; /* errata issues detected */
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 23:09:05 +00:00
|
|
|
spa_stats_t spa_stats; /* assorted spa statistics */
|
2015-12-31 16:38:59 +00:00
|
|
|
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
|
2017-01-03 17:31:18 +00:00
|
|
|
taskq_t *spa_zvol_taskq; /* Taskq for minor management */
|
2018-03-30 19:10:01 +00:00
|
|
|
taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */
|
Multi-modifier protection (MMP)
Add multihost=on|off pool property to control MMP. When enabled
a new thread writes uberblocks to the last slot in each label, at a
set frequency, to indicate to other hosts the pool is actively imported.
These uberblocks are the last synced uberblock with an updated
timestamp. Property defaults to off.
During tryimport, find the "best" uberblock (newest txg and timestamp)
repeatedly, checking for change in the found uberblock. Include the
results of the activity test in the config returned by tryimport.
These results are reported to user in "zpool import".
Allow the user to control the period between MMP writes, and the
duration of the activity test on import, via a new module parameter
zfs_multihost_interval. The period is specified in milliseconds. The
activity test duration is calculated from this value, and from the
mmp_delay in the "best" uberblock found initially.
Add a kstat interface to export statistics about Multiple Modifier
Protection (MMP) updates. Include the last synced txg number, the
timestamp, the delay since the last MMP update, the VDEV GUID, the VDEV
label that received the last MMP update, and the VDEV path. Abbreviated
output below.
$ cat /proc/spl/kstat/zfs/mypool/multihost
31 0 0x01 10 880 105092382393521 105144180101111
txg timestamp mmp_delay vdev_guid vdev_label vdev_path
20468 261337 250274925 68396651780 3 /dev/sda
20468 261339 252023374 6267402363293 1 /dev/sdc
20468 261340 252000858 6698080955233 1 /dev/sdx
20468 261341 251980635 783892869810 2 /dev/sdy
20468 261342 253385953 8923255792467 3 /dev/sdd
20468 261344 253336622 042125143176 0 /dev/sdab
20468 261345 253310522 1200778101278 2 /dev/sde
20468 261346 253286429 0950576198362 2 /dev/sdt
20468 261347 253261545 96209817917 3 /dev/sds
20468 261349 253238188 8555725937673 3 /dev/sdb
Add a new tunable zfs_multihost_history to specify the number of MMP
updates to store history for. By default it is set to zero meaning that
no MMP statistics are stored.
When using ztest to generate activity, for automated tests of the MMP
function, some test functions interfere with the test. For example, the
pool is exported to run zdb and then imported again. Add a new ztest
function, "-M", to alter ztest behavior to prevent this.
Add new tests to verify the new functionality. Tests provided by
Giuseppe Di Natale.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #745
Closes #6279
2017-07-08 03:20:35 +00:00
|
|
|
uint64_t spa_multihost; /* multihost aware (mmp) */
|
|
|
|
mmp_thread_t spa_mmp; /* multihost mmp thread */
|
Add visibility in to arc_read
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.
For each arc_read call, the following information is exported:
* A unique identifier (uint64_t)
* The time the entry was added to the list (hrtime_t)
(*not* wall clock time; relative to the other entries on the list)
* The objset ID (uint64_t)
* The object number (uint64_t)
* The indirection level (uint64_t)
* The block ID (uint64_t)
* The name of the function originating the arc_read call (char[24])
* The arc_flags from the arc_read call (uint32_t)
* The PID of the reading thread (pid_t)
* The command or name of thread originating read (char[16])
From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.
There is still some work to be done, but this should serve as a good
starting point.
Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.
Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2013-09-06 23:09:05 +00:00
|
|
|
|
2008-11-20 20:01:55 +00:00
|
|
|
/*
|
2013-06-11 17:12:34 +00:00
|
|
|
* spa_refcount & spa_config_lock must be the last elements
|
2008-11-20 20:01:55 +00:00
|
|
|
* because refcount_t changes size based on compilation options.
|
|
|
|
* In order for the MDB module to function correctly, the other
|
|
|
|
* fields must remain in the same location.
|
|
|
|
*/
|
2008-12-03 20:09:06 +00:00
|
|
|
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
|
2008-11-20 20:01:55 +00:00
|
|
|
refcount_t spa_refcount; /* number of opens */
|
2016-10-04 18:46:10 +00:00
|
|
|
|
|
|
|
taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */
|
2008-11-20 20:01:55 +00:00
|
|
|
};
|
|
|
|
|
2010-08-26 18:49:16 +00:00
|
|
|
extern char *spa_config_path;
|
2008-12-03 20:09:06 +00:00
|
|
|
|
2013-05-06 19:24:30 +00:00
|
|
|
extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
|
|
|
|
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
|
2013-05-03 21:17:21 +00:00
|
|
|
extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q,
|
|
|
|
task_func_t *func, void *arg, uint_t flags);
|
|
|
|
|
2013-05-06 19:24:30 +00:00
|
|
|
|
2008-11-20 20:01:55 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_SPA_IMPL_H */
|