zfs/include/sys/zil_impl.h

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */

#ifndef	_SYS_ZIL_IMPL_H
#define	_SYS_ZIL_IMPL_H

#include <sys/zil.h>
#include <sys/dmu_objset.h>

#ifdef	__cplusplus
extern "C" {
#endif

/*
 * Log write buffer (lwb).
 *
 * Describes a single log block: its on-disk address (lwb_blk), the
 * in-memory buffer backing it (lwb_buf, lwb_sz bytes in size, of which
 * lwb_nused are in use), and the zio and tx associated with it.  Each lwb
 * points back at its zilog and is linked onto zilog->zl_lwb_list through
 * lwb_node.
 */
typedef struct lwb {
	zilog_t		*lwb_zilog;	/* back pointer to log struct */
	blkptr_t	lwb_blk;	/* on disk address of this log blk */
	boolean_t	lwb_fastwrite;	/* is blk marked for fastwrite? */
	int		lwb_nused;	/* # used bytes in buffer */
	int		lwb_sz;		/* size of block and buffer */
	char		*lwb_buf;	/* log write buffer */
	zio_t		*lwb_zio;	/* zio for this buffer */
	dmu_tx_t	*lwb_tx;	/* tx for log block allocation */
	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
	list_node_t	lwb_node;	/* zilog->zl_lwb_list linkage */
} lwb_t;

/*
 * Intent log transaction lists.
 *
 * Groups the two containers used to hold itxs: a plain list for the
 * synchronous ones and an AVL tree for the asynchronous ones (see
 * itx_async_node_t for the per-file node type stored in that tree).
 */
typedef struct itxs {
	list_t		i_sync_list;	/* list of synchronous itxs */
	avl_tree_t	i_async_tree;	/* tree of foids for async itxs */
} itxs_t;

/*
 * Per-txg chain of intent log transactions.  struct zilog embeds TXG_SIZE
 * of these in zl_itxg[].  itxg_itxs is held by pointer; who allocates and
 * frees it is not visible in this header.
 */
typedef struct itxg {
	kmutex_t	itxg_lock;	/* lock for this structure */
	uint64_t	itxg_txg;	/* txg for this chain */
	uint64_t	itxg_sod;	/* total size on disk for this txg */
	itxs_t		*itxg_itxs;	/* sync and async itxs */
} itxg_t;

/*
 * For async itxs we build up an AVL tree of lists of async itxs, one tree
 * node per file.  Presumably these are the nodes of itxs_t's i_async_tree,
 * ordered by ia_foid -- the comparator lives outside this header; confirm
 * against the tree's creation site.
 */
typedef struct itx_async_node {
	uint64_t	ia_foid;	/* file object id */
	list_t		ia_list;	/* list of async itxs for this foid */
	avl_node_t	ia_node;	/* AVL tree linkage */
} itx_async_node_t;

/*
 * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
 * we've touched so we know which ones need a write cache flush at the end.
 *
 * The tree itself is zilog's zl_vdev_tree, which is protected by
 * zl_vdev_lock; each touched vdev is represented by one of these nodes.
 */
typedef struct zil_vdev_node {
	uint64_t	zv_vdev;	/* vdev to be flushed */
	avl_node_t	zv_node;	/* AVL tree linkage */
} zil_vdev_node_t;

/* Number of slots in zilog's zl_prev_blks[] array (indexed via zl_prev_rotor) */
#define	ZIL_PREV_BLKS 16

/*
 * Stable storage intent log management structure.  One per dataset.
 *
 * Locking, as stated by the field comments below: zl_lock protects most
 * fields; zl_vdev_tree has its own lock (zl_vdev_lock); each zl_itxg[]
 * entry carries its own itxg_lock; zl_dirty_link is protected by the
 * dp_dirty_zilogs list it is linked on.
 */
struct zilog {
	kmutex_t	zl_lock;	/* protects most zilog_t fields */
	struct dsl_pool	*zl_dmu_pool;	/* DSL pool */
	spa_t		*zl_spa;	/* handle for read/write log */
	const zil_header_t *zl_header;	/* log header buffer (read-only here) */
	objset_t	*zl_os;		/* object set we're logging */
	zil_get_data_t	*zl_get_data;	/* callback to get object content */
	zio_t		*zl_root_zio;	/* log writer root zio */
	uint64_t	zl_lr_seq;	/* on-disk log record sequence number */
	uint64_t	zl_commit_lr_seq; /* last committed on-disk lr seq */
	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
	uint64_t	zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
	uint64_t	zl_replaying_seq; /* current replay seq number */
	uint32_t	zl_suspend;	/* log suspend count */
	kcondvar_t	zl_cv_writer;	/* log writer thread completion */
	kcondvar_t	zl_cv_suspend;	/* log suspend completion */
	uint8_t		zl_suspending;	/* log is currently suspending */
	uint8_t		zl_keep_first;	/* keep first log block in destroy */
	uint8_t		zl_replay;	/* replaying records while set */
	uint8_t		zl_stop_sync;	/* for debugging */
	uint8_t		zl_writer;	/* boolean: write setup in progress */
	uint8_t		zl_logbias;	/* latency or throughput */
	uint8_t		zl_sync;	/* synchronous or asynchronous */
	int		zl_parse_error;	/* last zil_parse() error */
	uint64_t	zl_parse_blk_seq; /* highest blk seq on last parse */
	uint64_t	zl_parse_lr_seq; /* highest lr seq on last parse */
	uint64_t	zl_parse_blk_count; /* number of blocks parsed */
	uint64_t	zl_parse_lr_count; /* number of log records parsed */
	uint64_t	zl_next_batch;	/* next batch number */
	uint64_t	zl_com_batch;	/* committed batch number */
	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
	list_t		zl_itx_commit_list; /* itx list to be committed */
	uint64_t	zl_itx_list_sz;	/* total size of records on list */
	uint64_t	zl_cur_used;	/* current commit log size used */
	list_t		zl_lwb_list;	/* in-flight log write list (of lwb_t) */
	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
	avl_tree_t	zl_vdev_tree;	/* vdevs to flush in zil_commit() */
	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
	avl_tree_t	zl_bp_tree;	/* track bps during log parse */
	clock_t		zl_replay_time;	/* lbolt of when replay started */
	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
	zil_header_t	zl_old_header;	/* debugging aid */
	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
	uint_t		zl_prev_rotor;	/* rotor for zl_prev_blks[] */
	txg_node_t	zl_dirty_link;	/* protected by dp_dirty_zilogs list */
};

/*
 * AVL tree node recording a DVA.  Presumably these populate zilog's
 * zl_bp_tree ("track bps during log parse") with zn_dva as the key --
 * the insertion site and comparator are outside this header; confirm.
 */
typedef struct zil_bp_node {
	dva_t		zn_dva;		/* DVA being tracked */
	avl_node_t	zn_node;	/* AVL tree linkage */
} zil_bp_node_t;

/*
 * Bytes left in a maximum-size block (SPA_MAXBLOCKSIZE) after subtracting
 * one zil_chain_t and one lr_write_t.  Because of the sizeof operands the
 * expression has type size_t.
 */
#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
    sizeof (lr_write_t))

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_ZIL_IMPL_H */
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`/*`
			`* CDDL HEADER START`
			`*`
			`* The contents of this file are subject to the terms of the`
			`* Common Development and Distribution License (the "License").`
			`* You may not use this file except in compliance with the License.`
			`*`
			`* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE`
			`* or http://www.opensolaris.org/os/licensing.`
			`* See the License for the specific language governing permissions`
			`* and limitations under the License.`
			`*`
			`* When distributing Covered Code, include this CDDL HEADER in each`
			`* file and include the License file at usr/src/OPENSOLARIS.LICENSE.`
			`* If applicable, add the following below this CDDL HEADER, with the`
			`* fields enclosed by brackets "[]" replaced with your own identifying`
			`* information: Portions Copyright [yyyy] [name of copyright owner]`
			`*`
			`* CDDL HEADER END`
			`*/`
			`/*`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.`
Illumos #3086: unnecessarily setting DS_FLAG_INCONSISTENT on async 3086 unnecessarily setting DS_FLAG_INCONSISTENT on async destroyed datasets Reviewed by: Christopher Siden <chris.siden@delphix.com> Approved by: Eric Schrock <Eric.Schrock@delphix.com> References: illumos/illumos-gate@ce636f8b38e8c9ff484e880d9abb27251a882860 illumos changeset: 13776:cd512c80fd75 https://www.illumos.org/issues/3086 Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> 2012-12-15 00:13:40 +00:00			`* Copyright (c) 2012 by Delphix. All rights reserved.`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`*/`

Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`/* Portions Copyright 2010 Robert Milkowski */`

Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`#ifndef _SYS_ZIL_IMPL_H`
			`#define _SYS_ZIL_IMPL_H`

			`#include <sys/zil.h>`
			`#include <sys/dmu_objset.h>`

			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

			`/*`
			`* Log write buffer.`
			`*/`
			`typedef struct lwb {`
			`zilog_t *lwb_zilog; /* back pointer to log struct */`
			`blkptr_t lwb_blk; /* on disk address of this log blk */`
Add FASTWRITE algorithm for synchronous writes. Currently, ZIL blocks are spread over vdevs using hint block pointers managed by the ZIL commit code and passed to metaslab_alloc(). Spreading log blocks across vdevs is important for performance: indeed, using multiple disks in parallel decreases the ZIL commit latency, which is the main performance metric for synchronous writes. However, the current implementation suffers from the following issues: 1) It would be best if the ZIL module was not aware of such low-level details. They should be handled by the ZIO and metaslab modules; 2) Because the hint block pointer is managed per log, simultaneous commits from multiple logs might use the same vdevs at the same time, which is inefficient; 3) Because dmu_write() does not honor the block pointer hint, indirect writes are not spread. The naive solution of rotating the metaslab rotor each time a block is allocated for the ZIL or dmu_sync() doesn't work in practice because the first ZIL block to be written is actually allocated during the previous commit. Consequently, when metaslab_alloc() decides the vdev for this block, it will do so while a bunch of other allocations are happening at the same time (from dmu_sync() and other ZILs). This means the vdev for this block is chosen more or less at random. When the next commit happens, there is a high chance (especially when the number of blocks per commit is slightly less than the number of the disks) that one disk will have to write two blocks (with a potential seek) while other disks are sitting idle, which defeats spreading and increases the commit latency. This commit introduces a new concept in the metaslab allocator: fastwrites. Basically, each top-level vdev maintains a counter indicating the number of synchronous writes (from dmu_sync() and the ZIL) which have been allocated but not yet completed. 
When the metaslab is called with the FASTWRITE flag, it will choose the vdev with the least amount of pending synchronous writes. If there are multiple vdevs with the same value, the first matching vdev (starting from the rotor) is used. Once metaslab_alloc() has decided which vdev the block is allocated to, it updates the fastwrite counter for this vdev. The rationale goes like this: when an allocation is done with FASTWRITE, it "reserves" the vdev until the data is written. Until then, all future allocations will naturally avoid this vdev, even after a full rotation of the rotor. As a result, pending synchronous writes at a given point in time will be nicely spread over all vdevs. This contrasts with the previous algorithm, which is based on the implicit assumption that blocks are written instantaneously after they're allocated. metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to manually increase or decrease fastwrite counters, respectively. They should be used with caution, as there is no per-BP tracking of fastwrite information, so leaks and "double-unmarks" are possible. There is, however, an assert in the vdev teardown code which will fire if the fastwrite counters are not zero when the pool is exported or the vdev removed. Note that as stated above, marking is also done implicitly by metaslab_alloc(). ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to the metaslab when allocating (assuming ZIO does the allocation, which is only true in the case of dmu_sync). This flag will also trigger an unmark when zio_done() fires. A side-effect of the new algorithm is that when a ZIL stops being used, its last block can stay in the pending state (allocated but not yet written) for a long time, polluting the fastwrite counters. To avoid that, I've implemented a somewhat crude but working solution which unmarks these pending blocks in zil_sync(), thus guaranteeing that lingering fastwrites will get pruned at each sync event. 
The best performance improvements are observed with pools using a large number of top-level vdevs and heavy synchronous write workflows (especially indirect writes and concurrent writes from multiple ZILs). Real-life testing shows a 200% to 300% performance increase with indirect writes and various commit sizes. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1013 2012-06-27 13:20:20 +00:00			`boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`int lwb_nused; /* # used bytes in buffer */`
			`int lwb_sz; /* size of block and buffer */`
			`char *lwb_buf; /* log write buffer */`
			`zio_t *lwb_zio; /* zio for this buffer */`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`dmu_tx_t *lwb_tx; /* tx for log block allocation */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`uint64_t lwb_max_txg; /* highest txg in this lwb */`
			`list_node_t lwb_node; /* zilog->zl_lwb_list linkage */`
			`} lwb_t;`

Update to onnv_147 This is the last official OpenSolaris tag before the public development tree was closed. 2010-08-26 21:24:34 +00:00			`/*`
			`* Intent log transaction lists`
			`*/`
			`typedef struct itxs {`
			`list_t i_sync_list; /* list of synchronous itxs */`
			`avl_tree_t i_async_tree; /* tree of foids for async itxs */`
			`} itxs_t;`

			`typedef struct itxg {`
			`kmutex_t itxg_lock; /* lock for this structure */`
			`uint64_t itxg_txg; /* txg for this chain */`
			`uint64_t itxg_sod; /* total size on disk for this txg */`
			`itxs_t *itxg_itxs; /* sync and async itxs */`
			`} itxg_t;`

			`/* for async nodes we build up an AVL tree of lists of async itxs per file */`
			`typedef struct itx_async_node {`
			`uint64_t ia_foid; /* file object id */`
			`list_t ia_list; /* list of async itxs for this foid */`
			`avl_node_t ia_node; /* AVL tree linkage */`
			`} itx_async_node_t;`

Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`/*`
			`* Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs`
			`* we've touched so we know which ones need a write cache flush at the end.`
			`*/`
			`typedef struct zil_vdev_node {`
			`uint64_t zv_vdev; /* vdev to be flushed */`
			`avl_node_t zv_node; /* AVL tree linkage */`
			`} zil_vdev_node_t;`

Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`#define ZIL_PREV_BLKS 16`

Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`/*`
			`* Stable storage intent log management structure. One per dataset.`
			`*/`
			`struct zilog {`
			`kmutex_t zl_lock; /* protects most zilog_t fields */`
			`struct dsl_pool *zl_dmu_pool; /* DSL pool */`
			`spa_t *zl_spa; /* handle for read/write log */`
			`const zil_header_t *zl_header; /* log header buffer */`
			`objset_t *zl_os; /* object set we're logging */`
			`zil_get_data_t *zl_get_data; /* callback to get object content */`
			`zio_t *zl_root_zio; /* log writer root zio */`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`uint64_t zl_lr_seq; /* on-disk log record sequence number */`
			`uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`uint64_t zl_destroy_txg; /* txg of last zil_destroy() */`
Rebase master to b105 2009-01-15 21:59:39 +00:00			`uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */`
			`uint64_t zl_replaying_seq; /* current replay seq number */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`uint32_t zl_suspend; /* log suspend count */`
			`kcondvar_t zl_cv_writer; /* log writer thread completion */`
			`kcondvar_t zl_cv_suspend; /* log suspend completion */`
			`uint8_t zl_suspending; /* log is currently suspending */`
			`uint8_t zl_keep_first; /* keep first log block in destroy */`
Rebase master to b105 2009-01-15 21:59:39 +00:00			`uint8_t zl_replay; /* replaying records while set */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`uint8_t zl_stop_sync; /* for debugging */`
			`uint8_t zl_writer; /* boolean: write setup in progress */`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`uint8_t zl_logbias; /* latency or throughput */`
			`uint8_t zl_sync; /* synchronous or asynchronous */`
			`int zl_parse_error; /* last zil_parse() error */`
			`uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */`
			`uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */`
			`uint64_t zl_parse_blk_count; /* number of blocks parsed */`
			`uint64_t zl_parse_lr_count; /* number of log records parsed */`
Update to onnv_147 This is the last official OpenSolaris tag before the public development tree was closed. 2010-08-26 21:24:34 +00:00			`uint64_t zl_next_batch; /* next batch number */`
			`uint64_t zl_com_batch; /* committed batch number */`
			`kcondvar_t zl_cv_batch[2]; /* batch condition variables */`
			`itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */`
			`list_t zl_itx_commit_list; /* itx list to be committed */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`uint64_t zl_itx_list_sz; /* total size of records on list */`
			`uint64_t zl_cur_used; /* current commit log size used */`
			`list_t zl_lwb_list; /* in-flight log write list */`
			`kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */`
			`avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */`
			`taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`avl_tree_t zl_bp_tree; /* track bps during log parse */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`clock_t zl_replay_time; /* lbolt of when replay started */`
			`uint64_t zl_replay_blks; /* number of log blocks replayed */`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`zil_header_t zl_old_header; /* debugging aid */`
			`uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */`
			`uint_t zl_prev_rotor; /* rotor for zl_prev[] */`
Illumos #3086: unnecessarily setting DS_FLAG_INCONSISTENT on async 3086 unnecessarily setting DS_FLAG_INCONSISTENT on async destroyed datasets Reviewed by: Christopher Siden <chris.siden@delphix.com> Approved by: Eric Schrock <Eric.Schrock@delphix.com> References: illumos/illumos-gate@ce636f8b38e8c9ff484e880d9abb27251a882860 illumos changeset: 13776:cd512c80fd75 https://www.illumos.org/issues/3086 Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> 2012-12-15 00:13:40 +00:00			`txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`};`

Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`typedef struct zil_bp_node {`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`dva_t zn_dva;`
			`avl_node_t zn_node;`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`} zil_bp_node_t;`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \`
Rebase master to b117 2009-07-02 22:44:48 +00:00			`sizeof (lr_write_t))`

Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`#ifdef __cplusplus`
			`}`
			`#endif`

			`#endif /* _SYS_ZIL_IMPL_H */`