zfs/include/sys/metaslab.h

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

#ifndef _SYS_METASLAB_H
#define	_SYS_METASLAB_H

#include <sys/spa.h>
#include <sys/space_map.h>
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/avl.h>

#ifdef	__cplusplus
extern "C" {
#endif

/*
 * Pointer to a space_map_ops_t; only declared here, the definition lives
 * in another translation unit.
 * NOTE(review): presumably the default ops vector handed to
 * metaslab_class_create() -- confirm against callers.
 */
extern space_map_ops_t *zfs_metaslab_ops;

/*
 * Per-metaslab entry points (declarations only; bodies live elsewhere).
 *
 * metaslab_init() takes a metaslab group, a space map object, a start and
 * a size, and a txg, and returns a metaslab_t pointer; metaslab_fini()
 * accepts that pointer type.  metaslab_sync() and metaslab_sync_done()
 * each operate on a single metaslab for a given txg, whereas
 * metaslab_sync_reassess() takes a whole metaslab group.
 * NOTE(review): start/size presumably describe the region covered by the
 * metaslab -- confirm against the implementation.
 */
extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg);
extern void metaslab_fini(metaslab_t *msp);
extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
extern void metaslab_sync_reassess(metaslab_group_t *mg);

/*
 * Allocation flags.  The non-zero values are distinct bits and can
 * therefore be OR-ed together; METASLAB_HINTBP_FAVOR is 0, i.e. it is the
 * value seen whenever METASLAB_HINTBP_AVOID is not set.
 *
 * Per the change description that introduced it, METASLAB_FASTWRITE makes
 * the allocator choose the vdev with the fewest pending synchronous
 * writes.
 *
 * NOTE(review): presumably these are the values accepted by the "flags"
 * argument of metaslab_alloc() -- confirm against callers.
 */
#define	METASLAB_HINTBP_FAVOR	0x0
#define	METASLAB_HINTBP_AVOID	0x1
#define	METASLAB_GANG_HEADER	0x2
#define	METASLAB_GANG_CHILD	0x4
#define	METASLAB_GANG_AVOID	0x8
#define	METASLAB_FASTWRITE	0x10

/*
 * Block allocation interfaces.  Each one takes the spa and a block
 * pointer; metaslab_free() and the fastwrite functions take the block
 * pointer as const.
 *
 * metaslab_alloc() and metaslab_claim() return int.
 * NOTE(review): presumably 0 on success and an errno value on failure --
 * confirm against the implementation.
 *
 * Per the change description that introduced them,
 * metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() manually
 * increase and decrease the fastwrite counters, respectively.  There is no
 * per-BP tracking of fastwrite state, so leaks and double unmarks are
 * possible; use with caution.  metaslab_alloc() also marks implicitly when
 * called with the FASTWRITE flag.
 */
extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
    boolean_t now);
extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);

/*
 * Metaslab class interfaces.
 *
 * metaslab_class_create() takes a spa and a space_map_ops_t pointer and
 * returns a metaslab_class_t pointer, which the remaining functions accept.
 * metaslab_class_validate() returns int.
 * NOTE(review): presumably 0 means the class is valid -- confirm against
 * the implementation.
 */
extern metaslab_class_t *metaslab_class_create(spa_t *spa,
    space_map_ops_t *ops);
extern void metaslab_class_destroy(metaslab_class_t *mc);
extern int metaslab_class_validate(metaslab_class_t *mc);

/*
 * Class space accounting.  metaslab_class_space_update() takes four signed
 * deltas (alloc, defer, space, dspace); each metaslab_class_get_*()
 * function returns an unsigned 64-bit value.
 * NOTE(review): presumably the getters report the totals that the deltas
 * are applied to -- confirm against the implementation.
 */
extern void metaslab_class_space_update(metaslab_class_t *mc,
    int64_t alloc_delta, int64_t defer_delta,
    int64_t space_delta, int64_t dspace_delta);
extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);

/*
 * Metaslab group interfaces.
 *
 * metaslab_group_create() takes a metaslab class and a vdev and returns a
 * metaslab_group_t pointer, which the remaining functions accept.
 * NOTE(review): activate/passivate presumably add the group to, and remove
 * it from, its class's set of groups eligible for allocation -- confirm
 * against the implementation.
 */
extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
    vdev_t *vd);
extern void metaslab_group_destroy(metaslab_group_t *mg);
extern void metaslab_group_activate(metaslab_group_t *mg);
extern void metaslab_group_passivate(metaslab_group_t *mg);

#ifdef	__cplusplus
}
#endif

#endif	/* _SYS_METASLAB_H */
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`/*`
			`* CDDL HEADER START`
			`*`
			`* The contents of this file are subject to the terms of the`
			`* Common Development and Distribution License (the "License").`
			`* You may not use this file except in compliance with the License.`
			`*`
			`* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE`
			`* or http://www.opensolaris.org/os/licensing.`
			`* See the License for the specific language governing permissions`
			`* and limitations under the License.`
			`*`
			`* When distributing Covered Code, include this CDDL HEADER in each`
			`* file and include the License file at usr/src/OPENSOLARIS.LICENSE.`
			`* If applicable, add the following below this CDDL HEADER, with the`
			`* fields enclosed by brackets "[]" replaced with your own identifying`
			`* information: Portions Copyright [yyyy] [name of copyright owner]`
			`*`
			`* CDDL HEADER END`
			`*/`
			`/*`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.`
Illumos #1051: zfs should handle imbalanced luns Today zfs tries to allocate blocks evenly across all devices. This means when devices are imbalanced zfs will use lots of CPU searching for space on devices which tend to be pretty full. It should instead fail quickly on the full LUNs and move onto devices which have more availability. Reviewed by: Eric Schrock <Eric.Schrock@delphix.com> Reviewed by: Matt Ahrens <Matt.Ahrens@delphix.com> Reviewed by: Adam Leventhal <Adam.Leventhal@delphix.com> Reviewed by: Albert Lee <trisk@nexenta.com> Reviewed by: Gordon Ross <gwr@nexenta.com> Approved by: Garrett D'Amore <garrett@nexenta.com> References to Illumos issue and patch: - https://www.illumos.org/issues/510 - https://github.com/illumos/illumos-gate/commit/5ead3ed965 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #340 2011-07-26 19:08:52 +00:00			`* Copyright (c) 2011 by Delphix. All rights reserved.`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`*/`

			`#ifndef _SYS_METASLAB_H`
			`#define _SYS_METASLAB_H`

			`#include <sys/spa.h>`
			`#include <sys/space_map.h>`
			`#include <sys/txg.h>`
			`#include <sys/zio.h>`
			`#include <sys/avl.h>`

			`#ifdef __cplusplus`
			`extern "C" {`
			`#endif`

Rebase master to b117 2009-07-02 22:44:48 +00:00			`extern space_map_ops_t *zfs_metaslab_ops;`

Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,`
			`uint64_t start, uint64_t size, uint64_t txg);`
			`extern void metaslab_fini(metaslab_t *msp);`
			`extern void metaslab_sync(metaslab_t *msp, uint64_t txg);`
			`extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`extern void metaslab_sync_reassess(metaslab_group_t *mg);`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00
Rebase to OpenSolaris b103, in the process we are removing any code which did not originate from the OpenSolaris source. These changes will be reintroduced in topic branches for easier tracking 2008-12-03 20:09:06 +00:00			`#define METASLAB_HINTBP_FAVOR 0x0`
			`#define METASLAB_HINTBP_AVOID 0x1`
			`#define METASLAB_GANG_HEADER 0x2`
Illumos #1051: zfs should handle imbalanced luns Today zfs tries to allocate blocks evenly across all devices. This means when devices are imbalanced zfs will use lots of CPU searching for space on devices which tend to be pretty full. It should instead fail quickly on the full LUNs and move onto devices which have more availability. Reviewed by: Eric Schrock <Eric.Schrock@delphix.com> Reviewed by: Matt Ahrens <Matt.Ahrens@delphix.com> Reviewed by: Adam Leventhal <Adam.Leventhal@delphix.com> Reviewed by: Albert Lee <trisk@nexenta.com> Reviewed by: Gordon Ross <gwr@nexenta.com> Approved by: Garrett D'Amore <garrett@nexenta.com> References to Illumos issue and patch: - https://www.illumos.org/issues/510 - https://github.com/illumos/illumos-gate/commit/5ead3ed965 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #340 2011-07-26 19:08:52 +00:00			`#define METASLAB_GANG_CHILD 0x4`
			`#define METASLAB_GANG_AVOID 0x8`
Add FASTWRITE algorithm for synchronous writes. Currently, ZIL blocks are spread over vdevs using hint block pointers managed by the ZIL commit code and passed to metaslab_alloc(). Spreading log blocks across vdevs is important for performance: indeed, using multiple disks in parallel decreases the ZIL commit latency, which is the main performance metric for synchronous writes. However, the current implementation suffers from the following issues: 1) It would be best if the ZIL module was not aware of such low-level details. They should be handled by the ZIO and metaslab modules; 2) Because the hint block pointer is managed per log, simultaneous commits from multiple logs might use the same vdevs at the same time, which is inefficient; 3) Because dmu_write() does not honor the block pointer hint, indirect writes are not spread. The naive solution of rotating the metaslab rotor each time a block is allocated for the ZIL or dmu_sync() doesn't work in practice because the first ZIL block to be written is actually allocated during the previous commit. Consequently, when metaslab_alloc() decides the vdev for this block, it will do so while a bunch of other allocations are happening at the same time (from dmu_sync() and other ZILs). This means the vdev for this block is chosen more or less at random. When the next commit happens, there is a high chance (especially when the number of blocks per commit is slightly less than the number of the disks) that one disk will have to write two blocks (with a potential seek) while other disks are sitting idle, which defeats spreading and increases the commit latency. This commit introduces a new concept in the metaslab allocator: fastwrites. Basically, each top-level vdev maintains a counter indicating the number of synchronous writes (from dmu_sync() and the ZIL) which have been allocated but not yet completed. 
When the metaslab is called with the FASTWRITE flag, it will choose the vdev with the least amount of pending synchronous writes. If there are multiple vdevs with the same value, the first matching vdev (starting from the rotor) is used. Once metaslab_alloc() has decided which vdev the block is allocated to, it updates the fastwrite counter for this vdev. The rationale goes like this: when an allocation is done with FASTWRITE, it "reserves" the vdev until the data is written. Until then, all future allocations will naturally avoid this vdev, even after a full rotation of the rotor. As a result, pending synchronous writes at a given point in time will be nicely spread over all vdevs. This contrasts with the previous algorithm, which is based on the implicit assumption that blocks are written instantaneously after they're allocated. metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to manually increase or decrease fastwrite counters, respectively. They should be used with caution, as there is no per-BP tracking of fastwrite information, so leaks and "double-unmarks" are possible. There is, however, an assert in the vdev teardown code which will fire if the fastwrite counters are not zero when the pool is exported or the vdev removed. Note that as stated above, marking is also done implicitly by metaslab_alloc(). ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to the metaslab when allocating (assuming ZIO does the allocation, which is only true in the case of dmu_sync). This flag will also trigger an unmark when zio_done() fires. A side-effect of the new algorithm is that when a ZIL stops being used, its last block can stay in the pending state (allocated but not yet written) for a long time, polluting the fastwrite counters. To avoid that, I've implemented a somewhat crude but working solution which unmarks these pending blocks in zil_sync(), thus guaranteeing that lingering fastwrites will get pruned at each sync event. 
The best performance improvements are observed with pools using a large number of top-level vdevs and heavy synchronous write workflows (especially indirect writes and concurrent writes from multiple ZILs). Real-life testing shows a 200% to 300% performance increase with indirect writes and various commit sizes. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1013 2012-06-27 13:20:20 +00:00			`#define METASLAB_FASTWRITE 0x10`
Rebase to OpenSolaris b103, in the process we are removing any code which did not originate from the OpenSolaris source. These changes will be reintroduced in topic branches for easier tracking 2008-12-03 20:09:06 +00:00
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,`
Rebase to OpenSolaris b103, in the process we are removing any code which did not originate from the OpenSolaris source. These changes will be reintroduced in topic branches for easier tracking 2008-12-03 20:09:06 +00:00			`blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,`
			`boolean_t now);`
			`extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);`
Add FASTWRITE algorithm for synchronous writes. Currently, ZIL blocks are spread over vdevs using hint block pointers managed by the ZIL commit code and passed to metaslab_alloc(). Spreading log blocks across vdevs is important for performance: indeed, using multiple disks in parallel decreases the ZIL commit latency, which is the main performance metric for synchronous writes. However, the current implementation suffers from the following issues: 1) It would be best if the ZIL module was not aware of such low-level details. They should be handled by the ZIO and metaslab modules; 2) Because the hint block pointer is managed per log, simultaneous commits from multiple logs might use the same vdevs at the same time, which is inefficient; 3) Because dmu_write() does not honor the block pointer hint, indirect writes are not spread. The naive solution of rotating the metaslab rotor each time a block is allocated for the ZIL or dmu_sync() doesn't work in practice because the first ZIL block to be written is actually allocated during the previous commit. Consequently, when metaslab_alloc() decides the vdev for this block, it will do so while a bunch of other allocations are happening at the same time (from dmu_sync() and other ZILs). This means the vdev for this block is chosen more or less at random. When the next commit happens, there is a high chance (especially when the number of blocks per commit is slightly less than the number of the disks) that one disk will have to write two blocks (with a potential seek) while other disks are sitting idle, which defeats spreading and increases the commit latency. This commit introduces a new concept in the metaslab allocator: fastwrites. Basically, each top-level vdev maintains a counter indicating the number of synchronous writes (from dmu_sync() and the ZIL) which have been allocated but not yet completed. 
When the metaslab is called with the FASTWRITE flag, it will choose the vdev with the least amount of pending synchronous writes. If there are multiple vdevs with the same value, the first matching vdev (starting from the rotor) is used. Once metaslab_alloc() has decided which vdev the block is allocated to, it updates the fastwrite counter for this vdev. The rationale goes like this: when an allocation is done with FASTWRITE, it "reserves" the vdev until the data is written. Until then, all future allocations will naturally avoid this vdev, even after a full rotation of the rotor. As a result, pending synchronous writes at a given point in time will be nicely spread over all vdevs. This contrasts with the previous algorithm, which is based on the implicit assumption that blocks are written instantaneously after they're allocated. metaslab_fastwrite_mark() and metaslab_fastwrite_unmark() are used to manually increase or decrease fastwrite counters, respectively. They should be used with caution, as there is no per-BP tracking of fastwrite information, so leaks and "double-unmarks" are possible. There is, however, an assert in the vdev teardown code which will fire if the fastwrite counters are not zero when the pool is exported or the vdev removed. Note that as stated above, marking is also done implicitly by metaslab_alloc(). ZIO also got a new FASTWRITE flag; when it is used, ZIO will pass it to the metaslab when allocating (assuming ZIO does the allocation, which is only true in the case of dmu_sync). This flag will also trigger an unmark when zio_done() fires. A side-effect of the new algorithm is that when a ZIL stops being used, its last block can stay in the pending state (allocated but not yet written) for a long time, polluting the fastwrite counters. To avoid that, I've implemented a somewhat crude but working solution which unmarks these pending blocks in zil_sync(), thus guaranteeing that lingering fastwrites will get pruned at each sync event. 
The best performance improvements are observed with pools using a large number of top-level vdevs and heavy synchronous write workflows (especially indirect writes and concurrent writes from multiple ZILs). Real-life testing shows a 200% to 300% performance increase with indirect writes and various commit sizes. Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1013 2012-06-27 13:20:20 +00:00			`extern void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);`
			`extern void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`extern metaslab_class_t *metaslab_class_create(spa_t *spa,`
			`space_map_ops_t *ops);`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00			`extern void metaslab_class_destroy(metaslab_class_t *mc);`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`extern int metaslab_class_validate(metaslab_class_t *mc);`

			`extern void metaslab_class_space_update(metaslab_class_t *mc,`
			`int64_t alloc_delta, int64_t defer_delta,`
			`int64_t space_delta, int64_t dspace_delta);`
			`extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);`
			`extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);`
			`extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);`
			`extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00
			`extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,`
			`vdev_t *vd);`
			`extern void metaslab_group_destroy(metaslab_group_t *mg);`
Update core ZFS code from build 121 to build 141. 2010-05-28 20:45:14 +00:00			`extern void metaslab_group_activate(metaslab_group_t *mg);`
			`extern void metaslab_group_passivate(metaslab_group_t *mg);`
Initial Linux ZFS GIT Repo 2008-11-20 20:01:55 +00:00
			`#ifdef __cplusplus`
			`}`
			`#endif`

			`#endif /* _SYS_METASLAB_H */`