3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
4080 zpool clear fails to clear pool
4081 need zfs_mg_noalloc_threshold
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

References:
  https://www.illumos.org/issues/3954
  https://www.illumos.org/issues/4080
  https://www.illumos.org/issues/4081
  illumos/illumos-gate@22e30981d8

Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775
This commit is contained in:
George Wilson 2013-08-29 10:56:49 -08:00 committed by Brian Behlendorf
parent a169a625a6
commit ac72fac3ea
4 changed files with 117 additions and 8 deletions

View File

@ -24,7 +24,7 @@
*/ */
/* /*
* Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_METASLAB_IMPL_H #ifndef _SYS_METASLAB_IMPL_H
@ -45,6 +45,7 @@ struct metaslab_class {
metaslab_group_t *mc_rotor; metaslab_group_t *mc_rotor;
space_map_ops_t *mc_ops; space_map_ops_t *mc_ops;
uint64_t mc_aliquot; uint64_t mc_aliquot;
uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */ uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_space; /* total space (alloc + free) */
@ -58,6 +59,8 @@ struct metaslab_group {
uint64_t mg_aliquot; uint64_t mg_aliquot;
uint64_t mg_bonus_area; uint64_t mg_bonus_area;
uint64_t mg_alloc_failures; uint64_t mg_alloc_failures;
boolean_t mg_allocatable; /* can we allocate? */
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias; int64_t mg_bias;
int64_t mg_activation_count; int64_t mg_activation_count;
metaslab_class_t *mg_class; metaslab_class_t *mg_class;

View File

@ -60,9 +60,25 @@ int zfs_condense_pct = 200;
/* /*
* This value defines the number of allowed allocation failures per vdev. * This value defines the number of allowed allocation failures per vdev.
* If a device reaches this threshold in a given txg then we consider skipping * If a device reaches this threshold in a given txg then we consider skipping
* allocations on that device. * allocations on that device. The value of zfs_mg_alloc_failures is computed
* in zio_init() unless it has been overridden in /etc/system.
*/ */
int zfs_mg_alloc_failures; int zfs_mg_alloc_failures = 0;
/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
* groups are allowed to accept allocations. Gang blocks are always
* eligible to allocate on any metaslab group. The default value of 0 means
* no metaslab group will be excluded based on this criterion.
*/
int zfs_mg_noalloc_threshold = 0;
/* /*
* Metaslab debugging: when set, keeps all space maps in core to verify frees. * Metaslab debugging: when set, keeps all space maps in core to verify frees.
@ -223,6 +239,53 @@ metaslab_compare(const void *x1, const void *x2)
return (0); return (0);
} }
/*
 * Update the metaslab group's free capacity and its allocatable flag.
 * The allocatable flag is set to B_TRUE when the group's free capacity
 * is greater than zfs_mg_noalloc_threshold (i.e. the group still has
 * enough free space to accept allocations). If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then
 * the metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
metaslab_class_t *mc = mg->mg_class;
vdev_stat_t *vs = &vd->vdev_stat;
boolean_t was_allocatable;
ASSERT(vd == vd->vdev_top);
mutex_enter(&mg->mg_lock);
was_allocatable = mg->mg_allocatable;
/* Percentage of free space; the +1 guards against division by zero. */
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
(vs->vs_space + 1);
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
/*
 * The mc_alloc_groups maintains a count of the number of
 * groups in this metaslab class that are still above the
 * zfs_mg_noalloc_threshold. This is used by the allocating
 * threads to determine if they should avoid allocations to
 * a given group. The allocator will avoid allocations to a group
 * if that group has reached or is below the zfs_mg_noalloc_threshold
 * and there are still other groups that are above the threshold.
 * When a group transitions from allocatable to non-allocatable or
 * vice versa we update the metaslab class to reflect that change.
 * When the mc_alloc_groups value drops to 0 that means that all
 * groups have reached the zfs_mg_noalloc_threshold making all groups
 * eligible for allocations. This effectively means that all devices
 * are balanced again.
 */
if (was_allocatable && !mg->mg_allocatable)
mc->mc_alloc_groups--;
else if (!was_allocatable && mg->mg_allocatable)
mc->mc_alloc_groups++;
mutex_exit(&mg->mg_lock);
}
metaslab_group_t * metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{ {
@ -273,6 +336,7 @@ metaslab_group_activate(metaslab_group_t *mg)
return; return;
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_rotor) == NULL) { if ((mgprev = mc->mc_rotor) == NULL) {
mg->mg_prev = mg; mg->mg_prev = mg;
@ -357,6 +421,29 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
mutex_exit(&mg->mg_lock); mutex_exit(&mg->mg_lock);
} }
/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity has dropped to or
 * below zfs_mg_noalloc_threshold and there is at least one other metaslab
 * group in the pool's normal class that can still handle allocations.
 * Returns B_TRUE when the group is eligible for allocations.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
/*
 * A metaslab group is considered allocatable if its free capacity
 * is greater than the set value of zfs_mg_noalloc_threshold, it's
 * not in the normal class (e.g. it's associated with a slog), or
 * there are no other metaslab groups with free capacity greater
 * than zfs_mg_noalloc_threshold (mc_alloc_groups == 0).
 */
return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
}
/* /*
* ========================================================================== * ==========================================================================
* Common allocator routines * Common allocator routines
@ -1301,6 +1388,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
} }
metaslab_group_alloc_update(mg);
/* /*
* If the map is loaded but no longer active, evict it as soon as all * If the map is loaded but no longer active, evict it as soon as all
* future allocations have synced. (If we unloaded it now and then * future allocations have synced. (If we unloaded it now and then
@ -1430,6 +1519,8 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
if (msp == NULL) if (msp == NULL)
return (-1ULL); return (-1ULL);
mutex_enter(&msp->ms_lock);
/* /*
* If we've already reached the allowable number of failed * If we've already reached the allowable number of failed
* allocation attempts on this metaslab group then we * allocation attempts on this metaslab group then we
@ -1446,11 +1537,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
"asize %llu, failures %llu", spa_name(spa), "asize %llu, failures %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg, mg, psize, asize, mg->mg_vd->vdev_id, txg, mg, psize, asize,
mg->mg_alloc_failures); mg->mg_alloc_failures);
mutex_exit(&msp->ms_lock);
return (-1ULL); return (-1ULL);
} }
mutex_enter(&msp->ms_lock);
/* /*
* Ensure that the metaslab we have selected is still * Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that * capable of handling our request. It's possible that
@ -1615,6 +1705,21 @@ top:
} else { } else {
allocatable = vdev_allocatable(vd); allocatable = vdev_allocatable(vd);
} }
/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging or have requested
* an allocation for the smallest gang block size
* then we don't want to avoid allocating to this
* metaslab group. If we're in this condition we should
* try to allocate from any device possible so that we
* don't inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
if (allocatable && CAN_FASTGANG(flags) &&
psize > SPA_GANGBLOCKSIZE)
allocatable = metaslab_group_allocatable(mg);
if (!allocatable) if (!allocatable)
goto next; goto next;

View File

@ -5351,7 +5351,7 @@ zfs_ioctl_init(void)
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);

View File

@ -227,6 +227,7 @@ zio_init(void)
* The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
* to fail 3 times per txg or 8 failures, whichever is greater. * to fail 3 times per txg or 8 failures, whichever is greater.
*/ */
if (zfs_mg_alloc_failures == 0)
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
zio_inject_init(); zio_inject_init();
@ -2518,7 +2519,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
if (error) { if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size, error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, NULL, new_bp, 1, txg, NULL,
METASLAB_FASTWRITE | METASLAB_GANG_AVOID); METASLAB_FASTWRITE);
} }
if (error == 0) { if (error == 0) {