OpenZFS 9080 - recursive enter of vdev_indirect_rwlock from vdev_indirect_remap()

Authored by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://illumos.org/issues/9080
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/bdfded42e6
Closes #6900
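
Background: vdev_indirect_remap() used to hold vdev_indirect_rwlock as reader for the duration of a remap, while its callback may call vdev_indirect_remap() again for the same vdev, re-entering the same non-recursive rwlock. A minimal userspace sketch of that hazard, using pthreads (illustrative only; none of the names below are ZFS code):

#include <pthread.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static void
remap(int depth)
{
	pthread_rwlock_rdlock(&lock);
	/*
	 * If a writer blocks on `lock` at this point, the recursive
	 * rdlock below can deadlock: writer-preferring implementations
	 * queue the second read acquisition behind the waiting writer,
	 * which itself waits for this thread's first read hold.
	 */
	if (depth > 0)
		remap(depth - 1);	/* recursive read enter */
	pthread_rwlock_unlock(&lock);
}

int
main(void)
{
	remap(1);
	return (0);
}
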
@@ -14,7 +14,7 @@
  */
 /*
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -944,6 +944,57 @@ rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
 	return (rs);
 }
 
+/*
+ * Given an indirect vdev and an extent on that vdev, this function
+ * duplicates the physical entries of the indirect mapping that correspond
+ * to the extent to a new array and returns a pointer to it. In addition,
+ * copied_entries is populated with the number of mapping entries that
+ * were duplicated.
+ *
+ * Note that the function assumes that the caller holds vdev_indirect_rwlock.
+ * This ensures that the mapping won't change due to condensing as we
+ * copy over its contents.
+ *
+ * Finally, since we are doing an allocation, it is up to the caller to
+ * free the array allocated in this function.
+ */
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
+    uint64_t asize, uint64_t *copied_entries)
+{
+	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	uint64_t entries = 0;
+
+	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
+
+	vdev_indirect_mapping_entry_phys_t *first_mapping =
+	    vdev_indirect_mapping_entry_for_offset(vim, offset);
+	ASSERT3P(first_mapping, !=, NULL);
+
+	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
+	while (asize > 0) {
+		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+
+		ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
+		ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+
+		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
+		uint64_t inner_size = MIN(asize, size - inner_offset);
+
+		offset += inner_size;
+		asize -= inner_size;
+		entries++;
+		m++;
+	}
+
+	size_t copy_length = entries * sizeof (*first_mapping);
+	duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
+	bcopy(first_mapping, duplicate_mappings, copy_length);
+	*copied_entries = entries;
+
+	return (duplicate_mappings);
+}
+
 /*
  * Goes through the relevant indirect mappings until it hits a concrete vdev
  * and issues the callback. On the way to the concrete vdev, if any other
@@ -984,24 +1035,41 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
 	    rs != NULL; rs = list_remove_head(&stack)) {
 		vdev_t *v = rs->rs_vd;
+		uint64_t num_entries = 0;
 
-		/*
-		 * Note: this can be called from open context
-		 * (eg. zio_read()), so we need the rwlock to prevent
-		 * the mapping from being changed by condensing.
-		 */
-		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
-		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
-		ASSERT3P(vim, !=, NULL);
-
 		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 		ASSERT(rs->rs_asize > 0);
 
-		vdev_indirect_mapping_entry_phys_t *mapping =
-		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
-		ASSERT3P(mapping, !=, NULL);
+		/*
+		 * Note: As this function can be called from open context
+		 * (e.g. zio_read()), we need the following rwlock to
+		 * prevent the mapping from being changed by condensing.
+		 *
+		 * So we grab the lock and we make a copy of the entries
+		 * that are relevant to the extent that we are working on.
+		 * Once that is done, we drop the lock and iterate over
+		 * our copy of the mapping. Once we are done with the
+		 * remap segment and we free it, we also free our copy
+		 * of the indirect mapping entries that are relevant to it.
+		 *
+		 * This way we don't need to wait until the function is
+		 * finished with a segment in order to condense it. In
+		 * addition, we don't need a recursive rwlock for the case
+		 * that a call to vdev_indirect_remap() needs to call itself
+		 * (through the codepath of its callback) for the same vdev
+		 * in the middle of its execution.
+		 */
+		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
+		ASSERT3P(v->vdev_indirect_mapping, !=, NULL);
 
-		while (rs->rs_asize > 0) {
+		vdev_indirect_mapping_entry_phys_t *mapping =
+		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
+		    rs->rs_offset, rs->rs_asize, &num_entries);
+		ASSERT3P(mapping, !=, NULL);
+		ASSERT3U(num_entries, >, 0);
+		rw_exit(&v->vdev_indirect_rwlock);
+
+		for (uint64_t i = 0; i < num_entries; i++) {
 			/*
 			 * Note: the vdev_indirect_mapping can not change
 			 * while we are running. It only changes while the
@@ -1010,20 +1078,23 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
 			 * function is only called for frees, which also only
 			 * happen from syncing context.
 			 */
+			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
 
-			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
-			uint64_t dst_offset =
-			    DVA_GET_OFFSET(&mapping->vimep_dst);
-			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);
+			ASSERT3P(m, !=, NULL);
+			ASSERT3U(rs->rs_asize, >, 0);
+
+			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
 
 			ASSERT3U(rs->rs_offset, >=,
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
+			    DVA_MAPPING_GET_SRC_OFFSET(m));
 			ASSERT3U(rs->rs_offset, <,
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
+			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
 			ASSERT3U(dst_vdev, !=, v->vdev_id);
 
 			uint64_t inner_offset = rs->rs_offset -
-			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
+			    DVA_MAPPING_GET_SRC_OFFSET(m);
 			uint64_t inner_size =
 			    MIN(rs->rs_asize, size - inner_offset);
@@ -1064,10 +1135,10 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
 			rs->rs_offset += inner_size;
 			rs->rs_asize -= inner_size;
 			rs->rs_split_offset += inner_size;
-			mapping++;
 		}
+		VERIFY0(rs->rs_asize);
 
-		rw_exit(&v->vdev_indirect_rwlock);
+		kmem_free(mapping, num_entries * sizeof (*mapping));
 		kmem_free(rs, sizeof (remap_segment_t));
 	}
 
 	list_destroy(&stack);
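
The core of the fix is the copy-then-drop pattern: duplicate the relevant mapping entries while the read lock is held, release the lock, and only then iterate. A minimal userspace sketch of the same idea (pthreads; entry_t, shared_map, map_len, map_lock, and process() are hypothetical stand-ins, not ZFS APIs):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

typedef struct { int payload; } entry_t;	/* hypothetical entry type */

static entry_t shared_map[128];			/* hypothetical shared state */
static size_t map_len = 128;
static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

static void
process(entry_t *e)				/* may re-enter remap_all() */
{
	(void) e;
}

static void
remap_all(void)
{
	/* Duplicate the entries we need while the read lock is held... */
	pthread_rwlock_rdlock(&map_lock);
	size_t n = map_len;
	entry_t *copy = malloc(n * sizeof (entry_t));
	if (copy == NULL) {
		pthread_rwlock_unlock(&map_lock);
		return;
	}
	memcpy(copy, shared_map, n * sizeof (entry_t));
	pthread_rwlock_unlock(&map_lock);

	/*
	 * ...then iterate over the private copy with the lock dropped:
	 * recursion through process() never re-enters map_lock, and a
	 * writer is never blocked behind this loop.
	 */
	for (size_t i = 0; i < n; i++)
		process(&copy[i]);

	free(copy);
}

With the lock dropped during iteration, a re-entrant call never takes map_lock recursively, and a condensing writer only has to wait for the short copy, not for the whole traversal.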