fix abd_nr_pages_off for gang abd
`__vdev_disk_physio()` uses `abd_nr_pages_off()` to allocate a bio with a sufficient number of iovec's to process this zio (i.e. `nr_iovecs`/`bi_max_vecs`). If there are not enough iovec's in the bio, then additional bio's will be allocated. However, this is a sub-optimal code path. In particular, it requires several abd calls (to `abd_nr_pages_off()` and `abd_bio_map_off()`) which will have to walk the constituents of the ABD (the pages or the gang children) because they are looking for offsets > 0. For gang ABD's, `abd_nr_pages_off()` returns the number of iovec's needed for the first constituent, rather than the sum of all constituents (within the requested range). This always under-estimates the required number of iovec's, which causes us to always need several bio's. The end result is that `__vdev_disk_physio()` is usually O(n^2) for gang ABD's (and occasionally O(n^3), when more than 16 bio's are needed). This commit fixes `abd_nr_pages_off()`'s handling of gang ABD's, to correctly determine how many iovec's are needed, by adding up the number of iovec's for each of the gang children in the requested range. Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Brian Atkinson <batkinson@lanl.gov> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #11536
This commit is contained in:
parent
0ae184a6ba
commit
f8c0d7e1f6
|
@ -925,17 +925,28 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
|
|||
{
|
||||
unsigned long pos;
|
||||
|
||||
while (abd_is_gang(abd))
|
||||
abd = abd_gang_get_offset(abd, &off);
|
||||
if (abd_is_gang(abd)) {
|
||||
unsigned long count = 0;
|
||||
|
||||
for (abd_t *cabd = abd_gang_get_offset(abd, &off);
|
||||
cabd != NULL && size != 0;
|
||||
cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
|
||||
ASSERT3U(off, <, cabd->abd_size);
|
||||
int mysize = MIN(size, cabd->abd_size - off);
|
||||
count += abd_nr_pages_off(cabd, mysize, off);
|
||||
size -= mysize;
|
||||
off = 0;
|
||||
}
|
||||
return (count);
|
||||
}
|
||||
|
||||
ASSERT(!abd_is_gang(abd));
|
||||
if (abd_is_linear(abd))
|
||||
pos = (unsigned long)abd_to_buf(abd) + off;
|
||||
else
|
||||
pos = ABD_SCATTER(abd).abd_offset + off;
|
||||
|
||||
return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
|
||||
(pos >> PAGE_SHIFT);
|
||||
return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
|
||||
(pos >> PAGE_SHIFT));
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
|
@ -1010,7 +1021,6 @@ unsigned int
|
|||
abd_bio_map_off(struct bio *bio, abd_t *abd,
|
||||
unsigned int io_size, size_t off)
|
||||
{
|
||||
int i;
|
||||
struct abd_iter aiter;
|
||||
|
||||
ASSERT3U(io_size, <=, abd->abd_size - off);
|
||||
|
@ -1024,7 +1034,7 @@ abd_bio_map_off(struct bio *bio, abd_t *abd,
|
|||
abd_iter_init(&aiter, abd);
|
||||
abd_iter_advance(&aiter, off);
|
||||
|
||||
for (i = 0; i < bio->bi_max_vecs; i++) {
|
||||
for (int i = 0; i < bio->bi_max_vecs; i++) {
|
||||
struct page *pg;
|
||||
size_t len, sgoff, pgoff;
|
||||
struct scatterlist *sg;
|
||||
|
|
|
@ -350,19 +350,14 @@ vdev_disk_close(vdev_t *v)
|
|||
static dio_request_t *
|
||||
vdev_disk_dio_alloc(int bio_count)
|
||||
{
|
||||
dio_request_t *dr;
|
||||
int i;
|
||||
|
||||
dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
|
||||
sizeof (struct bio *) * bio_count, KM_SLEEP);
|
||||
if (dr) {
|
||||
atomic_set(&dr->dr_ref, 0);
|
||||
dr->dr_bio_count = bio_count;
|
||||
dr->dr_error = 0;
|
||||
atomic_set(&dr->dr_ref, 0);
|
||||
dr->dr_bio_count = bio_count;
|
||||
dr->dr_error = 0;
|
||||
|
||||
for (i = 0; i < dr->dr_bio_count; i++)
|
||||
dr->dr_bio[i] = NULL;
|
||||
}
|
||||
for (int i = 0; i < dr->dr_bio_count; i++)
|
||||
dr->dr_bio[i] = NULL;
|
||||
|
||||
return (dr);
|
||||
}
|
||||
|
@ -536,8 +531,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
|||
dio_request_t *dr;
|
||||
uint64_t abd_offset;
|
||||
uint64_t bio_offset;
|
||||
int bio_size, bio_count = 16;
|
||||
int i = 0, error = 0;
|
||||
int bio_size;
|
||||
int bio_count = 16;
|
||||
int error = 0;
|
||||
struct blk_plug plug;
|
||||
|
||||
/*
|
||||
|
@ -552,8 +548,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
|
|||
|
||||
retry:
|
||||
dr = vdev_disk_dio_alloc(bio_count);
|
||||
if (dr == NULL)
|
||||
return (SET_ERROR(ENOMEM));
|
||||
|
||||
if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
|
||||
bio_set_flags_failfast(bdev, &flags);
|
||||
|
@ -561,26 +555,28 @@ retry:
|
|||
dr->dr_zio = zio;
|
||||
|
||||
/*
|
||||
* When the IO size exceeds the maximum bio size for the request
|
||||
* queue we are forced to break the IO in multiple bio's and wait
|
||||
* for them all to complete. Ideally, all pool users will set
|
||||
* their volume block size to match the maximum request size and
|
||||
* the common case will be one bio per vdev IO request.
|
||||
* Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
|
||||
* is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
|
||||
* can cover at least 128KB and at most 1MB. When the required number
|
||||
* of iovec's exceeds this, we are forced to break the IO in multiple
|
||||
* bio's and wait for them all to complete. This is likely if the
|
||||
* recordsize property is increased beyond 1MB. The default
|
||||
* bio_count=16 should typically accommodate the maximum-size zio of
|
||||
* 16MB.
|
||||
*/
|
||||
|
||||
abd_offset = 0;
|
||||
bio_offset = io_offset;
|
||||
bio_size = io_size;
|
||||
for (i = 0; i <= dr->dr_bio_count; i++) {
|
||||
bio_size = io_size;
|
||||
for (int i = 0; i <= dr->dr_bio_count; i++) {
|
||||
|
||||
/* Finished constructing bio's for given buffer */
|
||||
if (bio_size <= 0)
|
||||
break;
|
||||
|
||||
/*
|
||||
* By default only 'bio_count' bio's per dio are allowed.
|
||||
* However, if we find ourselves in a situation where more
|
||||
* are needed we allocate a larger dio and warn the user.
|
||||
* If additional bio's are required, we have to retry, but
|
||||
* this should be rare - see the comment above.
|
||||
*/
|
||||
if (dr->dr_bio_count == i) {
|
||||
vdev_disk_dio_free(dr);
|
||||
|
@ -622,9 +618,10 @@ retry:
|
|||
blk_start_plug(&plug);
|
||||
|
||||
/* Submit all bio's associated with this dio */
|
||||
for (i = 0; i < dr->dr_bio_count; i++)
|
||||
for (int i = 0; i < dr->dr_bio_count; i++) {
|
||||
if (dr->dr_bio[i])
|
||||
vdev_submit_bio(dr->dr_bio[i]);
|
||||
}
|
||||
|
||||
if (dr->dr_bio_count > 1)
|
||||
blk_finish_plug(&plug);
|
||||
|
|
Loading…
Reference in New Issue