Fix abd_nr_pages_off() for gang ABDs

`__vdev_disk_physio()` uses `abd_nr_pages_off()` to allocate a bio with
a sufficient number of iovec's to process the zio (i.e.,
`nr_iovecs`/`bi_max_vecs`).  If there are not enough iovec's in the
bio, then additional bio's will be allocated.  However, this is a
sub-optimal code path.  In particular, it requires several abd calls
(to `abd_nr_pages_off()` and `abd_bio_map_off()`), each of which has to
walk the constituents of the ABD (the pages or the gang children)
because it is looking up an offset > 0.
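
For reference, the iovec count for a byte range reduces to simple
page-spanning arithmetic.  Below is a minimal user-space sketch of that
formula (the same expression appears in `abd_nr_pages_off()` in the
diff below); the `nr_pages_off()` helper and the 4K page size are
illustrative assumptions, not the kernel definitions:

```c
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4K pages */
#define PAGESIZE   (1UL << PAGE_SHIFT)

/* Number of pages (and thus bio iovec's) spanned by [pos, pos + size). */
static unsigned long
nr_pages_off(unsigned long pos, unsigned long size)
{
    return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
        (pos >> PAGE_SHIFT));
}

int
main(void)
{
    printf("%lu\n", nr_pages_off(0, 128 * 1024));   /* 32: page-aligned 128K */
    printf("%lu\n", nr_pages_off(512, 128 * 1024)); /* 33: same size, mid-page start */
    return (0);
}
```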

For gang ABD's, `abd_nr_pages_off()` returns the number of iovec's
needed for the first constituent, rather than the sum of all
constituents (within the requested range).  This always under-estimates
the required number of iovec's, which causes us to always need several
bio's.  The end result is that `__vdev_disk_physio()` is usually O(n^2)
for gang ABD's (and occasionally O(n^3), when more than 16 bio's are
needed).

This commit fixes `abd_nr_pages_off()`'s handling of gang ABD's so that
it correctly determines how many iovec's are needed, by adding up the
number of iovec's for each of the gang children in the requested range.
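
To illustrate the difference, here is a toy user-space sketch (the
16-child gang and the 4.5K child size are hypothetical, and the helpers
are stand-ins for the ZFS functions rather than the real code).  The
summation mirrors the structure of the fixed loop in the diff below,
while the first-child-only figure reflects the old behavior described
above:

```c
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4K pages */
#define PAGESIZE   (1UL << PAGE_SHIFT)
#define MIN(a, b)  ((a) < (b) ? (a) : (b))

/* Pages spanned by [pos, pos + size), as in abd_nr_pages_off() below. */
static unsigned long
nr_pages_off(unsigned long pos, unsigned long size)
{
    return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
        (pos >> PAGE_SHIFT));
}

int
main(void)
{
    /* Hypothetical gang: 16 children of 4.5K each, page-aligned buffers. */
    const unsigned long child_size = 4608;
    const int children = 16;
    unsigned long size = children * child_size;

    /* Old behavior, per the description above: first constituent only. */
    unsigned long first_only = nr_pages_off(0, MIN(size, child_size));

    /* Fixed behavior: add up each child's iovec count within the range. */
    unsigned long sum = 0;
    for (int i = 0; i < children && size != 0; i++) {
        unsigned long mysize = MIN(size, child_size);
        sum += nr_pages_off(0, mysize);
        size -= mysize;
    }

    printf("first child only:  %lu iovec's\n", first_only); /* 2  */
    printf("sum over children: %lu iovec's\n", sum);        /* 32 */
    return (0);
}
```

Each child that starts or ends mid-page costs an extra iovec, so only
the per-child sum gives a safe `nr_iovecs` for the bio allocation.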

Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #11536

module/os/linux/zfs/abd_os.c:

```diff
@@ -925,17 +925,28 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
 {
     unsigned long pos;
 
-    while (abd_is_gang(abd))
-        abd = abd_gang_get_offset(abd, &off);
+    if (abd_is_gang(abd)) {
+        unsigned long count = 0;
+
+        for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+            cabd != NULL && size != 0;
+            cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+            ASSERT3U(off, <, cabd->abd_size);
+            int mysize = MIN(size, cabd->abd_size - off);
+            count += abd_nr_pages_off(cabd, mysize, off);
+            size -= mysize;
+            off = 0;
+        }
+        return (count);
+    }
 
-    ASSERT(!abd_is_gang(abd));
     if (abd_is_linear(abd))
         pos = (unsigned long)abd_to_buf(abd) + off;
     else
         pos = ABD_SCATTER(abd).abd_offset + off;
 
-    return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
-        (pos >> PAGE_SHIFT);
+    return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
+        (pos >> PAGE_SHIFT));
 }
 
 static unsigned int
@@ -1010,7 +1021,6 @@ unsigned int
 abd_bio_map_off(struct bio *bio, abd_t *abd,
     unsigned int io_size, size_t off)
 {
-    int i;
     struct abd_iter aiter;
 
     ASSERT3U(io_size, <=, abd->abd_size - off);
@@ -1024,7 +1034,7 @@ abd_bio_map_off(struct bio *bio, abd_t *abd,
     abd_iter_init(&aiter, abd);
     abd_iter_advance(&aiter, off);
 
-    for (i = 0; i < bio->bi_max_vecs; i++) {
+    for (int i = 0; i < bio->bi_max_vecs; i++) {
         struct page *pg;
         size_t len, sgoff, pgoff;
         struct scatterlist *sg;
```

module/os/linux/zfs/vdev_disk.c:

```diff
@@ -350,19 +350,14 @@ vdev_disk_close(vdev_t *v)
 static dio_request_t *
 vdev_disk_dio_alloc(int bio_count)
 {
-    dio_request_t *dr;
-    int i;
-
-    dr = kmem_zalloc(sizeof (dio_request_t) +
+    dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
         sizeof (struct bio *) * bio_count, KM_SLEEP);
-    if (dr) {
-        atomic_set(&dr->dr_ref, 0);
-        dr->dr_bio_count = bio_count;
-        dr->dr_error = 0;
+    atomic_set(&dr->dr_ref, 0);
+    dr->dr_bio_count = bio_count;
+    dr->dr_error = 0;
 
-        for (i = 0; i < dr->dr_bio_count; i++)
-            dr->dr_bio[i] = NULL;
-    }
+    for (int i = 0; i < dr->dr_bio_count; i++)
+        dr->dr_bio[i] = NULL;
 
     return (dr);
 }
@@ -536,8 +531,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
     dio_request_t *dr;
     uint64_t abd_offset;
     uint64_t bio_offset;
-    int bio_size, bio_count = 16;
-    int i = 0, error = 0;
+    int bio_size;
+    int bio_count = 16;
+    int error = 0;
     struct blk_plug plug;
 
     /*
@@ -552,8 +548,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
 
 retry:
     dr = vdev_disk_dio_alloc(bio_count);
-    if (dr == NULL)
-        return (SET_ERROR(ENOMEM));
 
     if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
         bio_set_flags_failfast(bdev, &flags);
@@ -561,26 +555,28 @@ retry:
     dr->dr_zio = zio;
 
     /*
-     * When the IO size exceeds the maximum bio size for the request
-     * queue we are forced to break the IO in multiple bio's and wait
-     * for them all to complete.  Ideally, all pool users will set
-     * their volume block size to match the maximum request size and
-     * the common case will be one bio per vdev IO request.
+     * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
+     * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
+     * can cover at least 128KB and at most 1MB.  When the required number
+     * of iovec's exceeds this, we are forced to break the IO in multiple
+     * bio's and wait for them all to complete.  This is likely if the
+     * recordsize property is increased beyond 1MB.  The default
+     * bio_count=16 should typically accommodate the maximum-size zio of
+     * 16MB.
      */
 
     abd_offset = 0;
     bio_offset = io_offset;
-    bio_size   = io_size;
-    for (i = 0; i <= dr->dr_bio_count; i++) {
+    bio_size = io_size;
+    for (int i = 0; i <= dr->dr_bio_count; i++) {
 
         /* Finished constructing bio's for given buffer */
         if (bio_size <= 0)
             break;
 
         /*
-         * By default only 'bio_count' bio's per dio are allowed.
-         * However, if we find ourselves in a situation where more
-         * are needed we allocate a larger dio and warn the user.
+         * If additional bio's are required, we have to retry, but
+         * this should be rare - see the comment above.
          */
         if (dr->dr_bio_count == i) {
             vdev_disk_dio_free(dr);
@@ -622,9 +618,10 @@ retry:
     blk_start_plug(&plug);
 
     /* Submit all bio's associated with this dio */
-    for (i = 0; i < dr->dr_bio_count; i++)
+    for (int i = 0; i < dr->dr_bio_count; i++) {
         if (dr->dr_bio[i])
             vdev_submit_bio(dr->dr_bio[i]);
+    }
 
     if (dr->dr_bio_count > 1)
         blk_finish_plug(&plug);
```