Update linux vdev_disk interfaces to issue multiple bios if needed due to the maximum request size being smaller than the request size passed down from the spa

This commit is contained in:
Brian Behlendorf 2009-01-26 16:46:50 -08:00
parent 85366f145c
commit 3657ada547
1 changed files with 162 additions and 168 deletions

View File

@ -35,18 +35,21 @@
* Virtual device vector for disks. * Virtual device vector for disks.
*/ */
typedef struct dio_request { typedef struct dio_request {
struct completion dr_comp; struct completion dr_comp; /* Completion for sync IO */
atomic_t dr_ref; spinlock_t dr_lock; /* Completion lock */
zio_t *dr_zio; zio_t *dr_zio; /* Parent ZIO */
int dr_error; int dr_ref; /* Outstanding bio count */
int dr_rw; /* Read/Write */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[0]; /* Attached bio's */
} dio_request_t; } dio_request_t;
static int static int
vdev_disk_open_common(vdev_t *vd) vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{ {
struct block_device *vd_lh;
vdev_disk_t *dvd; vdev_disk_t *dvd;
struct block_device *bdev;
int mode = 0;
/* Must have a pathname and it must be absolute. */ /* Must have a pathname and it must be absolute. */
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
@ -68,77 +71,37 @@ vdev_disk_open_common(vdev_t *vd)
* munging of the flags to make then more agreeable to linux. * munging of the flags to make then more agreeable to linux.
* However, simply passing a 0 for now gets us W/R behavior. * However, simply passing a 0 for now gets us W/R behavior.
*/ */
bdev = open_bdev_excl(vd->vdev_path, mode, dvd); vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd);
if (IS_ERR(bdev)) { if (IS_ERR(vd_lh)) {
kmem_free(dvd, sizeof(vdev_disk_t)); kmem_free(dvd, sizeof(vdev_disk_t));
return -PTR_ERR(bdev); return -PTR_ERR(vd_lh);
} }
/* XXX: Long term validate stored dvd->vd_devid with a unique /* XXX: Long term validate stored dvd->vd_devid with a unique
* identifier read from the disk. * identifier read from the disk, likely EFI support.
*/ */
dvd->vd_lh = bdev;
vd->vdev_tsd = dvd; vd->vdev_tsd = dvd;
dvd->vd_lh = vd_lh;
return 0; /* Check if this is a whole device. When vd_lh->bd_contains ==
} * vd_lh we have a whole device and not simply a partition. */
vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh);
static int /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) vd->vdev_nowritecache = B_FALSE;
{
vdev_disk_t *dvd;
struct block_device *bdev;
int error;
error = vdev_disk_open_common(vd);
if (error)
return error;
dvd = vd->vdev_tsd;
bdev = dvd->vd_lh;
/* Determine the actual size of the device (in bytes) /* Determine the actual size of the device (in bytes)
* *
* XXX: SECTOR_SIZE is defined to 512b which may not be true for * XXX: SECTOR_SIZE is defined to 512b which may not be true for
* your device, we must use the actual hardware sector size. * your device, we must use the actual hardware sector size.
*/ */
*psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; *psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
/* Check if this is a whole device and if it is try and
* enable the write cache, it is OK if this fails. */
if (bdev->bd_contains == bdev) {
int wce = 1;
vd->vdev_wholedisk = 1ULL;
/* XXX: Different methods are needed for an IDE vs SCSI disk.
* Since we're not sure what type of disk this is try IDE,
* if that fails try SCSI.
*/
error = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce);
if (error)
dprintf("Unable to enable IDE WCE and SCSI WCE "
"not yet supported: %d\n", error);
/* XXX: To implement the scsi WCE enable we are going to need
* to use the SG_IO ioctl. But that means fully forming the
* SCSI command as the ioctl arg. To get this right I need
* to look at the sdparm source which does this.
*/
error = 0;
} else {
/* Must be a partition, that's fine. */
vd->vdev_wholedisk = 0;
}
/* Based on the minimum sector size set the block size */ /* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1; *ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;
/* Clear the nowritecache bit, causes vdev_reopen() to try again. */ return 0;
vd->vdev_nowritecache = B_FALSE;
return error;
} }
static void static void
@ -166,7 +129,7 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc)
{ {
dio_request_t *dr = bio->bi_private; dio_request_t *dr = bio->bi_private;
zio_t *zio; zio_t *zio;
int error; int i, error;
/* Fatal error but print some useful debugging before asserting */ /* Fatal error but print some useful debugging before asserting */
if (dr == NULL) { if (dr == NULL) {
@ -189,20 +152,38 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc)
if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
error = EIO; error = EIO;
zio = dr->dr_zio; spin_lock(&dr->dr_lock);
if (zio) {
zio->io_error = error;
zio_interrupt(zio);
}
dr->dr_ref--;
if (dr->dr_error == 0)
dr->dr_error = error; dr->dr_error = error;
atomic_dec(&dr->dr_ref);
if (bio_sync(bio)) { /*
* All bio's attached to this dio request have completed. This
* means it is safe to access the dio outside the spin lock, we
* are assured there will be no racing accesses.
*/
if (dr->dr_ref == 0) {
zio = dr->dr_zio;
spin_unlock(&dr->dr_lock);
/* Syncronous dio cleanup handled by waiter */
if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
complete(&dr->dr_comp); complete(&dr->dr_comp);
} else { } else {
kmem_free(dr, sizeof(dio_request_t)); for (i = 0; i < dr->dr_bio_count; i++)
bio_put(bio); bio_put(dr->dr_bio[i]);
kmem_free(dr, sizeof(dio_request_t) +
sizeof(struct bio *) * dr->dr_bio_count);
}
if (zio) {
zio->io_error = dr->dr_error;
zio_interrupt(zio);
}
} else {
spin_unlock(&dr->dr_lock);
} }
rc = 0; rc = 0;
@ -215,16 +196,17 @@ out:
} }
static struct bio * static struct bio *
__bio_map_vmem(struct request_queue *q, void *data, bio_map_virt(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask) unsigned int len, gfp_t gfp_mask)
{ {
unsigned long kaddr = (unsigned long)data; unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT;
unsigned int offset, i, data_len = len;
const int nr_pages = end - start; const int nr_pages = end - start;
int offset, i;
struct page *page; struct page *page;
struct bio *bio; struct bio *bio;
int rc;
bio = bio_alloc(gfp_mask, nr_pages); bio = bio_alloc(gfp_mask, nr_pages);
if (!bio) if (!bio)
@ -240,11 +222,8 @@ __bio_map_vmem(struct request_queue *q, void *data,
if (bytes > len) if (bytes > len)
bytes = len; bytes = len;
page = vmalloc_to_page(data); VERIFY3P(page = vmalloc_to_page(data), !=, NULL);
ASSERT(page); /* Expecting virtual linear address */ VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes);
if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
break;
data += bytes; data += bytes;
len -= bytes; len -= bytes;
@ -252,24 +231,7 @@ __bio_map_vmem(struct request_queue *q, void *data,
bytes = PAGE_SIZE; bytes = PAGE_SIZE;
} }
return bio; VERIFY3U(bio->bi_size, ==, data_len);
}
static struct bio *
bio_map_vmem(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
struct bio *bio;
bio = __bio_map_vmem(q, data, len, gfp_mask);
if (IS_ERR(bio))
return bio;
if (bio->bi_size != len) {
bio_put(bio);
return ERR_PTR(-EINVAL);
}
return bio; return bio;
} }
@ -281,7 +243,7 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
/* Cleanly map buffer we are passed in to a bio regardless /* Cleanly map buffer we are passed in to a bio regardless
* of if the buffer is a virtual or physical address. */ * of if the buffer is a virtual or physical address. */
if (kmem_virt(data)) if (kmem_virt(data))
bio = bio_map_vmem(q, data, len, gfp_mask); bio = bio_map_virt(q, data, len, gfp_mask);
else else
bio = bio_map_kern(q, data, len, gfp_mask); bio = bio_map_kern(q, data, len, gfp_mask);
@ -289,63 +251,92 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
} }
static int static int
__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf, __vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr,
size_t size, uint64_t offset, int flags) size_t kbuf_size, uint64_t kbuf_offset, int flags)
{ {
struct bio *bio; struct request_queue *q = vd_lh->bd_disk->queue;
dio_request_t *dr; dio_request_t *dr;
int rw, error = 0; caddr_t bio_ptr;
struct request_queue *q; uint64_t bio_offset;
int i, j, error = 0, bio_count, bio_size, dio_size;
ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */ ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0);
ASSERT3S(flags &
~((1 << BIO_RW) |
(1 << BIO_RW_SYNC) |
(1 << BIO_RW_FAILFAST)), ==, 0);
dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP); bio_count = (kbuf_size / (q->max_hw_sectors << 9)) + 1;
dio_size = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count;
dr = kmem_zalloc(dio_size, KM_SLEEP);
if (dr == NULL) if (dr == NULL)
return ENOMEM; return ENOMEM;
atomic_set(&dr->dr_ref, 0);
dr->dr_zio = zio;
dr->dr_error = 0;
q = vd_lh->bd_disk->queue;
bio = bio_map(q, kbuf, size, GFP_NOIO);
if (IS_ERR(bio)) {
kmem_free(dr, sizeof(dio_request_t));
return -PTR_ERR(bio);
}
bio->bi_bdev = vd_lh;
bio->bi_sector = offset / SECTOR_SIZE;
bio->bi_end_io = vdev_disk_physio_completion;
bio->bi_private = dr;
init_completion(&dr->dr_comp); init_completion(&dr->dr_comp);
atomic_inc(&dr->dr_ref); spin_lock_init(&dr->dr_lock);
dr->dr_ref = 0;
dr->dr_zio = zio;
dr->dr_rw = READ;
dr->dr_error = 0;
dr->dr_bio_count = bio_count;
if (flags & (1 << BIO_RW)) if (flags & (1 << BIO_RW))
rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;
else
rw = READ;
if (flags & (1 << BIO_RW_FAILFAST)) if (flags & (1 << BIO_RW_FAILFAST))
rw |= 1 << BIO_RW_FAILFAST; dr->dr_rw |= 1 << BIO_RW_FAILFAST;
ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) |
(1 << BIO_RW_FAILFAST)), ==, 0);
submit_bio(rw, bio);
/* /*
* On syncronous blocking requests we wait for the completion * When the IO size exceeds the maximum bio size for the request
* callback to wake us. Then we are responsible for freeing * queue we are forced to break the IO in multiple bio's and wait
* the dio_request_t as well as dropping the final bio reference. * for them all to complete. Ideally, all pool users will set
* their volume block size to match the maximum request size and
* the common case will be one bio per vdev IO request.
*/ */
if (bio_sync(bio)) { bio_ptr = kbuf_ptr;
bio_offset = kbuf_offset;
for (i = 0; i < dr->dr_bio_count; i++) {
bio_size = MIN(kbuf_size, q->max_hw_sectors << 9);
dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO);
if (IS_ERR(dr->dr_bio[i])) {
for (j = 0; j < i; j++)
bio_put(dr->dr_bio[j]);
error = -PTR_ERR(dr->dr_bio[i]);
kmem_free(dr, dio_size);
return error;
}
dr->dr_bio[i]->bi_bdev = vd_lh;
dr->dr_bio[i]->bi_sector = bio_offset >> 9;
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
dr->dr_bio[i]->bi_private = dr;
dr->dr_ref++;
bio_ptr += bio_size;
bio_offset += bio_size;
kbuf_size -= bio_size;
}
for (i = 0; i < dr->dr_bio_count; i++)
submit_bio(dr->dr_rw, dr->dr_bio[i]);
/*
* On syncronous blocking requests we wait for all bio the completion
* callbacks to run. We will be woken when the last callback runs
* for this dio. We are responsible for freeing the dio_request_t as
* well as the final reference on all attached bios.
*/
if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
wait_for_completion(&dr->dr_comp); wait_for_completion(&dr->dr_comp);
ASSERT(atomic_read(&dr->dr_ref) == 0); ASSERT(dr->dr_ref == 0);
error = dr->dr_error; error = dr->dr_error;
kmem_free(dr, sizeof(dio_request_t));
bio_put(bio); for (i = 0; i < dr->dr_bio_count; i++)
bio_put(dr->dr_bio[i]);
kmem_free(dr, dio_size);
} }
return error; return error;
@ -379,7 +370,6 @@ vdev_disk_io_start(zio_t *zio)
int flags, error; int flags, error;
if (zio->io_type == ZIO_TYPE_IOCTL) { if (zio->io_type == ZIO_TYPE_IOCTL) {
zio_vdev_io_bypass(zio);
/* XXPOLICY */ /* XXPOLICY */
if (!vdev_readable(vd)) { if (!vdev_readable(vd)) {
@ -453,8 +443,12 @@ vdev_disk_io_start(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_IO_RETRY) if (zio->io_flags & ZIO_FLAG_IO_RETRY)
flags |= (1 << BIO_RW_FAILFAST); flags |= (1 << BIO_RW_FAILFAST);
__vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data,
zio->io_size, zio->io_offset, flags); zio->io_size, zio->io_offset, flags);
if (error) {
zio->io_error = error;
return ZIO_PIPELINE_CONTINUE;
}
return ZIO_PIPELINE_STOP; return ZIO_PIPELINE_STOP;
} }
@ -470,7 +464,6 @@ vdev_disk_io_done(zio_t *zio)
*/ */
VERIFY3S(zio->io_error, ==, 0); VERIFY3S(zio->io_error, ==, 0);
#if 0 #if 0
vdev_t *vd = zio->io_vd;
vdev_disk_t *dvd = vd->vdev_tsd; vdev_disk_t *dvd = vd->vdev_tsd;
int state = DKIO_NONE; int state = DKIO_NONE;
@ -523,7 +516,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
if (IS_ERR(vd_lh)) if (IS_ERR(vd_lh))
return -PTR_ERR(vd_lh); return -PTR_ERR(vd_lh);
if ((s = i_size_read(vd_lh->bd_inode)) == 0) { s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
if (s == 0) {
close_bdev_excl(vd_lh); close_bdev_excl(vd_lh);
return EIO; return EIO;
} }