Update linux vdev_disk interfaces to issue multiple bios if needed due to the maximum request size being smaller than the request size passed down from the spa

2009-01-26 16:46:50 -08:00 · 2009-01-26 16:46:50 -08:00 · 3657ada547
parent 85366f145c
commit 3657ada547
1 changed files with 162 additions and 168 deletions
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@ -35,18 +35,21 @@
 * Virtual device vector for disks.
 */
 typedef struct dio_request {
-	struct completion dr_comp;
-	atomic_t dr_ref;
-	zio_t *dr_zio;
-	int dr_error;
+	struct completion	dr_comp;	/* Completion for sync IO */
+	spinlock_t		dr_lock;	/* Completion lock */
+	zio_t			*dr_zio;	/* Parent ZIO */
+	int			dr_ref;		/* Outstanding bio count */
+	int			dr_rw;		/* Read/Write */
+	int			dr_error;	/* Bio error */
+	int			dr_bio_count;	/* Count of bio's */
+        struct bio		*dr_bio[0];	/* Attached bio's */
 } dio_request_t;

 static int
-vdev_disk_open_common(vdev_t *vd)
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 {
+	struct block_device *vd_lh;
 	vdev_disk_t *dvd;
-	struct block_device *bdev;
-	int mode = 0;

 	/* Must have a pathname and it must be absolute. */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
@ -68,77 +71,37 @@ vdev_disk_open_common(vdev_t *vd)
 	 * munging of the flags to make then more agreeable to linux.
 	 * However, simply passing a 0 for now gets us W/R behavior.
 	 */
-	bdev = open_bdev_excl(vd->vdev_path, mode, dvd);
-	if (IS_ERR(bdev)) {
+	vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd);
+	if (IS_ERR(vd_lh)) {
 		kmem_free(dvd, sizeof(vdev_disk_t));
-		return -PTR_ERR(bdev);
+		return -PTR_ERR(vd_lh);
 	}

 	/* XXX: Long term validate stored dvd->vd_devid with a unique
-	 * identifier read from the disk.
+	 * identifier read from the disk, likely EFI support.
 	 */

-	dvd->vd_lh = bdev;
 	vd->vdev_tsd = dvd;
+	dvd->vd_lh = vd_lh;

-	return 0;
-}
+	/* Check if this is a whole device.  When vd_lh->bd_contains ==
+	 * vd_lh we have a whole device and not simply a partition. */
+	vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh);

-static int
-vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
-{
-	vdev_disk_t *dvd;
-	struct block_device *bdev;
-	int error;
-
-	error = vdev_disk_open_common(vd);
-	if (error)
-		return error;
-
-	dvd = vd->vdev_tsd;
-	bdev = dvd->vd_lh;
+	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+	vd->vdev_nowritecache = B_FALSE;

 	/* Determine the actual size of the device (in bytes)
 	 *
 	 * XXX: SECTOR_SIZE is defined to 512b which may not be true for
 	 * your device, we must use the actual hardware sector size.
 	 */
-	*psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE;
-
-	/* Check if this is a whole device and if it is try and
-	 * enable the write cache, it is OK if this fails. */
-	if (bdev->bd_contains == bdev) {
-		int wce = 1;
-
-		vd->vdev_wholedisk = 1ULL;
-
-		/* XXX: Different methods are needed for an IDE vs SCSI disk.
-		 * Since we're not sure what type of disk this is try IDE,
-		 * if that fails try SCSI.
-		 */
-		error = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce);
-		if (error)
-			dprintf("Unable to enable IDE WCE and SCSI WCE "
-				"not yet supported: %d\n", error);
-
-		/* XXX: To implement the scsi WCE enable we are going to need
-		 * to use the SG_IO ioctl.  But that means fully forming the
-		 * SCSI command as the ioctl arg.  To get this right I need
-		 * to look at the sdparm source which does this.
-		 */
-		error = 0;
-	} else {
-		/* Must be a partition, that's fine. */
-		vd->vdev_wholedisk = 0;
-	}
+	*psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;

 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;

-	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
-	vd->vdev_nowritecache = B_FALSE;
-
-	return error;
+	return 0;
 }

 static void
@ -164,16 +127,16 @@ static int
 vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc)
 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 {
-        dio_request_t *dr = bio->bi_private;
+	dio_request_t *dr = bio->bi_private;
 	zio_t *zio;
-	int error;
+	int i, error;

 	/* Fatal error but print some useful debugging before asserting */
 	if (dr == NULL) {
 		printk("FATAL: bio->bi_private == NULL\n"
 		       "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
-                       "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n",
-                       bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
+		       "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n",
+		       bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
 		       bio->bi_idx, bio->bi_size, bio->bi_end_io,
 		       atomic_read(&bio->bi_cnt));
 		SBUG();
@ -189,20 +152,38 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc)
 	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = EIO;

-	zio = dr->dr_zio;
-	if (zio) {
-		zio->io_error = error;
-		zio_interrupt(zio);
-	}
+	spin_lock(&dr->dr_lock);

-	dr->dr_error = error;
-        atomic_dec(&dr->dr_ref);
+	dr->dr_ref--;
+	if (dr->dr_error == 0)
+		dr->dr_error = error;

-        if (bio_sync(bio)) {
-		complete(&dr->dr_comp);
+	/*
+	 * All bio's attached to this dio request have completed.  This
+	 * means it is safe to access the dio outside the spin lock, we
+	 * are assured there will be no racing accesses.
+	 */
+	if (dr->dr_ref == 0) {
+		zio = dr->dr_zio;
+		spin_unlock(&dr->dr_lock);
+
+		/* Syncronous dio cleanup handled by waiter */
+		if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
+			complete(&dr->dr_comp);
+		} else {
+			for (i = 0; i < dr->dr_bio_count; i++)
+				bio_put(dr->dr_bio[i]);
+
+			kmem_free(dr, sizeof(dio_request_t) +
+			          sizeof(struct bio *) * dr->dr_bio_count);
+		}
+
+		if (zio) {
+			zio->io_error = dr->dr_error;
+			zio_interrupt(zio);
+		}
 	} else {
-		kmem_free(dr, sizeof(dio_request_t));
-	        bio_put(bio);
+		spin_unlock(&dr->dr_lock);
 	}

 	rc = 0;
@ -215,62 +196,43 @@ out:
 }

 static struct bio *
-__bio_map_vmem(struct request_queue *q, void *data,
+bio_map_virt(struct request_queue *q, void *data,
               unsigned int len, gfp_t gfp_mask)
 {
-        unsigned long kaddr = (unsigned long)data;
-        unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        unsigned long start = kaddr >> PAGE_SHIFT;
-        const int nr_pages = end - start;
-        int offset, i;
+	unsigned long kaddr = (unsigned long)data;
+	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned long start = kaddr >> PAGE_SHIFT;
+	unsigned int offset, i, data_len = len;
+	const int nr_pages = end - start;
 	struct page *page;
-        struct bio *bio;
-
-        bio = bio_alloc(gfp_mask, nr_pages);
-        if (!bio)
-                return ERR_PTR(-ENOMEM);
-
-        offset = offset_in_page(kaddr);
-        for (i = 0; i < nr_pages; i++) {
-                unsigned int bytes = PAGE_SIZE - offset;
-
-                if (len <= 0)
-                        break;
-
-                if (bytes > len)
-                        bytes = len;
-
-		page = vmalloc_to_page(data);
-		ASSERT(page); /* Expecting virtual linear address */
-
-                if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
-                        break;
-
-                data += bytes;
-                len -= bytes;
-                offset = 0;
-		bytes = PAGE_SIZE;
-        }
-
-        return bio;
-}
-
-static struct bio *
-bio_map_vmem(struct request_queue *q, void *data,
-             unsigned int len, gfp_t gfp_mask)
-{
 	struct bio *bio;
+	int rc;

-	bio = __bio_map_vmem(q, data, len, gfp_mask);
-	if (IS_ERR(bio))
-		return bio;
+	bio = bio_alloc(gfp_mask, nr_pages);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);

-	if (bio->bi_size != len) {
-		bio_put(bio);
-		return ERR_PTR(-EINVAL);
+	offset = offset_in_page(kaddr);
+	for (i = 0; i < nr_pages; i++) {
+		unsigned int bytes = PAGE_SIZE - offset;
+
+		if (len <= 0)
+			break;
+
+		if (bytes > len)
+			bytes = len;
+
+		VERIFY3P(page = vmalloc_to_page(data), !=, NULL);
+		VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes);
+
+		data += bytes;
+		len -= bytes;
+		offset = 0;
+		bytes = PAGE_SIZE;
 	}

-	return bio;
+	VERIFY3U(bio->bi_size, ==, data_len);
+        return bio;
 }

 static struct bio *
@ -281,7 +243,7 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
 	/* Cleanly map buffer we are passed in to a bio regardless
 	 * of if the buffer is a virtual or physical address. */
 	if (kmem_virt(data))
-		bio = bio_map_vmem(q, data, len, gfp_mask);
+		bio = bio_map_virt(q, data, len, gfp_mask);
 	else
 		bio = bio_map_kern(q, data, len, gfp_mask);

@ -289,63 +251,92 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
 }

 static int
-__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf,
-                   size_t size, uint64_t offset, int flags)
+__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr,
+                   size_t kbuf_size, uint64_t kbuf_offset, int flags)
 {
-	struct bio *bio;
+	struct request_queue *q = vd_lh->bd_disk->queue;
        dio_request_t *dr;
-	int rw, error = 0;
-	struct request_queue *q;
+	caddr_t bio_ptr;
+	uint64_t bio_offset;
+	int i, j, error = 0, bio_count, bio_size, dio_size;

-	ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */
+	ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0);
+	ASSERT3S(flags &
+		 ~((1 << BIO_RW) |
+		   (1 << BIO_RW_SYNC) |
+		   (1 << BIO_RW_FAILFAST)), ==, 0);

-	dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP);
+	bio_count = (kbuf_size / (q->max_hw_sectors << 9)) + 1;
+	dio_size  = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count;
+	dr = kmem_zalloc(dio_size, KM_SLEEP);
 	if (dr == NULL)
 		return ENOMEM;

-	atomic_set(&dr->dr_ref, 0);
-	dr->dr_zio = zio;
-	dr->dr_error = 0;
-	q = vd_lh->bd_disk->queue;
-
-	bio = bio_map(q, kbuf, size, GFP_NOIO);
-	if (IS_ERR(bio)) {
-		kmem_free(dr, sizeof(dio_request_t));
-		return -PTR_ERR(bio);
-	}
-
-	bio->bi_bdev = vd_lh;
-	bio->bi_sector = offset / SECTOR_SIZE;
-	bio->bi_end_io = vdev_disk_physio_completion;
-	bio->bi_private = dr;
-
 	init_completion(&dr->dr_comp);
-	atomic_inc(&dr->dr_ref);
+	spin_lock_init(&dr->dr_lock);
+	dr->dr_ref = 0;
+	dr->dr_zio = zio;
+	dr->dr_rw = READ;
+	dr->dr_error = 0;
+	dr->dr_bio_count = bio_count;

 	if (flags & (1 << BIO_RW))
-		rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;
-	else
-		rw = READ;
+		dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;

 	if (flags & (1 << BIO_RW_FAILFAST))
-		rw |= 1 << BIO_RW_FAILFAST;
-
-	ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) |
-	         (1 << BIO_RW_FAILFAST)), ==, 0);
-
-	submit_bio(rw, bio);
+		dr->dr_rw |= 1 << BIO_RW_FAILFAST;

 	/*
-	 * On syncronous blocking requests we wait for the completion
-	 * callback to wake us.  Then we are responsible for freeing
-	 * the dio_request_t as well as dropping the final bio reference.
+	 * When the IO size exceeds the maximum bio size for the request
+	 * queue we are forced to break the IO in multiple bio's and wait
+	 * for them all to complete.  Ideally, all pool users will set
+	 * their volume block size to match the maximum request size and
+	 * the common case will be one bio per vdev IO request.
 	 */
-	if (bio_sync(bio)) {
+	bio_ptr = kbuf_ptr;
+	bio_offset = kbuf_offset;
+	for (i = 0; i < dr->dr_bio_count; i++) {
+		bio_size = MIN(kbuf_size, q->max_hw_sectors << 9);
+
+		dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO);
+		if (IS_ERR(dr->dr_bio[i])) {
+			for (j = 0; j < i; j++)
+				bio_put(dr->dr_bio[j]);
+
+			error = -PTR_ERR(dr->dr_bio[i]);
+			kmem_free(dr, dio_size);
+			return error;
+		}
+
+		dr->dr_bio[i]->bi_bdev = vd_lh;
+		dr->dr_bio[i]->bi_sector = bio_offset >> 9;
+		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+		dr->dr_bio[i]->bi_private = dr;
+		dr->dr_ref++;
+
+		bio_ptr    += bio_size;
+		bio_offset += bio_size;
+		kbuf_size  -= bio_size;
+	}
+
+	for (i = 0; i < dr->dr_bio_count; i++)
+		submit_bio(dr->dr_rw, dr->dr_bio[i]);
+
+	/*
+	 * On syncronous blocking requests we wait for all bio the completion
+	 * callbacks to run.  We will be woken when the last callback runs
+	 * for this dio.  We are responsible for freeing the dio_request_t as
+	 * well as the final reference on all attached bios.
+	 */
+	if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
 		wait_for_completion(&dr->dr_comp);
-		ASSERT(atomic_read(&dr->dr_ref) == 0);
+		ASSERT(dr->dr_ref == 0);
 		error = dr->dr_error;
-		kmem_free(dr, sizeof(dio_request_t));
-	        bio_put(bio);
+
+		for (i = 0; i < dr->dr_bio_count; i++)
+			bio_put(dr->dr_bio[i]);
+
+		kmem_free(dr, dio_size);
 	}

 	return error;
@ -379,7 +370,6 @@ vdev_disk_io_start(zio_t *zio)
 	int flags, error;

 	if (zio->io_type == ZIO_TYPE_IOCTL) {
-		zio_vdev_io_bypass(zio);

 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
@ -453,8 +443,12 @@ vdev_disk_io_start(zio_t *zio)
 	if (zio->io_flags & ZIO_FLAG_IO_RETRY)
 		flags |= (1 << BIO_RW_FAILFAST);

-	__vdev_disk_physio(dvd->vd_lh, zio, zio->io_data,
-			   zio->io_size, zio->io_offset, flags);
+	error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data,
+		                   zio->io_size, zio->io_offset, flags);
+	if (error) {
+		zio->io_error = error;
+		return ZIO_PIPELINE_CONTINUE;
+	}

 	return ZIO_PIPELINE_STOP;
 }
@ -470,7 +464,6 @@ vdev_disk_io_done(zio_t *zio)
 	 */
 	VERIFY3S(zio->io_error, ==, 0);
 #if 0
-		vdev_t *vd = zio->io_vd;
 		vdev_disk_t *dvd = vd->vdev_tsd;
 		int state = DKIO_NONE;

@ -523,7 +516,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 	if (IS_ERR(vd_lh))
 		return -PTR_ERR(vd_lh);

-	if ((s = i_size_read(vd_lh->bd_inode)) == 0) {
+	s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
+	if (s == 0) {
 		close_bdev_excl(vd_lh);
 		return EIO;
 	}