Add linux kernel disk support

Native Linux vdev disk interfaces Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
2010-08-26 11:45:02 -07:00 · 2010-08-26 11:45:02 -07:00 · 60101509ee
parent 325f023544
commit 60101509ee
27 changed files with 2575 additions and 116 deletions
--- a/module/zcommon/include/sys/fs/zfs.h
+++ b/module/zcommon/include/sys/fs/zfs.h
@ -701,12 +701,12 @@ typedef struct ddt_histogram {
 #define	ZFS_DEV		"/dev/zfs"
 /* general zvol path */
-#define	ZVOL_DIR		"/dev/zvol"
+#define	ZVOL_DIR	"/dev"
-/* expansion */
+
-#define	ZVOL_PSEUDO_DEV		"/devices/pseudo/zfs@0:"
+#define	ZVOL_MAJOR		230
-/* for dump and swap */
+#define	ZVOL_MINOR_BITS		4
-#define	ZVOL_FULL_DEV_DIR	ZVOL_DIR "/dsk/"
+#define	ZVOL_MINOR_MASK		((1U << ZVOL_MINOR_BITS) - 1)
-#define	ZVOL_FULL_RDEV_DIR	ZVOL_DIR "/rdsk/"
+#define	ZVOL_MINORS		(1 << 4)
 #define	ZVOL_PROP_NAME		"name"
 #define	ZVOL_DEFAULT_BLOCKSIZE	8192
@ -740,6 +740,8 @@ typedef enum zfs_ioc {
 	ZFS_IOC_DATASET_LIST_NEXT,
 	ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	ZFS_IOC_SET_PROP,
 	ZFS_IOC_CREATE_MINOR,
 	ZFS_IOC_REMOVE_MINOR,
 	ZFS_IOC_CREATE,
 	ZFS_IOC_DESTROY,
 	ZFS_IOC_ROLLBACK,
--- a/module/zcommon/zfs_namecheck.c
+++ b/module/zcommon/zfs_namecheck.c
@ -142,9 +142,22 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
 	 * which is the same as MAXNAMELEN used in the kernel.
 	 * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all
 	 * places using MAXNAMELEN.
 	 *
 	 * When HAVE_KOBJ_NAME_LEN is defined the maximum safe kobject name
 	 * length is 20 bytes.  This 20 bytes is broken down as follows to
 	 * provide a maximum safe <pool>/<dataset>[@snapshot] length of only
 	 * 18 bytes.  To ensure bytes are left for <dataset>[@snapshot] the
 	 * <pool> portion is further limited to 8 bytes.  For 2.6.27 and
 	 * newer kernels this limit is set to MAXNAMELEN.
 	 *
 	 *   <pool>/<dataset> + <partition> + <newline>
 	 *   (18)             + (1)         + (1)
 	 */
-
+#ifdef HAVE_KOBJ_NAME_LEN
 	if (strlen(path) > 18) {
 #else
 	if (strlen(path) >= MAXNAMELEN) {
 #endif /* HAVE_KOBJ_NAME_LEN */
 		if (why)
 			*why = NAME_ERR_TOOLONG;
 		return (-1);
@ -303,8 +316,22 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
 	 * which is the same as MAXNAMELEN used in the kernel.
 	 * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all
 	 * places using MAXNAMELEN.
 	 *
 	 * When HAVE_KOBJ_NAME_LEN is defined the maximum safe kobject name
 	 * length is 20 bytes.  This 20 bytes is broken down as follows to
 	 * provide a maximum safe <pool>/<dataset>[@snapshot] length of only
 	 * 18 bytes.  To ensure bytes are left for <dataset>[@snapshot] the
 	 * <pool> portion is further limited to 8 bytes.  For 2.6.27 and
 	 * newer kernels this limit is set to MAXNAMELEN.
 	 *
 	 *   <pool>/<dataset> + <partition> + <newline>
 	 *   (18)             + (1)         + (1)
 	 */
 #ifdef HAVE_KOBJ_NAME_LEN
 	if (strlen(pool) > 8) {
 #else
 	if (strlen(pool) >= MAXNAMELEN) {
 #endif /* HAVE_KOBJ_NAME_LEN */
 		if (why)
 			*why = NAME_ERR_TOOLONG;
 		return (-1);
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@ -53,6 +53,7 @@ ${MODULE}-objs += uberblock.o
 ${MODULE}-objs += unique.o
 ${MODULE}-objs += vdev.o
 ${MODULE}-objs += vdev_cache.o
 ${MODULE}-objs += vdev_disk.o
 ${MODULE}-objs += vdev_file.o
 ${MODULE}-objs += vdev_label.o
 ${MODULE}-objs += vdev_mirror.o
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@ -793,7 +793,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 		else
 			dmu_buf_will_dirty(db, tx);
-		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
+		(void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
@ -975,85 +975,126 @@ xuio_stat_wbuf_nocopy()
 }
 #ifdef _KERNEL
-int
+
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+/*
 * Copy up to size bytes between arg_buf and req based on the data direction
 * described by the req.  If an entire req's data cannot be transferred, the
 * req is updated such that its current index and bv offsets correctly
 * reference any residual data which could not be copied.  The number of
 * bytes actually copied is stored in *offset; the return value itself is
 * always zero.
 */
 static int
 dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req)
 {
 	struct req_iterator iter;
 	struct bio_vec *bv;
 	/* *offset is the running count of bytes moved so far. */
 	*offset = 0;
 	rq_for_each_segment(bv, req, iter) {
 		char *seg_buf;
 		char *arg_pos;
 		int tocpy;
 		/* Stop once the caller's buffer has been fully consumed. */
 		ASSERT3S(*offset, <=, size);
 		if (*offset == size)
 			break;
 		/* Segments already drained by an earlier call are skipped. */
 		if (bv->bv_len == 0)
 			continue;
 		/* Never copy more than this segment or the buffer holds. */
 		tocpy = MIN(bv->bv_len, size - *offset);
 		ASSERT3S(tocpy, >=, 0);
 		seg_buf = page_address(bv->bv_page) + bv->bv_offset;
 		ASSERT3P(seg_buf, !=, NULL);
 		arg_pos = (char *)arg_buf + *offset;
 		/* WRITE requests fill arg_buf; all others fill the segment. */
 		if (rq_data_dir(req) == WRITE)
 			memcpy(arg_pos, seg_buf, tocpy);
 		else
 			memcpy(seg_buf, arg_pos, tocpy);
 		/* Advance both sides so residual data stays addressable. */
 		*offset += tocpy;
 		bv->bv_offset += tocpy;
 		bv->bv_len -= tocpy;
 	}
 	return 0;
 }
 int
 dmu_read_req(objset_t *os, uint64_t object, struct request *req)
 {
 	uint64_t size = blk_rq_bytes(req);
 	uint64_t offset = blk_rq_pos(req) << 9;
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 	xuio_t *xuio = NULL;
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
-	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
+	err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG,
-	    &numbufs, &dbp);
+				 &numbufs, &dbp);
 	if (err)
 		return (err);
 	if (uio->uio_extflg == UIO_XUIO)
 		xuio = (xuio_t *)uio;
 	for (i = 0; i < numbufs; i++) {
-		int tocpy;
+		int tocpy, didcpy, bufoff;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
-		ASSERT(size > 0);
+		bufoff = offset - db->db_offset;
 		ASSERT3S(bufoff, >=, 0);
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 		if (tocpy == 0)
 			break;
-		if (xuio) {
+		err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req);
-			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+
-			arc_buf_t *dbuf_abuf = dbi->db_buf;
+		if (didcpy < tocpy)
-			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
+			err = EIO;
 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
 			if (!err) {
 				uio->uio_resid -= tocpy;
 				uio->uio_loffset += tocpy;
 			}
 			if (abuf == dbuf_abuf)
 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 		} else {
 			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
 		}
 		if (err)
 			break;
 		size -= tocpy;
 		offset += didcpy;
 		err = 0;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
-static int
+int
-dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
+dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx)
 {
 	uint64_t size = blk_rq_bytes(req);
 	uint64_t offset = blk_rq_pos(req) << 9;
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	int i;
-	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+	if (size == 0)
-	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+		return (0);
 	err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
 				 &numbufs, &dbp);
 	if (err)
 		return (err);
 	for (i = 0; i < numbufs; i++) {
-		int tocpy;
+		int tocpy, didcpy, bufoff;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
-		ASSERT(size > 0);
+		bufoff = offset - db->db_offset;
 		ASSERT3S(bufoff, >=, 0);
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 		if (tocpy == 0)
 			break;
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
@ -1062,28 +1103,28 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 		else
 			dmu_buf_will_dirty(db, tx);
-		/*
+		err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req);
 		 * XXX uiomove could block forever (eg. nfs-backed
 		 * pages).  There needs to be a uiolockdown() function
 		 * to lock the pages in memory, so that uiomove won't
 		 * block.
 		 */
 		err = uiomove((char *)db->db_data + bufoff, tocpy,
 		    UIO_WRITE, uio);
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 		if (didcpy < tocpy)
 			err = EIO;
 		if (err)
 			break;
 		size -= tocpy;
 		offset += didcpy;
 		err = 0;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 #endif
 #ifdef HAVE_ZPL
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
--- a/module/zfs/include/sys/blkdev.h
+++ b/module/zfs/include/sys/blkdev.h
@ -0,0 +1,211 @@
 /*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
 /*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 */
 #ifndef	_SYS_BLKDEV_H
 #define	_SYS_BLKDEV_H
 #ifdef _KERNEL
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #ifndef HAVE_FMODE_T
 typedef unsigned __bitwise__ fmode_t;
 #endif /* HAVE_FMODE_T */
 #ifndef HAVE_BLK_FETCH_REQUEST
 static inline struct request *
 blk_fetch_request(struct request_queue *q)
 {
 	struct request *req;
 	req = elv_next_request(q);
 	if (req)
 		blkdev_dequeue_request(req);
 	return req;
 }
 #endif /* HAVE_BLK_FETCH_REQUEST */
 #ifndef HAVE_BLK_REQUEUE_REQUEST
 /*
  * Fallback for kernels where HAVE_BLK_REQUEUE_REQUEST is not defined:
  * forwards directly to elv_requeue_request().  The queue is spelled
  * 'struct request_queue *' to match the other compatibility helpers in
  * this header rather than relying on the request_queue_t typedef.
  */
 static inline void
 blk_requeue_request(struct request_queue *q, struct request *req)
 {
 	elv_requeue_request(q, req);
 }
 #endif /* HAVE_BLK_REQUEUE_REQUEST */
 #ifndef HAVE_BLK_END_REQUEST
 static inline bool
 __blk_end_request(struct request *req, int error, unsigned int nr_bytes)
 {
 	LIST_HEAD(list);
 	/*
 	 * Request has already been dequeued but 2.6.18 version of
 	 * end_request() unconditionally dequeues the request so we
 	 * add it to a local list to prevent hitting the BUG_ON.
 	 */
 	list_add(&req->queuelist, &list);
 	/*
 	 * The old API required the driver to end each segment and not
 	 * the entire request.  In our case we always need to end the
 	 * entire request partial requests are not supported.
 	 */
 	req->hard_cur_sectors = nr_bytes >> 9;
 	end_request(req, ((error == 0) ? 1 : error));
 	return 0;
 }
 static inline bool
 blk_end_request(struct request *req, int error, unsigned int nr_bytes)
 {
 	struct request_queue *q = req->q;
 	bool rc;
 	spin_lock_irq(q->queue_lock);
 	rc = __blk_end_request(req, error, nr_bytes);
 	spin_unlock_irq(q->queue_lock);
 	return rc;
 }
 #else
 # ifdef HAVE_BLK_END_REQUEST_GPL_ONLY
 /*
 * Define required to avoid conflicting 2.6.29 non-static prototype for a
 * GPL-only version of the helper.  As of 2.6.31 the helper is available
 * to non-GPL modules and is not explicitly exported GPL-only.
 */
 # define __blk_end_request __blk_end_request_x
 # define blk_end_request blk_end_request_x
 /*
  * Private replacement used when the kernel's helper is GPL-only.  Always
  * ends the whole request through end_request() and always returns 0.
  */
 static inline bool
 __blk_end_request_x(struct request *req, int error, unsigned int nr_bytes)
 {
 	/* end_request() takes 1 for success, otherwise the error value. */
 	int uptodate = (error != 0) ? error : 1;
 	/*
 	 * The old API required the driver to end each segment rather than
 	 * the entire request.  Here the entire request is always ended;
 	 * partial requests are not supported.
 	 */
 	req->hard_cur_sectors = nr_bytes >> 9;
 	end_request(req, uptodate);
 	return 0;
 }
 static inline bool
 blk_end_request_x(struct request *req, int error, unsigned int nr_bytes)
 {
 	struct request_queue *q = req->q;
 	bool rc;
 	spin_lock_irq(q->queue_lock);
 	rc = __blk_end_request_x(req, error, nr_bytes);
 	spin_unlock_irq(q->queue_lock);
 	return rc;
 }
 # endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */
 #endif /* HAVE_BLK_END_REQUEST */
 #ifndef HAVE_BLK_RQ_POS
 /*
  * Fallback accessor used when HAVE_BLK_RQ_POS is not defined: returns the
  * request's 'sector' field.
  */
 static inline sector_t
 blk_rq_pos(struct request *req)
 {
 	return (req->sector);
 }
 #endif /* HAVE_BLK_RQ_POS */
 #ifndef HAVE_BLK_RQ_SECTORS
 /*
  * Fallback accessor used when HAVE_BLK_RQ_SECTORS is not defined: returns
  * the request's 'nr_sectors' field.
  */
 static inline unsigned int
 blk_rq_sectors(struct request *req)
 {
 	return (req->nr_sectors);
 }
 #endif /* HAVE_BLK_RQ_SECTORS */
 #if !defined(HAVE_BLK_RQ_BYTES) || defined(HAVE_BLK_RQ_BYTES_GPL_ONLY)
 /*
 * Define required to avoid conflicting 2.6.29 non-static prototype for a
 * GPL-only version of the helper.  As of 2.6.31 the helper is available
 * to non-GPL modules in the form of a static inline in the header.
 */
 #define blk_rq_bytes __blk_rq_bytes
 /*
  * Request length in bytes, derived from blk_rq_sectors() by shifting the
  * sector count left by 9 (i.e. multiplying by 512).
  */
 static inline unsigned int
 __blk_rq_bytes(struct request *req)
 {
 	unsigned int nr_sectors = blk_rq_sectors(req);
 	return (nr_sectors << 9);
 }
 #endif /* !HAVE_BLK_RQ_BYTES || HAVE_BLK_RQ_BYTES_GPL_ONLY */
 #ifndef HAVE_GET_DISK_RO
 static inline int
 get_disk_ro(struct gendisk *disk)
 {
 	int policy = 0;
 	if (disk->part[0])
 		policy = disk->part[0]->policy;
 	return policy;
 }
 #endif /* HAVE_GET_DISK_RO */
 #ifndef HAVE_RQ_IS_SYNC
 /*
  * Fallback used when HAVE_RQ_IS_SYNC is not defined: reports whether the
  * REQ_RW_SYNC bit is set in the request's flags.  The result is collapsed
  * to exactly 0 or 1 so the answer stays correct even if 'bool' is provided
  * as a narrow integer typedef rather than _Bool.
  */
 static inline bool
 rq_is_sync(struct request *req)
 {
 	return (!!(req->flags & REQ_RW_SYNC));
 }
 #endif /* HAVE_RQ_IS_SYNC */
 #ifndef HAVE_RQ_FOR_EACH_SEGMENT
 /*
  * Fallback request iterator used when HAVE_RQ_FOR_EACH_SEGMENT is not
  * defined.  'bio' tracks the current bio in the request's chain and 'i' is
  * the index handed to bio_for_each_segment().
  */
 struct req_iterator {
 	int i;
 	struct bio *bio;
 };
 /*
  * Macro arguments are parenthesized so that expressions such as '&rq' or
  * 'p + 1' expand correctly.  Note that __rq_for_each_bio() begins with a
  * bare 'if', so callers should brace the loop body to avoid a dangling
  * 'else'.
  */
 # define for_each_bio(_bio)              \
 	for (; (_bio); (_bio) = (_bio)->bi_next)
 # define __rq_for_each_bio(_bio, rq)    \
 	if (((rq)->bio))                  \
 		for ((_bio) = (rq)->bio; (_bio); (_bio) = (_bio)->bi_next)
 # define rq_for_each_segment(bvl, _rq, _iter)                   \
 	__rq_for_each_bio((_iter).bio, _rq)                       \
 		bio_for_each_segment(bvl, (_iter).bio, (_iter).i)
 #endif /* HAVE_RQ_FOR_EACH_SEGMENT */
 #ifndef DISK_NAME_LEN
 #define DISK_NAME_LEN	32
 #endif /* DISK_NAME_LEN */
 #endif /* KERNEL */
 #endif	/* _SYS_BLKDEV_H */
--- a/module/zfs/include/sys/dmu.h
+++ b/module/zfs/include/sys/dmu.h
@ -41,13 +41,14 @@
 #include <sys/cred.h>
 #include <sys/time.h>
 #include <sys/uio.h>
 #ifdef _KERNEL
 #include <sys/blkdev.h>
 #endif
 #ifdef	__cplusplus
 extern "C" {
 #endif
 struct uio;
 struct xuio;
 struct page;
 struct vnode;
 struct spa;
@ -512,13 +513,14 @@ void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
-int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+#ifdef _KERNEL
-int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
+int dmu_read_req(objset_t *os, uint64_t object, struct request *req);
-    dmu_tx_t *tx);
+int dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx);
-int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+#endif
-    dmu_tx_t *tx);
+#ifdef HAVE_ZPL
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, struct page *pp, dmu_tx_t *tx);
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
--- a/module/zfs/include/sys/spa_impl.h
+++ b/module/zfs/include/sys/spa_impl.h
@ -210,7 +210,7 @@ struct spa {
 	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
 	kcondvar_t	spa_proc_cv;		/* spa_proc_state transitions */
 	spa_proc_state_t spa_proc_state;	/* see definition */
-	struct proc	*spa_proc;		/* "zpool-poolname" process */
+	proc_t		*spa_proc;		/* "zpool-poolname" process */
 	uint64_t	spa_did;		/* if procp != p0, did of t1 */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
--- a/module/zfs/include/sys/vdev_disk.h
+++ b/module/zfs/include/sys/vdev_disk.h
@ -0,0 +1,97 @@
 /*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
 /*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 */
 #ifndef _SYS_VDEV_DISK_H
 #define _SYS_VDEV_DISK_H
 #ifdef	__cplusplus
 extern "C" {
 #endif
 #ifdef _KERNEL
 #include <sys/vdev.h>
 #include <sys/ddi.h>
 #include <sys/sunldi.h>
 #include <sys/sunddi.h>
 /*
  * Private per-vdev state for a disk-backed vdev.
  */
 typedef struct vdev_disk {
 	/* NOTE(review): presumably carried over from Solaris — confirm use */
 	ddi_devid_t		vd_devid;
 	/* NOTE(review): presumably carried over from Solaris — confirm use */
 	char			*vd_minor;
 	/* Underlying Linux block device */
 	struct block_device	*vd_bdev;
 } vdev_disk_t;
 extern int vdev_disk_physio(struct block_device *, caddr_t,
 			    size_t, uint64_t, int);
 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 /*
  * 2.6.24 API change
  *
  * BIO_END_IO_PROTO(fn, x, y, z) emits the signature of a static bio
  * completion callback named fn.  With HAVE_2ARGS_BIO_END_IO_T the callback
  * is 'void fn(struct bio *x, int z)' and y is unused; otherwise it is
  * 'int fn(struct bio *x, unsigned int y, int z)'.  BIO_END_IO_RETURN(rc)
  * expands to a bare 'return' in the first case and 'return rc' in the
  * second.
  */
 #ifdef HAVE_2ARGS_BIO_END_IO_T
 # define BIO_END_IO_PROTO(fn, x, y, z)	static void fn(struct bio *x, int z)
 # define BIO_END_IO_RETURN(rc)		return
 #else
 # define BIO_END_IO_PROTO(fn, x, y, z)	static int fn(struct bio *x, \
 					              unsigned int y, int z)
 # define BIO_END_IO_RETURN(rc)		return rc
 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 /*
  * 2.6.29 API change
  *
  * DIO_RW_SYNCIO names the synchronous-IO bit: BIO_RW_SYNCIO when
  * HAVE_BIO_RW_SYNCIO is defined, BIO_RW_SYNC otherwise.
  */
 #ifdef HAVE_BIO_RW_SYNCIO
 # define DIO_RW_SYNCIO			BIO_RW_SYNCIO
 #else
 # define DIO_RW_SYNCIO			BIO_RW_SYNC
 #endif /* HAVE_BIO_RW_SYNCIO */
 /*
  * 2.6.28 API change
  *
  * vdev_bdev_open()/vdev_bdev_close() map onto the *_exclusive() helpers
  * when HAVE_OPEN_BDEV_EXCLUSIVE is defined and onto the *_excl() helpers
  * otherwise.  In the fallback case the 'md' argument to vdev_bdev_close()
  * is dropped and therefore never evaluated.
  */
 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
 # define vdev_bdev_open(path, md, hld)	open_bdev_exclusive(path, md, hld)
 # define vdev_bdev_close(bdev, md)	close_bdev_exclusive(bdev, md)
 #else
 # define vdev_bdev_open(path, md, hld)	open_bdev_excl(path, md, hld)
 # define vdev_bdev_close(bdev, md)	close_bdev_excl(bdev)
 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
 /*
  * 2.6.22 API change
  *
  * vdev_bdev_invalidate() calls invalidate_bdev() with one argument when
  * HAVE_1ARG_INVALIDATE_BDEV is defined; otherwise it passes 1 as the second
  * argument.
  */
 #ifdef HAVE_1ARG_INVALIDATE_BDEV
 # define vdev_bdev_invalidate(bdev)	invalidate_bdev(bdev)
 #else
 # define vdev_bdev_invalidate(bdev)	invalidate_bdev(bdev, 1)
 #endif /* HAVE_1ARG_INVALIDATE_BDEV */
 /*
  * 2.6.30 API change
  *
  * vdev_bdev_block_size() resolves to bdev_logical_block_size() when
  * HAVE_BDEV_LOGICAL_BLOCK_SIZE is defined and to bdev_hardsect_size()
  * otherwise.
  */
 #ifdef HAVE_BDEV_LOGICAL_BLOCK_SIZE
 # define vdev_bdev_block_size(bdev)	bdev_logical_block_size(bdev)
 #else
 # define vdev_bdev_block_size(bdev)	bdev_hardsect_size(bdev)
 #endif
 #endif /* _KERNEL */
 #ifdef	__cplusplus
 }
 #endif
 #endif	/* _SYS_VDEV_DISK_H */
--- a/module/zfs/include/sys/zfs_fuid.h
+++ b/module/zfs/include/sys/zfs_fuid.h
@ -33,6 +33,7 @@
 #include <sys/zfs_vfsops.h>
 #endif
 #include <sys/avl.h>
 #include <sys/list.h>
 #ifdef	__cplusplus
 extern "C" {
@ -98,6 +99,7 @@ typedef struct zfs_fuid_info {
 } zfs_fuid_info_t;
 #ifdef _KERNEL
 #ifdef HAVE_ZPL
 struct znode;
 extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t);
 extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t,
@ -117,6 +119,7 @@ extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain,
    char **retdomain, boolean_t addok);
 extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx);
 extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 #endif /* HAVE_ZPL */
 #endif
 char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t);
--- a/module/zfs/include/sys/zfs_ioctl.h
+++ b/module/zfs/include/sys/zfs_ioctl.h
@ -316,7 +316,6 @@ extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from,
    const char *to, cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int zfs_busy(void);
 extern int zfs_unmount_snap(const char *, void *);
 enum zfsdev_state_type {
--- a/module/zfs/include/sys/zfs_znode.h
+++ b/module/zfs/include/sys/zfs_znode.h
@ -343,8 +343,10 @@ extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
 extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 #if defined(HAVE_UIO_RW)
 extern caddr_t zfs_map_page(page_t *, enum seg_rw);
 extern void zfs_unmap_page(page_t *, caddr_t);
 #endif /* HAVE_UIO_RW */
 extern zil_get_data_t zfs_get_data;
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
--- a/module/zfs/include/sys/zvol.h
+++ b/module/zfs/include/sys/zvol.h
@ -28,49 +28,26 @@
 #include <sys/zfs_context.h>
 #ifdef	__cplusplus
 extern "C" {
 #endif
 #define	ZVOL_OBJ		1ULL
 #define	ZVOL_ZAP_OBJ		2ULL
 #ifdef _KERNEL
 #include <sys/blkdev.h>
 extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
 extern int zvol_check_volblocksize(uint64_t volblocksize);
 extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
 extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
 extern int zvol_create_minor(const char *);
 extern int zvol_create_minors(const char *);
 extern int zvol_remove_minor(const char *);
 extern void zvol_remove_minors(const char *);
-extern int zvol_set_volsize(const char *, major_t, uint64_t);
+extern int zvol_set_volsize(const char *, uint64_t);
 extern int zvol_set_volblocksize(const char *, uint64_t);
-extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
+extern int zvol_init(void);
 extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
 extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
 extern int zvol_strategy(buf_t *bp);
 extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
 extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
 extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
 extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
 extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
    int *rvalp);
 extern int zvol_busy(void);
 extern void zvol_init(void);
 extern void zvol_fini(void);
-extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+#endif /* _KERNEL */
-    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+#endif /* _SYS_ZVOL_H */
    void **rl_hdl, void **bonus_hdl);
 extern uint64_t zvol_get_volume_size(void *minor_hdl);
 extern int zvol_get_volume_wce(void *minor_hdl);
 extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
    ssize_t resid, boolean_t sync);
 #endif
 #ifdef	__cplusplus
 }
 #endif
 #endif	/* _SYS_ZVOL_H */
--- a/module/zfs/rrwlock.c
+++ b/module/zfs/rrwlock.c
@ -23,6 +23,8 @@
 * Use is subject to license terms.
 */
 #ifdef HAVE_ZPL
 #include <sys/refcount.h>
 #include <sys/rrwlock.h>
@ -262,3 +264,4 @@ rrw_held(rrwlock_t *rrl, krw_t rw)
 	return (held);
 }
 #endif /* HAVE_ZPL */
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@ -1510,12 +1510,6 @@ spa_name_compare(const void *a1, const void *a2)
 	return (0);
 }
 int
 spa_busy(void)
 {
 	return (spa_active_count);
 }
 void
 spa_boot_init(void)
 {
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@ -1069,6 +1069,15 @@ vdev_open_child(void *arg)
 boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 /*
 * Stacking zpools on top of zvols is unsupported until we implement a method
 * for determining if an arbitrary block device is a zvol without using the
 * path.  Solaris would check the 'zvol' path component but this does not
 * exist in the Linux port, so we really should do something like stat the
 * file and check the major number.  This is complicated by the fact that
 * we need to do this portably in user or kernel space.
 */
 #if 0
 	int c;
 	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
@ -1077,6 +1086,7 @@ vdev_uses_zvols(vdev_t *vd)
 	for (c = 0; c < vd->vdev_children; c++)
 		if (vdev_uses_zvols(vd->vdev_child[c]))
 			return (B_TRUE);
 #endif
 	return (B_FALSE);
 }
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@ -0,0 +1,654 @@
 /*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
 /*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * LLNL-CODE-403049.
 */
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 /*
 * Virtual device vector for disks.
 */
 /*
  * Tracks one vdev IO that may be split across several bio's.  The structure
  * is allocated with room for dr_bio_count trailing bio pointers (see
  * vdev_disk_dio_alloc()), so dr_bio is declared as a C99 flexible array
  * member; sizeof(dio_request_t) does not include the trailing array.
  */
 typedef struct dio_request {
 	struct completion	dr_comp;	/* Completion for sync IO */
 	atomic_t		dr_ref;		/* References */
 	zio_t			*dr_zio;	/* Parent ZIO */
 	int			dr_rw;		/* Read/Write */
 	int			dr_error;	/* Bio error */
 	int			dr_bio_count;	/* Count of bio's */
 	struct bio		*dr_bio[];	/* Attached bio's */
 } dio_request_t;
 #ifdef HAVE_OPEN_BDEV_EXCLUSIVE
 /*
  * Translate FREAD/FWRITE open flags into the matching FMODE_READ and
  * FMODE_WRITE bits.  At least one of FREAD or FWRITE must be set.
  */
 static fmode_t
 vdev_bdev_mode(int smode)
 {
 	fmode_t mode;
 	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
 	mode = (smode & FREAD) ? FMODE_READ : 0;
 	if (smode & FWRITE)
 		mode |= FMODE_WRITE;
 	return mode;
 }
 #else
 /*
  * Translate FREAD/FWRITE open flags into an open mode: MS_RDONLY when FREAD
  * is set without FWRITE, 0 otherwise.  At least one of FREAD or FWRITE must
  * be set.
  */
 static int
 vdev_bdev_mode(int smode)
 {
 	int read_only;
 	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
 	read_only = (smode & FREAD) && !(smode & FWRITE);
 	return read_only ? MS_RDONLY : 0;
 }
 #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
 static uint64_t
 bdev_capacity(struct block_device *bdev)
 {
 	struct hd_struct *part = bdev->bd_part;
 	/* The partition capacity referenced by the block device */
 	if (part)
 	       return part->nr_sects;
 	/* Otherwise assume the full device capacity */
 	return get_capacity(bdev->bd_disk);
 }
 /*
  * Open the block device named by v->vdev_path and record it in a newly
  * allocated vdev_disk_t stored in v->vdev_tsd.
  *
  * On success returns 0 and sets *psize to the device size in bytes and
  * *ashift to log2 of the larger of the device block size and
  * SPA_MINBLOCKSIZE.  Returns EINVAL if the path is missing or not absolute,
  * ENOMEM if the private state cannot be allocated, or the positive errno
  * derived from the failed open.
  */
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
 	struct block_device *bdev;
 	vdev_disk_t *vd;
 	int mode, block_size;
 	/* Must have a pathname and it must be absolute. */
 	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
 		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return EINVAL;
 	}
 	vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
 	if (vd == NULL)
 		return ENOMEM;
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
 	 * then drives may be recabled without an issue.  If the provided
 	 * path is a udev by-path path then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
 	 * locations to maximize the system's tolerance to component failure.
 	 * Alternately you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
 	 * /dev/[hs]d devices which may be reordered due to probing order.
 	 * Devices in the wrong locations will be detected by the higher
 	 * level vdev validation.
 	 */
 	mode = spa_mode(v->vdev_spa);
 	/* vd doubles as the holder passed to the exclusive open. */
 	bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
 	if (IS_ERR(bdev)) {
 		kmem_free(vd, sizeof(vdev_disk_t));
 		/* PTR_ERR() is negative; callers expect a positive errno. */
 		return -PTR_ERR(bdev);
 	}
 	v->vdev_tsd = vd;
 	vd->vd_bdev = bdev;
 	block_size = vdev_bdev_block_size(bdev);
 	/* Check if this is a whole device.  When bdev->bd_contains ==
 	 * bdev we have a whole device and not simply a partition. */
 	v->vdev_wholedisk = !!(bdev->bd_contains == bdev);
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;
 	/*
 	 * Physical volume size in bytes.  The kernel reports capacity in
 	 * 512-byte sectors regardless of the device's logical block size,
 	 * so convert with a fixed shift rather than multiplying by
 	 * block_size, which would overstate the size of devices whose
 	 * logical block size is larger than 512 bytes.
 	 */
 	*psize = bdev_capacity(bdev) << 9;
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 	return 0;
 }
 /*
  * Release the private disk state attached to a vdev: close the block device
  * if one is recorded, free the vdev_disk_t and clear v->vdev_tsd.  Does
  * nothing when no private state is attached.
  */
 static void
 vdev_disk_close(vdev_t *v)
 {
 	vdev_disk_t *disk = v->vdev_tsd;
 	if (disk != NULL) {
 		if (disk->vd_bdev != NULL)
 			vdev_bdev_close(disk->vd_bdev,
 			                vdev_bdev_mode(spa_mode(v->vdev_spa)));
 		kmem_free(disk, sizeof(vdev_disk_t));
 		v->vdev_tsd = NULL;
 	}
 }
 /*
  * Allocate a dio_request_t with room for bio_count trailing bio pointers.
  * The completion is initialized, the reference count and error start at
  * zero and every bio slot starts out NULL.  Returns NULL if the allocation
  * fails.
  */
 static dio_request_t *
 vdev_disk_dio_alloc(int bio_count)
 {
 	dio_request_t *req;
 	int slot;
 	req = kmem_zalloc(sizeof(dio_request_t) +
 	                  sizeof(struct bio *) * bio_count, KM_SLEEP);
 	if (req == NULL)
 		return req;
 	init_completion(&req->dr_comp);
 	atomic_set(&req->dr_ref, 0);
 	req->dr_bio_count = bio_count;
 	req->dr_error = 0;
 	for (slot = 0; slot < req->dr_bio_count; slot++)
 		req->dr_bio[slot] = NULL;
 	return req;
 }
 /*
  * Drop the bio reference held in each populated slot with bio_put() and
  * then free the dio_request_t together with its trailing pointer array.
  */
 static void
 vdev_disk_dio_free(dio_request_t *dr)
 {
 	int slot = 0;
 	while (slot < dr->dr_bio_count) {
 		if (dr->dr_bio[slot] != NULL)
 			bio_put(dr->dr_bio[slot]);
 		slot++;
 	}
 	/* The size must match what vdev_disk_dio_alloc() requested. */
 	kmem_free(dr, sizeof(dio_request_t) +
 	          sizeof(struct bio *) * dr->dr_bio_count);
 }
 /*
  * Take one reference on the dio_request by atomically incrementing dr_ref.
  * Each reference is released with vdev_disk_dio_put().
  */
 static void
 vdev_disk_dio_get(dio_request_t *dr)
 {
 	atomic_inc(&dr->dr_ref);
 }
 /*
  * Drop one reference on the dio_request and return the number remaining.
  * When the last reference goes away the dio_request is freed and, if a
  * parent zio is attached, its io_error is set and zio_interrupt() is called
  * exactly once.
  */
 static int
 vdev_disk_dio_put(dio_request_t *dr)
 {
 	zio_t *parent;
 	int status;
 	int remaining = atomic_dec_return(&dr->dr_ref);
 	if (remaining != 0)
 		return remaining;
 	/* Snapshot what is still needed; dr is gone after the free below. */
 	parent = dr->dr_zio;
 	status = dr->dr_error;
 	vdev_disk_dio_free(dr);
 	if (parent) {
 		parent->io_error = status;
 		zio_interrupt(parent);
 	}
 	return remaining;
 }
 BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, error)
 {
 	dio_request_t *dr = bio->bi_private;
 	int rc;
 	/* Fatal error but print some useful debugging before asserting */
 	if (dr == NULL)
 		PANIC("dr == NULL, bio->bi_private == NULL\n"
 		    "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
 		    "bi_idx: %d, bi_size: %d, bi_end_io: %p, bi_cnt: %d\n",
 		    bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
 		    bio->bi_idx, bio->bi_size, bio->bi_end_io,
 		    atomic_read(&bio->bi_cnt));
 #ifndef HAVE_2ARGS_BIO_END_IO_T
 	if (bio->bi_size)
 		return 1;
 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = EIO;
 	if (dr->dr_error == 0)
 		dr->dr_error = error;
 	/* Drop reference aquired by __vdev_disk_physio */
 	rc = vdev_disk_dio_put(dr);
 	/* Wake up synchronous waiter this is the last outstanding bio */
 	if ((rc == 1) && (dr->dr_rw & (1 << DIO_RW_SYNCIO)))
 		complete(&dr->dr_comp);
 	BIO_END_IO_RETURN(0);
 }
 /*
  * Number of pages touched by the buffer [bio_ptr, bio_ptr + bio_size):
  * the page index just past the rounded-up end address minus the page index
  * of the start address.
  */
 static inline unsigned long
 bio_nr_pages(void *bio_ptr, unsigned int bio_size)
 {
 	unsigned long start = (unsigned long)bio_ptr;
 	unsigned long first_page = start >> PAGE_SHIFT;
 	unsigned long end_page = (start + bio_size + PAGE_SIZE - 1) >>
 	        PAGE_SHIFT;
 	return (end_page - first_page);
 }
 /*
  * Add the pages backing [bio_ptr, bio_ptr + bio_size) to the bio, one
  * page-sized piece at a time, until the buffer is exhausted, the bio has
  * used bi_max_vecs entries or bio_add_page() accepts less than requested.
  * Returns the number of bytes that could not be added.
  */
 static unsigned int
 bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
 {
 	unsigned int page_off = offset_in_page(bio_ptr);
 	unsigned int vec;
 	for (vec = 0; vec < bio->bi_max_vecs && bio_size != 0; vec++) {
 		struct page *pg;
 		/* Bytes left in the current page, capped by what remains. */
 		unsigned int chunk = PAGE_SIZE - page_off;
 		if (chunk > bio_size)
 			chunk = bio_size;
 		/* kmem_virt() selects which address-to-page lookup applies. */
 		if (kmem_virt(bio_ptr))
 			pg = vmalloc_to_page(bio_ptr);
 		else
 			pg = virt_to_page(bio_ptr);
 		if (bio_add_page(bio, pg, chunk, page_off) != chunk)
 			break;
 		bio_ptr = (char *)bio_ptr + chunk;
 		bio_size -= chunk;
 		/* Every page after the first is consumed from its start. */
 		page_off = 0;
 	}
 	return bio_size;
 }
 /*
  * Build and submit the bio(s) needed to transfer 'kbuf_size' bytes between
  * the buffer 'kbuf_ptr' and 'bdev', starting at byte offset 'kbuf_offset'.
  *
  * All bios belong to a single reference counted dio_request.  Each bio
  * takes one reference, dropped by vdev_disk_physio_completion(), and one
  * additional reference is held by this function while submitting.
  *
  * 'zio' is stored in the dio_request and may be NULL (vdev_disk_physio()
  * passes NULL).  'flags' becomes the dio_request's dr_rw and is used as the
  * rw argument for every bio.
  *
  * When DIO_RW_SYNCIO is set in 'flags' the call blocks until the request
  * is completed and returns the error recorded in the dio_request.
  * Otherwise it returns 0 once the bios have been submitted.  ENOMEM is
  * returned if the dio_request or any bio cannot be allocated.
  */
 static int
 __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
                   size_t kbuf_size, uint64_t kbuf_offset, int flags)
 {
        dio_request_t *dr;
 	caddr_t bio_ptr;
 	uint64_t bio_offset;
 	/* bio_count is the initial dio capacity, doubled on each retry */
 	int bio_size, bio_count = 16;
 	int i = 0, error = 0, block_size;
 	/* Construction restarts from here whenever the dio proves too small */
 retry:
 	dr = vdev_disk_dio_alloc(bio_count);
 	if (dr == NULL)
 		return ENOMEM;
 	dr->dr_zio = zio;
 	dr->dr_rw = flags;
 	block_size = vdev_bdev_block_size(bdev);
 #ifdef BIO_RW_FAILFAST
 	/* dr_rw was just set to 'flags', so this re-asserts a bit it has */
 	if (flags & (1 << BIO_RW_FAILFAST))
 		dr->dr_rw |= 1 << BIO_RW_FAILFAST;
 #endif /* BIO_RW_FAILFAST */
 	/*
 	 * When the IO size exceeds the maximum bio size for the request
 	 * queue we are forced to break the IO in multiple bio's and wait
 	 * for them all to complete.  Ideally, all pool users will set
 	 * their volume block size to match the maximum request size and
 	 * the common case will be one bio per vdev IO request.
 	 */
 	bio_ptr    = kbuf_ptr;
 	bio_offset = kbuf_offset;
 	/* NOTE(review): size_t kbuf_size is narrowed to int here */
 	bio_size   = kbuf_size;
 	/*
 	 * The bound is inclusive so the i == dr_bio_count iteration can
 	 * notice that data remains after every bio slot has been used.
 	 */
 	for (i = 0; i <= dr->dr_bio_count; i++) {
 		/* Finished constructing bio's for given buffer */
 		if (bio_size <= 0)
 			break;
 		/*
 		 * By default only 'bio_count' bio's per dio are allowed.
 		 * However, if we find ourselves in a situation where more
 		 * are needed we allocate a larger dio and warn the user.
 		 */
 		if (dr->dr_bio_count == i) {
 			vdev_disk_dio_free(dr);
 			bio_count *= 2;
 			printk("WARNING: Resized bio's/dio to %d\n",bio_count);
 			goto retry;
 		}
 		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
 		                          bio_nr_pages(bio_ptr, bio_size));
 		if (dr->dr_bio[i] == NULL) {
 			vdev_disk_dio_free(dr);
 			return ENOMEM;
 		}
 		/* Matching put called by vdev_disk_physio_completion */
 		vdev_disk_dio_get(dr);
 		dr->dr_bio[i]->bi_bdev = bdev;
 		/* Byte offset converted to units of the device block size */
 		dr->dr_bio[i]->bi_sector = bio_offset / block_size;
 		dr->dr_bio[i]->bi_rw = dr->dr_rw;
 		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
 		/* Remaining size is returned to become the new size */
 		bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size);
 		/* Advance in buffer and construct another bio if needed */
 		bio_ptr    += dr->dr_bio[i]->bi_size;
 		bio_offset += dr->dr_bio[i]->bi_size;
 	}
 	/* Extra reference to protect dio_request during submit_bio */
 	vdev_disk_dio_get(dr);
 	/* Submit all bio's associated with this dio */
 	for (i = 0; i < dr->dr_bio_count; i++)
 		if (dr->dr_bio[i])
 			submit_bio(dr->dr_rw, dr->dr_bio[i]);
 	/*
 	 * On synchronous blocking requests we wait for all of the bio
 	 * completion callbacks to run.  We are woken when the last callback
 	 * runs for this dio.  We are then responsible for putting the last
 	 * dio_request reference, which in turn puts back the last bio
 	 * references.  The only synchronous consumer is
 	 * vdev_disk_read_rootlabel(); all other IO originating from
 	 * vdev_disk_io_start() is asynchronous.
 	 *
 	 * NOTE(review): if no bio was built (kbuf_size of zero) nothing in
 	 * this function signals dr_comp; confirm synchronous callers never
 	 * pass an empty buffer.
 	 */
 	if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
 		wait_for_completion(&dr->dr_comp);
 		error = dr->dr_error;
 		ASSERT3S(atomic_read(&dr->dr_ref), ==, 1);
 	}
 	/* Drop the submission reference taken above */
 	(void)vdev_disk_dio_put(dr);
 	return error;
 }
 /*
  * Perform an IO against 'bdev' that is not associated with any zio.
  * This simply forwards to __vdev_disk_physio() with a NULL zio and hands
  * back whatever it returns.
  */
 int
 vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
 		 size_t size, uint64_t offset, int flags)
 {
 	zio_t *no_zio = NULL;
 	int error;

 	error = __vdev_disk_physio(bdev, no_zio, kbuf, size, offset, flags);

 	return error;
 }
 /* 2.6.24 API change */
 #ifdef HAVE_BIO_EMPTY_BARRIER
 /*
  * Completion callback for the empty barrier bio issued by
  * vdev_disk_io_flush().  The zio is carried in bi_private; the result
  * code 'rc' is negated and stored as the zio's error before the zio is
  * handed to zio_interrupt().
  */
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
 {
 	zio_t *zio = bio->bi_private;

 	/* Remember that this vdev rejected the flush request */
 	if (rc == -EOPNOTSUPP)
 		zio->io_vd->vdev_nowritecache = B_TRUE;

 	zio->io_error = -rc;

 	/* Release the bio allocated by vdev_disk_io_flush() */
 	bio_put(bio);
 	zio_interrupt(zio);

 	BIO_END_IO_RETURN(0);
 }
 /*
  * Ask 'bdev' to flush its write cache by submitting an empty barrier bio.
  * The result is delivered asynchronously to 'zio' through
  * vdev_disk_io_flush_completion().
  *
  * Returns 0 if the bio was submitted, ENXIO if the device has no request
  * queue, or ENOMEM if the bio could not be allocated.
  */
 static int
 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 {
 	struct request_queue *q;
 	struct bio *bio;

 	q = bdev_get_queue(bdev);
 	if (!q)
 		return ENXIO;

 	/*
 	 * This runs in the IO path, so the allocation must not be allowed
 	 * to start new IO to satisfy itself.  GFP_NOIO matches the bio
 	 * allocations made by __vdev_disk_physio().
 	 */
 	bio = bio_alloc(GFP_NOIO, 0);
 	if (!bio)
 		return ENOMEM;

 	bio->bi_end_io = vdev_disk_io_flush_completion;
 	bio->bi_private = zio;
 	bio->bi_bdev = bdev;
 	submit_bio(WRITE_BARRIER, bio);

 	return 0;
 }
 #else
 /*
  * Variant built when HAVE_BIO_EMPTY_BARRIER is not defined: no flush is
  * attempted and ENOTSUP is always returned.  Both arguments are unused.
  */
 static int
 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 {
 	return ENOTSUP;
 }
 #endif /* HAVE_BIO_EMPTY_BARRIER */
 /*
  * Start the IO described by 'zio' on its disk vdev.
  *
  * ZIO_TYPE_IOCTL: only DKIOCFLUSHWRITECACHE is handled; it is forwarded
  * to vdev_disk_io_flush() unless flushing is disabled or known to be
  * unsupported.  Any other command fails with ENOTSUP.
  * ZIO_TYPE_READ / ZIO_TYPE_WRITE: handed to __vdev_disk_physio().
  * Any other type fails with ENOTSUP.
  *
  * Returns ZIO_PIPELINE_STOP when the IO was handed off and will complete
  * asynchronously, or ZIO_PIPELINE_CONTINUE when it finished (or failed)
  * immediately, in which case zio->io_error holds the result.
  */
 static int
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	int flags, error;

 	switch (zio->io_type) {
 	case ZIO_TYPE_IOCTL:
 		if (!vdev_readable(v)) {
 			zio->io_error = ENXIO;
 			return ZIO_PIPELINE_CONTINUE;
 		}

 		switch (zio->io_cmd) {
 		case DKIOCFLUSHWRITECACHE:
 			/* Flushing administratively disabled, report success */
 			if (zfs_nocacheflush)
 				break;

 			/* A previous flush already showed it is unsupported */
 			if (v->vdev_nowritecache) {
 				zio->io_error = ENOTSUP;
 				break;
 			}

 			error = vdev_disk_io_flush(vd->vd_bdev, zio);
 			if (error == 0)
 				return ZIO_PIPELINE_STOP;

 			zio->io_error = error;
 			if (error == ENOTSUP)
 				v->vdev_nowritecache = B_TRUE;
 			break;

 		default:
 			zio->io_error = ENOTSUP;
 		}

 		return ZIO_PIPELINE_CONTINUE;

 	case ZIO_TYPE_WRITE:
 		flags = WRITE;
 		break;

 	case ZIO_TYPE_READ:
 		flags = READ;
 		break;

 	default:
 		zio->io_error = ENOTSUP;
 		return ZIO_PIPELINE_CONTINUE;
 	}

 #ifdef BIO_RW_FAILFAST
 	/*
 	 * Only first attempts are marked failfast so a bad device is
 	 * detected quickly.  An IO that is already a retry, or that was
 	 * asked to try hard, must be given every chance to succeed and is
 	 * therefore issued without the failfast hint.
 	 */
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
 		flags |= (1 << BIO_RW_FAILFAST);
 #endif /* BIO_RW_FAILFAST */

 	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
 		                   zio->io_size, zio->io_offset, flags);
 	if (error) {
 		zio->io_error = error;
 		return ZIO_PIPELINE_CONTINUE;
 	}

 	return ZIO_PIPELINE_STOP;
 }
 /*
  * Post-IO hook.  Only an EIO result is of interest: the media is then
  * revalidated and, if it is found to have changed, the block device is
  * invalidated and asynchronous removal of the vdev is requested.
  */
 static void
 vdev_disk_io_done(zio_t *zio)
 {
 	vdev_t *v;
 	vdev_disk_t *vd;

 	/* Anything other than EIO needs no further handling here */
 	if (zio->io_error != EIO)
 		return;

 	v = zio->io_vd;
 	vd = v->vdev_tsd;

 	/* Media unchanged, leave the device in the configuration */
 	if (!check_disk_change(vd->vd_bdev))
 		return;

 	vdev_bdev_invalidate(vd->vd_bdev);
 	v->vdev_remove_wanted = B_TRUE;
 	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 }
 /*
  * Hold hook for disk vdevs.  The vnode lookup is not implemented yet, so
  * for a never-opened vdev with an absolute path this only clears
  * vdev_name_vp and vdev_devid_vp.  In every other case nothing is done.
  */
 static void
 vdev_disk_hold(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

 	/*
 	 * Act only when we have an absolute pathname and the device has
 	 * never been opened (no vdev specific data attached yet).
 	 */
 	if (vd->vdev_path != NULL && vd->vdev_path[0] == '/' &&
 	    vd->vdev_tsd == NULL) {
 		/* XXX: Implement me as a vnode lookup for the device */
 		vd->vdev_name_vp = NULL;
 		vd->vdev_devid_vp = NULL;
 	}
 }
 /*
  * Release hook for disk vdevs, the counterpart of vdev_disk_hold().
  * Nothing is released yet; the function only asserts that the caller
  * holds SCL_STATE as writer.
  */
 static void
 vdev_disk_rele(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 	/* XXX: Implement me as a vnode rele for the device */
 }
 /*
  * Operations vector for disk vdevs.  The initializer is positional, so
  * the entries must stay in the member order of vdev_ops_t.
  */
 vdev_ops_t vdev_disk_ops = {
 	vdev_disk_open,
 	vdev_disk_close,
 	vdev_default_asize,
 	vdev_disk_io_start,
 	vdev_disk_io_done,
 	/* NOTE(review): unused slot; confirm which vdev_ops_t member it is */
 	NULL,
 	vdev_disk_hold,
 	vdev_disk_rele,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
 /*
  * Given the root disk device pathname, read the label from the device
  * and construct a configuration nvlist.  'devid' is currently unused.
  *
  * Each of the VDEV_LABELS label copies is tried in turn.  The first one
  * which reads successfully, unpacks, has a pool state below
  * POOL_STATE_DESTROYED and a non-zero pool txg is returned in '*config'.
  * The caller owns the returned nvlist.
  *
  * Returns 0 with '*config' set on success.  On failure '*config' is NULL
  * and a positive errno is returned: the open error, EIO for a zero sized
  * device, or EIDRM when no usable label was found.
  */
 int
 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 {
 	struct block_device *bdev;
 	vdev_label_t *label;
 	uint64_t s, size;
 	int i;

 	/* Never hand back an uninitialized pointer on any failure path */
 	*config = NULL;

 	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL);
 	if (IS_ERR(bdev))
 		return -PTR_ERR(bdev);

 	s = bdev_capacity(bdev) * vdev_bdev_block_size(bdev);
 	if (s == 0) {
 		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
 		return EIO;
 	}

 	size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
 	label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);

 	for (i = 0; i < VDEV_LABELS; i++) {
 		uint64_t offset, state, txg = 0;

 		/* read vdev label */
 		offset = vdev_label_offset(size, i, 0);
 		if (vdev_disk_physio(bdev, (caddr_t)label,
 		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0)
 			continue;

 		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
 		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
 			*config = NULL;
 			continue;
 		}

 		/* Skip labels without a state or from a destroyed pool */
 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
 			nvlist_free(*config);
 			*config = NULL;
 			continue;
 		}

 		/* Skip labels which do not record a non-zero txg */
 		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
 		    &txg) != 0 || txg == 0) {
 			nvlist_free(*config);
 			*config = NULL;
 			continue;
 		}

 		break;
 	}

 	vmem_free(label, sizeof(vdev_label_t));
 	vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));

 	/* Report failure instead of success with no configuration */
 	if (*config == NULL)
 		return EIDRM;

 	return 0;
 }
--- a/module/zfs/zfs_acl.c
+++ b/module/zfs/zfs_acl.c
@ -22,6 +22,8 @@
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */
 #ifdef HAVE_ZPL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
@ -2746,3 +2748,5 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
 	return (error);
 }
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@ -63,6 +63,8 @@
 * so that it cannot be freed until all snapshots have been unmounted.
 */
 #ifdef HAVE_ZPL
 #include <fs/fs_subr.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
@ -1347,3 +1349,4 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 	return (error);
 }
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@ -22,6 +22,8 @@
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */
 #ifdef HAVE_ZPL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
@ -1087,3 +1089,4 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 	else
 		return (secpolicy_vnode_remove(cr));
 }
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_fuid.c
+++ b/module/zfs/zfs_fuid.c
@ -192,6 +192,7 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
 }
 #ifdef _KERNEL
 #ifdef HAVE_ZPL
 /*
 * Load the fuid table(s) into memory.
 */
@ -753,4 +754,5 @@ zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 		    FUID_SIZE_ESTIMATE(zfsvfs));
 	}
 }
 #endif /* HAVE_ZPL */
 #endif
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@ -1292,6 +1292,9 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
 			error = err;
 	}
 	if (error == 0)
 		zvol_create_minors(zc->zc_name);
 	nvlist_free(config);
 	if (props)
@ -2179,8 +2182,7 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source,
 		err = dsl_dataset_set_reservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_VOLSIZE:
-		err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
+		err = zvol_set_volsize(dsname, intval);
 		    intval);
 		break;
 	case ZFS_PROP_VERSION:
 	{
@ -2652,6 +2654,30 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
 	return (error);
 }
 /*
  * inputs:
  * zc_name              name of volume
  *
  * outputs:             none
  *
  * Passes zc_name to zvol_create_minor() and returns its result.
  */
 static int
 zfs_ioc_create_minor(zfs_cmd_t *zc)
 {
 	int error = zvol_create_minor(zc->zc_name);

 	return (error);
 }
 /*
  * inputs:
  * zc_name              name of volume
  *
  * outputs:             none
  *
  * Passes zc_name to zvol_remove_minor() and returns its result.
  */
 static int
 zfs_ioc_remove_minor(zfs_cmd_t *zc)
 {
 	int error = zvol_remove_minor(zc->zc_name);

 	return (error);
 }
 /*
 * inputs:
 * zc_name		name of filesystem
@ -4805,6 +4831,10 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
 	    POOL_CHECK_SUSPENDED },
 	{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
 	{ zfs_ioc_create_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
 	    POOL_CHECK_NONE },
 	{ zfs_ioc_remove_minor, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
 	    POOL_CHECK_NONE },
 	{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE,
 	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY },
 	{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@ -22,6 +22,8 @@
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */
 #ifdef HAVE_ZPL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@ -674,3 +676,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
 	itx->itx_sync = (zp->z_sync_cnt != 0);
 	zil_itx_assign(zilog, itx, tx);
 }
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_replay.c
+++ b/module/zfs/zfs_replay.c
@ -22,6 +22,8 @@
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */
 #ifdef HAVE_ZPL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
@ -929,3 +931,4 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
 	zfs_replay_create_acl,	/* TX_MKDIR_ACL_ATTR */
 	zfs_replay_write2,	/* TX_WRITE2 */
 };
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_vfsops.c
+++ b/module/zfs/zfs_vfsops.c
@ -65,6 +65,7 @@
 #include <sys/sa.h>
 #include "zfs_comutil.h"
 #ifdef HAVE_ZPL
 int zfsfstype;
 vfsops_t *zfs_vfsops = NULL;
 static major_t zfs_major;
@ -2127,10 +2128,12 @@ zfs_vfsinit(int fstype, char *name)
 	return (0);
 }
 #endif /* HAVE_ZPL */
 void
 zfs_init(void)
 {
 #ifdef HAVE_ZPL
 	/*
 	 * Initialize .zfs directory structures
 	 */
@ -2142,21 +2145,19 @@ zfs_init(void)
 	zfs_znode_init();
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 #endif /* HAVE_ZPL */
 }
 /*
  * When HAVE_ZPL is defined this calls zfsctl_fini() and zfs_znode_fini();
  * otherwise it does nothing.
  */
 void
 zfs_fini(void)
 {
 #ifdef HAVE_ZPL
 	zfsctl_fini();
 	zfs_znode_fini();
 #endif /* HAVE_ZPL */
 }
-int
+#ifdef HAVE_ZPL
 zfs_busy(void)
 {
 	return (zfs_active_fs_count != 0);
 }
 int
 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
@ -2224,6 +2225,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 	return (0);
 }
 #endif /* HAVE_ZPL */
 /*
 * Read a property stored within the master node.
@ -2267,6 +2269,7 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 	return (error);
 }
 #ifdef HAVE_ZPL
 static vfsdef_t vfw = {
 	VFSDEF_VERSION,
 	MNTTYPE_ZFS,
@ -2279,3 +2282,4 @@ static vfsdef_t vfw = {
 struct modlfs zfs_modlfs = {
 	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
 };
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@ -25,6 +25,8 @@
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 #ifdef HAVE_ZPL
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
@ -319,6 +321,7 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
 	return (ENOTTY);
 }
 #if defined(_KERNEL) && defined(HAVE_UIO_RW)
 /*
 * Utility functions to map and unmap a single physical page.  These
 * are used to manage the mappable copies of ZFS file data, and therefore
@ -343,6 +346,7 @@ zfs_unmap_page(page_t *pp, caddr_t addr)
 		ppmapout(addr);
 	}
 }
 #endif /* _KERNEL && HAVE_UIO_RW */
 /*
 * When a file is memory mapped, we must keep the IO data synchronized
@ -5241,3 +5245,4 @@ const fs_operation_def_t zfs_evnodeops_template[] = {
 	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
 	NULL,			NULL
 };
 #endif /* HAVE_ZPL */
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@ -88,6 +88,7 @@
 * (such as VFS logic) that will not compile easily in userland.
 */
 #ifdef _KERNEL
 #ifdef HAVE_ZPL
 /*
 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
 * be freed before it can be safely accessed.
@ -1737,22 +1738,29 @@ log:
 	dmu_tx_commit(tx);
 	return (0);
 }
 #endif /* HAVE_ZPL */
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
 	zfsvfs_t	zfsvfs;
 	uint64_t	moid, obj, sa_obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
 	int		error;
 #ifdef HAVE_ZPL
 	zfsvfs_t	zfsvfs;
 	int		i;
 	znode_t		*rootzp = NULL;
 	vnode_t		*vp;
 	vattr_t		vattr;
 	znode_t		*zp;
 	zfs_acl_ids_t	acl_ids;
 #else
 	timestruc_t	now;
 	dmu_buf_t	*db;
 	znode_phys_t	*pzp;
 #endif /* HAVE_ZPL */
 	/*
 	 * First attempt to create master node.
@ -1814,6 +1822,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
 	ASSERT(error == 0);
 #ifdef HAVE_ZPL
 	/*
 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
 	 * to allow zfs_mknode to work.
@ -1879,17 +1888,49 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 	ZTOV(rootzp)->v_count = 0;
 	sa_handle_destroy(rootzp->z_sa_hdl);
 	kmem_cache_free(znode_cache, rootzp);
 	/*
 	 * Create shares directory
 	 */
 	error = zfs_create_share_dir(&zfsvfs, tx);
 	ASSERT(error == 0);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
 #else
 	/*
 	 * Create root znode with code free of VFS dependencies
 	 */
 	obj = zap_create_norm(os, norm, DMU_OT_DIRECTORY_CONTENTS,
 	                      DMU_OT_ZNODE, sizeof (znode_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(os, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	/*
 	 * Initialize the znode physical data to zero.
 	 */
 	ASSERT(db->db_size >= sizeof (znode_phys_t));
 	bzero(db->db_data, db->db_size);
 	pzp = db->db_data;
 	if (USE_FUIDS(version, os))
 		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
 	pzp->zp_size = 2; /* "." and ".." */
 	pzp->zp_links = 2;
 	pzp->zp_parent = obj;
 	pzp->zp_gen = dmu_tx_get_txg(tx);
 	pzp->zp_mode = S_IFDIR | 0755;
 	pzp->zp_flags = ZFS_ACL_TRIVIAL;
 	gethrestime(&now);
 	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
 	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
 	ZFS_TIME_ENCODE(&now, pzp->zp_atime);
 	ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &obj, tx);
 	ASSERT(error == 0);
 	dmu_buf_rele(db, FTAG);
 #endif /* HAVE_ZPL */
 }
 #endif /* _KERNEL */
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c