From fb1b00e9f40ce2e8d207d9fe7cfb85add6bdbad8 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Fri, 20 Nov 2009 11:06:59 -0800
Subject: [PATCH 1/2] Linux ZVOL implementation; kernel-side changes

At last a useful user space interface for the Linux ZFS port arrives. With the addition of the ZVOL, real ZFS-based block devices are available and can be compared head to head with Linux's MD and LVM block drivers. The Linux ZVOL has not yet had any performance work done, but from a user perspective it should be functionally complete and behave like any other Linux block device.

The ZVOL has so far been tested using zconfig.sh on the following x86_64 based platforms: FC11, CHAOS4, RHEL5, RHEL6, and SLES11. However, more testing is required to ensure everything is working as designed.

What follows is a somewhat detailed list of the changes included in this commit to make ZVOLs possible. A few other issues were addressed in the context of these changes and will also be mentioned.

* Added module/zfs/zvol.c, which is based on the original Solaris ZVOL implementation but rewritten to integrate with the Linux block device APIs. The basic design remains similar in Linux, with the major change being request processing. Request processing is handled by registering a request function which the elevator calls once all request merging is finished and the elevator unplugs. This function is called under a spin lock and the request structure is passed to the block driver to be queued for IO. The elevator must be notified asynchronously once the request completes or fails with an error. This allows the block driver a chance to handle many requests concurrently. For the ZVOL we maintain a taskq with a service thread per core. As requests are delivered by the elevator, each request is dispatched to the taskq. The task queue handles each request with a write or read helper function which basically copies the request data into or out of the DMU object. Writes signal completion as soon as the DMU has the data, unless they are marked sync. Reads are all handled synchronously; however, the elevator will merge many small reads into a large read before submitting the request.

* Caching is worth specifically mentioning. Because the Linux VFS and the ZFS ARC both want to fully manage the cache, we unfortunately end up with two caches. This means our memory footprint is larger than otherwise expected, and it means we have an extra copy between the caches, but it does not impact correctness. All syncs are barrier requests, which I believe are handled correctly. Longer term there is lots of room for improvement here, but it will require fairly extensive changes to either the Linux VFS and VM layer, or additional DMU interfaces to handle managing buffers not directly allocated by the ARC.

* Added module/zfs/include/sys/blkdev.h, which contains all the Linux compatibility foo required to handle changes in the Linux block APIs from 2.6.18 thru 2.6.31 based kernels.

* The dmu_{read,write}_uio interfaces, which don't make sense on Linux, have been converted to dmu_{read,write}_req functions which consume the standard Linux IO request structure. Their function fundamentally remains the same, so this happily worked out pretty cleanly.

* The /dev/zfs character device is no longer created through the half-implemented Solaris driver DDI interfaces. It is now simply created with its own major number as a Linux misc device, which greatly simplifies everything.
It is only capable of handling ioctls(), but this fits nicely because that's all it ever has to do. The ZVOL devices, unlike in Solaris, do not use the same major number as /dev/zfs but instead register their own major. Because only one major is allocated and space is reserved for 16 partitions per device, there is a limit of 16384 concurrent ZVOL devices. By using multiple majors, as the scsi driver does, this limit could be addressed if it becomes a problem.

* The {spa,zfs,zvol}_busy() functions have all been removed because they are not required on a Linux system. Under Linux the registered module exit function will not be called while there are still references to the module. Once the exit function is called, however, it must succeed or block; it may not fail, so returning an error on module unload makes no sense under Linux.

* With the addition of ZVOL support all the HAVE_ZVOL defines were removed for obvious reasons. However, the HAVE_ZPL defines have been relocated into the linux-{kernel,user}-disk topic branches and must remain until the ZPL is implemented.
---
 module/zcommon/include/sys/fs/zfs.h | 24 +-
 module/zfs/dmu.c | 112 ++-
 module/zfs/dsl_dataset.c | 8 +-
 module/zfs/include/sys/blkdev.h | 164 ++++
 module/zfs/include/sys/dmu.h | 13 +-
 module/zfs/include/sys/spa.h | 1 -
 module/zfs/include/sys/zfs_fuid.h | 2 +
 module/zfs/include/sys/zfs_ioctl.h | 1 -
 module/zfs/include/sys/zfs_znode.h | 2 +
 module/zfs/include/sys/zvol.h | 34 +-
 module/zfs/rrwlock.c | 3 +
 module/zfs/spa_misc.c | 6 -
 module/zfs/zfs_acl.c | 4 +
 module/zfs/zfs_ctldir.c | 3 +
 module/zfs/zfs_dir.c | 3 +
 module/zfs/zfs_fuid.c | 2 +
 module/zfs/zfs_ioctl.c | 280 +++----
 module/zfs/zfs_log.c | 4 +
 module/zfs/zfs_replay.c | 3 +-
 module/zfs/zfs_vfsops.c | 16 +-
 module/zfs/zfs_vnops.c | 5 +
 module/zfs/zfs_znode.c | 47 +-
 module/zfs/zvol.c | 1199 +++++++++++++++++++++++++++
 23 files changed, 1702 insertions(+), 234 deletions(-)
 create mode 100644 module/zfs/include/sys/blkdev.h
 create mode 100644 module/zfs/zvol.c
diff --git a/module/zcommon/include/sys/fs/zfs.h b/module/zcommon/include/sys/fs/zfs.h index 86b36a8ae9..ef38ea336f 100644 --- a/module/zcommon/include/sys/fs/zfs.h +++ b/module/zcommon/include/sys/fs/zfs.h @@ -533,23 +533,13 @@ typedef struct vdev_stat { uint64_t vs_scrub_end; /* UTC scrub end time */ } vdev_stat_t; -#define ZVOL_DRIVER "zvol" -#define ZFS_DRIVER "zfs" -#define ZFS_DEV "/dev/zfs" - -/* - * zvol paths. Irritatingly, the devfsadm interfaces want all these - * paths without the /dev prefix, but for some things, we want the - * /dev prefix. Below are the names without /dev. - */ -#define ZVOL_DEV_DIR "zvol/dsk" -#define ZVOL_RDEV_DIR "zvol/rdsk" - -/* - * And here are the things we need with /dev, etc. in front of them. - */ -#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" -#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/" +#define ZVOL_DRIVER "zvol" +#define ZFS_DRIVER "zfs" +#define ZFS_DEV "/dev/zfs" +#define ZVOL_MAJOR 230 +#define ZVOL_MINOR_BITS 4 +#define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) +#define ZVOL_MINORS (1 << 4) #define ZVOL_PROP_NAME "name" diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8ca5c9d7d5..dfa075571b 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -660,9 +660,58 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } #ifdef _KERNEL -int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) + +/* + * Copy up to size bytes between arg_buf and req based on the data direction + * described by the req.
If an entire req's data cannot be transfered the + * req's is updated such that it's current index and bv offsets correctly + * reference any residual data which could not be copied. The return value + * is the number of bytes successfully copied to arg_buf. + */ +static int +dmu_req_copy(void *arg_buf, int size, int *offset, struct request *req) { + struct bio_vec *bv; + struct req_iterator iter; + char *bv_buf; + int tocpy; + + *offset = 0; + rq_for_each_segment(bv, req, iter) { + + /* Fully consumed the passed arg_buf */ + ASSERT3S(offset, <=, size); + if (size == *offset) + break; + + /* Skip fully consumed bv's */ + if (bv->bv_len == 0) + continue; + + tocpy = MIN(bv->bv_len, size - *offset); + ASSERT3S(tocpy, >=, 0); + + bv_buf = page_address(bv->bv_page) + bv->bv_offset; + ASSERT3P(bv_buf, !=, NULL); + + if (rq_data_dir(req) == WRITE) + memcpy(arg_buf + *offset, bv_buf, tocpy); + else + memcpy(bv_buf, arg_buf + *offset, tocpy); + + *offset += tocpy; + bv->bv_offset += tocpy; + bv->bv_len -= tocpy; + } + + return 0; +} + +int +dmu_read_req(objset_t *os, uint64_t object, struct request *req) +{ + uint64_t size = blk_rq_bytes(req); + uint64_t offset = blk_rq_pos(req) << 9; dmu_buf_t **dbp; int numbufs, i, err; @@ -670,27 +719,33 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &numbufs, &dbp); + err = dmu_buf_hold_array(os, object, offset, size, TRUE, FTAG, + &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; + int tocpy, didcpy, bufoff; dmu_buf_t *db = dbp[i]; - ASSERT(size > 0); + bufoff = offset - db->db_offset; + ASSERT3S(bufoff, >=, 0); - bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); + if (tocpy == 0) + break; + + err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); + + if (didcpy < tocpy) + err = EIO; - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); if (err) break; size -= tocpy; + offset += didcpy; + err = 0; } dmu_buf_rele_array(dbp, numbufs, FTAG); @@ -698,30 +753,31 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) } int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) +dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) { + uint64_t size = blk_rq_bytes(req); + uint64_t offset = blk_rq_pos(req) << 9; dmu_buf_t **dbp; - int numbufs, i; - int err = 0; + int numbufs, i, err; if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp); + err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, + &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; + int tocpy, didcpy, bufoff; dmu_buf_t *db = dbp[i]; - ASSERT(size > 0); + bufoff = offset - db->db_offset; + ASSERT3S(bufoff, >=, 0); - bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); + if (tocpy == 0) + break; ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -730,27 +786,27 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, else dmu_buf_will_dirty(db, tx); - /* - * XXX uiomove could block forever (eg. nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. 
- */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); + err = dmu_req_copy(db->db_data + bufoff, tocpy, &didcpy, req); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); + if (didcpy < tocpy) + err = EIO; + if (err) break; size -= tocpy; + offset += didcpy; + err = 0; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } +#endif +#ifdef HAVE_ZPL int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 9260348369..628b265cd3 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -246,7 +246,13 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); - dprintf_ds(ds, "evicting %s\n", ""); + /* + * XXX: Commented out because dsl_dataset_name() is called + * which references ds->ds_dir which it seems may be NULL. + * This is easily trigged with 'zfs destroy /. + * * + * dprintf_ds(ds, "evicting %s\n", ""); + */ unique_remove(ds->ds_fsid_guid); diff --git a/module/zfs/include/sys/blkdev.h b/module/zfs/include/sys/blkdev.h new file mode 100644 index 0000000000..19ded8b6ba --- /dev/null +++ b/module/zfs/include/sys/blkdev.h @@ -0,0 +1,164 @@ +#ifndef _SYS_BLKDEV_H +#define _SYS_BLKDEV_H + +#ifdef _KERNEL + +#include +#include +#include "zfs_config.h" + +#ifndef HAVE_BLK_FETCH_REQUEST +static inline struct request * +blk_fetch_request(struct request_queue *q) +{ + struct request *req; + + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + + return req; +} +#endif /* HAVE_BLK_FETCH_REQUEST */ + +#ifndef HAVE_BLK_REQUEUE_REQUEST +static inline void +blk_requeue_request(request_queue_t *q, struct request *req) +{ + elv_requeue_request(q, req); +} +#endif /* HAVE_BLK_REQUEUE_REQUEST */ + +#ifndef HAVE_BLK_END_REQUEST +static inline bool +blk_end_request(struct request *req, int error, unsigned int nr_bytes) +{ + struct request_queue *q = req->q; + LIST_HEAD(list); + + /* + * Request has already been dequeued but 2.6.18 version of + * end_request() unconditionally dequeues the request so we + * add it to a local list to prevent hitting the BUG_ON. + */ + list_add(&req->queuelist, &list); + + /* + * The old API required the driver to end each segment and not + * the entire request. In our case we always need to end the + * entire request partial requests are not supported. + */ + req->hard_cur_sectors = nr_bytes >> 9; + + + spin_lock_irq(q->queue_lock); + end_request(req, ((error == 0) ? 1 : error)); + spin_unlock_irq(q->queue_lock); + + return 0; +} +#else +# ifdef HAVE_BLK_END_REQUEST_GPL_ONLY +/* + * Define required to avoid conflicting 2.6.29 non-static prototype for a + * GPL-only version of the helper. As of 2.6.31 the helper is available + * to non-GPL modules and is not explicitly exported GPL-only. + */ +# define blk_end_request ___blk_end_request +static inline bool +___blk_end_request(struct request *req, int error, unsigned int nr_bytes) +{ + struct request_queue *q = req->q; + + /* + * The old API required the driver to end each segment and not + * the entire request. In our case we always need to end the + * entire request partial requests are not supported. + */ + req->hard_cur_sectors = nr_bytes >> 9; + + spin_lock_irq(q->queue_lock); + end_request(req, ((error == 0) ? 
1 : error)); + spin_unlock_irq(q->queue_lock); + + return 0; +} +# endif /* HAVE_BLK_END_REQUEST_GPL_ONLY */ +#endif /* HAVE_BLK_END_REQUEST */ + +#ifndef HAVE_BLK_RQ_POS +static inline sector_t +blk_rq_pos(struct request *req) +{ + return req->sector; +} +#endif /* HAVE_BLK_RQ_POS */ + +#ifndef HAVE_BLK_RQ_SECTORS +static inline unsigned int +blk_rq_sectors(struct request *req) +{ + return req->nr_sectors; +} +#endif /* HAVE_BLK_RQ_SECTORS */ + +#if !defined(HAVE_BLK_RQ_BYTES) || defined(HAVE_BLK_RQ_BYTES_GPL_ONLY) +/* + * Define required to avoid conflicting 2.6.29 non-static prototype for a + * GPL-only version of the helper. As of 2.6.31 the helper is available + * to non-GPL modules in the form of a static inline in the header. + */ +#define blk_rq_bytes __blk_rq_bytes +static inline unsigned int +__blk_rq_bytes(struct request *req) +{ + return blk_rq_sectors(req) << 9; +} +#endif /* !HAVE_BLK_RQ_BYTES || HAVE_BLK_RQ_BYTES_GPL_ONLY */ + +#ifndef HAVE_GET_DISK_RO +static inline int +get_disk_ro(struct gendisk *disk) +{ + int policy = 0; + + if (disk->part[0]) + policy = disk->part[0]->policy; + + return policy; +} +#endif /* HAVE_GET_DISK_RO */ + +#ifndef HAVE_RQ_IS_SYNC +static inline bool +rq_is_sync(struct request *req) +{ + return (req->flags & REQ_RW_SYNC); +} +#endif /* HAVE_RQ_IS_SYNC */ + +#ifndef HAVE_RQ_FOR_EACH_SEGMENT +struct req_iterator { + int i; + struct bio *bio; +}; + +# define for_each_bio(_bio) \ + for (; _bio; _bio = _bio->bi_next) + +# define __rq_for_each_bio(_bio, rq) \ + if ((rq->bio)) \ + for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) + +# define rq_for_each_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bio_for_each_segment(bvl, _iter.bio, _iter.i) +#endif /* HAVE_RQ_FOR_EACH_SEGMENT */ + +#ifndef DISK_NAME_LEN +#define DISK_NAME_LEN 32 +#endif /* DISK_NAME_LEN */ + +#endif /* KERNEL */ + +#endif /* _SYS_BLKDEV_H */ diff --git a/module/zfs/include/sys/dmu.h b/module/zfs/include/sys/dmu.h index b15da8391f..c7cf698c84 100644 --- a/module/zfs/include/sys/dmu.h +++ b/module/zfs/include/sys/dmu.h @@ -38,12 +38,14 @@ #include #include #include +#ifdef _KERNEL +#include +#endif #ifdef __cplusplus extern "C" { #endif -struct uio; struct page; struct vnode; struct spa; @@ -486,11 +488,14 @@ void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); -int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); -int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, - dmu_tx_t *tx); +#ifdef _KERNEL +int dmu_read_req(objset_t *os, uint64_t object, struct request *req); +int dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx); +#endif +#ifdef HAVE_ZPL int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); +#endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, diff --git a/module/zfs/include/sys/spa.h b/module/zfs/include/sys/spa.h index 30554ae0ec..aba28dbb2d 100644 --- a/module/zfs/include/sys/spa.h +++ b/module/zfs/include/sys/spa.h @@ -456,7 +456,6 @@ extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_version(spa_t *spa); extern int spa_max_replication(spa_t 
*spa); -extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); diff --git a/module/zfs/include/sys/zfs_fuid.h b/module/zfs/include/sys/zfs_fuid.h index f81ddf4a55..9910ce11a4 100644 --- a/module/zfs/include/sys/zfs_fuid.h +++ b/module/zfs/include/sys/zfs_fuid.h @@ -98,6 +98,7 @@ typedef struct zfs_fuid_info { } zfs_fuid_info_t; #ifdef _KERNEL +#ifdef HAVE_ZPL struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); @@ -115,6 +116,7 @@ extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, char **retdomain, boolean_t addok); extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +#endif /* HAVE_ZPL */ #endif char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); diff --git a/module/zfs/include/sys/zfs_ioctl.h b/module/zfs/include/sys/zfs_ioctl.h index 3a3e6e7118..59992993df 100644 --- a/module/zfs/include/sys/zfs_ioctl.h +++ b/module/zfs/include/sys/zfs_ioctl.h @@ -191,7 +191,6 @@ extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); -extern int zfs_busy(void); extern int zfs_unmount_snap(char *, void *); #endif /* _KERNEL */ diff --git a/module/zfs/include/sys/zfs_znode.h b/module/zfs/include/sys/zfs_znode.h index f5ee2fc7b7..d3c1766a5f 100644 --- a/module/zfs/include/sys/zfs_znode.h +++ b/module/zfs/include/sys/zfs_znode.h @@ -342,8 +342,10 @@ extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +#if defined(HAVE_UIO_RW) extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); +#endif /* HAVE_UIO_RW */ extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; diff --git a/module/zfs/include/sys/zvol.h b/module/zfs/include/sys/zvol.h index 74ebc83e08..e162e2b47c 100644 --- a/module/zfs/include/sys/zvol.h +++ b/module/zfs/include/sys/zvol.h @@ -20,51 +20,33 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ZVOL_H #define _SYS_ZVOL_H - - #include -#ifdef __cplusplus -extern "C" { -#endif - #define ZVOL_OBJ 1ULL #define ZVOL_ZAP_OBJ 2ULL #ifdef _KERNEL + +#include + extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, major_t); +extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); -extern int zvol_set_volsize(const char *, major_t, uint64_t); +extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volblocksize(const char *, uint64_t); -extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); -extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); -extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr); -extern int zvol_strategy(buf_t *bp); -extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr); -extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr); -extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr); -extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, - int *rvalp); -extern int zvol_busy(void); -extern void zvol_init(void); +extern int zvol_init(void); extern void zvol_fini(void); -#endif - -#ifdef __cplusplus -} -#endif +#endif /* KERNEL */ #endif /* _SYS_ZVOL_H */ diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c index 4cef53f951..8d05d93872 100644 --- a/module/zfs/rrwlock.c +++ b/module/zfs/rrwlock.c @@ -23,6 +23,8 @@ * Use is subject to license terms. */ +#ifdef HAVE_ZPL + #include #include @@ -262,3 +264,4 @@ rrw_held(rrwlock_t *rrl, krw_t rw) return (held); } +#endif /* HAVE_ZPL */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ef74a443de..c2440dac4a 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1343,12 +1343,6 @@ spa_name_compare(const void *a1, const void *a2) return (0); } -int -spa_busy(void) -{ - return (spa_active_count); -} - void spa_boot_init(void) { diff --git a/module/zfs/zfs_acl.c b/module/zfs/zfs_acl.c index 12ffe9f30a..9cb20a1534 100644 --- a/module/zfs/zfs_acl.c +++ b/module/zfs/zfs_acl.c @@ -23,6 +23,8 @@ * Use is subject to license terms. */ +#ifdef HAVE_ZPL + #include #include #include @@ -2848,3 +2850,5 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, return (error); } + +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index c6c719871c..f46699fdfb 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -64,6 +64,8 @@ * so that it cannot be freed until all snapshots have been unmounted. */ +#ifdef HAVE_ZPL + #include #include #include @@ -1333,3 +1335,4 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) return (error); } +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c index b3f7683286..b76113bad0 100644 --- a/module/zfs/zfs_dir.c +++ b/module/zfs/zfs_dir.c @@ -23,6 +23,8 @@ * Use is subject to license terms. 
*/ +#ifdef HAVE_ZPL + #include #include #include @@ -962,3 +964,4 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) else return (secpolicy_vnode_remove(cr)); } +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 2409652186..4925bb0046 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -194,6 +194,7 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) } #ifdef _KERNEL +#ifdef HAVE_ZPL /* * Load the fuid table(s) into memory. */ @@ -743,4 +744,5 @@ zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) FUID_SIZE_ESTIMATE(zfsvfs)); } } +#endif /* HAVE_ZPL */ #endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b039414dbe..df1f4508d6 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -64,18 +64,16 @@ #include #include +#include + #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" - -extern struct modlfs zfs_modlfs; +#include "zfs_config.h" extern void zfs_init(void); extern void zfs_fini(void); -ldi_ident_t zfs_li = NULL; -dev_info_t *zfs_dip; - typedef int zfs_ioc_func_t(zfs_cmd_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); @@ -403,6 +401,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_SEND, cr)); } +#ifdef HAVE_ZPL static int zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) { @@ -426,10 +425,12 @@ zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } +#endif /* HAVE_ZPL */ int zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) { +#ifdef HAVE_ZPL if (!INGLOBALZONE(curproc)) return (EPERM); @@ -438,11 +439,15 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) } else { return (zfs_secpolicy_deleg_share(zc, cr)); } +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) { +#ifdef HAVE_ZPL if (!INGLOBALZONE(curproc)) return (EPERM); @@ -451,6 +456,9 @@ zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) } else { return (zfs_secpolicy_deleg_share(zc, cr)); } +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } static int @@ -645,6 +653,7 @@ zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) return (error); } +#ifdef HAVE_ZPL static int zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) { @@ -656,6 +665,7 @@ zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) } return (error); } +#endif /* HAVE_ZPL */ /* * Policy for pool operations - create/destroy pools, add vdevs, etc. 
Requires @@ -836,6 +846,7 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) return (error); } +#ifdef HAVE_ZPL static int getzfsvfs(const char *dsname, zfsvfs_t **zvp) { @@ -898,6 +909,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) zfsvfs_free(zfsvfs); } } +#endif /* HAVE_ZPL */ static int zfs_ioc_pool_create(zfs_cmd_t *zc) @@ -1713,6 +1725,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) { +#ifdef HAVE_ZPL uint64_t *valary; unsigned int vallen; const char *domain; @@ -1741,6 +1754,10 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) continue; else goto out; +#else + error = ENOTSUP; + goto out; +#endif } else if (zfs_prop_user(propname)) { VERIFY(nvpair_value_string(elem, &strval) == 0); error = dsl_prop_set(name, propname, 1, @@ -1781,8 +1798,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) case ZFS_PROP_VOLSIZE: if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, - ddi_driver_major(zfs_dip), intval)) != 0) + (error = zvol_set_volsize(name, intval)) != 0) goto out; break; @@ -1794,6 +1810,7 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) case ZFS_PROP_VERSION: { +#ifdef HAVE_ZPL zfsvfs_t *zfsvfs; if ((error = nvpair_value_uint64(elem, &intval)) != 0) @@ -1812,6 +1829,10 @@ zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) if (error) goto out; break; +#else + error = ENOTSUP; + goto out; +#endif /* HAVE_ZPL */ } default: @@ -2023,6 +2044,7 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) static int zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL nvlist_t *nvp; int error; uint32_t uid; @@ -2065,6 +2087,9 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); crfree(usercred); return (error); +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } /* @@ -2147,7 +2172,7 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) static int zfs_ioc_create_minor(zfs_cmd_t *zc) { - return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); + return (zvol_create_minor(zc->zc_name)); } /* @@ -2162,6 +2187,7 @@ zfs_ioc_remove_minor(zfs_cmd_t *zc) return (zvol_remove_minor(zc->zc_name)); } +#ifdef HAVE_ZPL /* * Search the vfs list for a specified resource. Returns a pointer to it * or NULL if no suitable entry is found. The caller of this routine @@ -2186,6 +2212,7 @@ zfs_get_vfs(const char *resource) vfs_list_unlock(); return (vfs_found); } +#endif /* HAVE_ZPL */ /* ARGSUSED */ static void @@ -2535,6 +2562,7 @@ out: int zfs_unmount_snap(char *name, void *arg) { +#ifdef HAVE_ZPL vfs_t *vfsp = NULL; if (arg) { @@ -2566,6 +2594,7 @@ zfs_unmount_snap(char *name, void *arg) if ((err = dounmount(vfsp, flag, kcred)) != 0) return (err); } +#endif /* HAVE_ZPL */ return (0); } @@ -2621,6 +2650,7 @@ zfs_ioc_destroy(zfs_cmd_t *zc) static int zfs_ioc_rollback(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL objset_t *os; int error; zfsvfs_t *zfsvfs = NULL; @@ -2654,6 +2684,9 @@ zfs_ioc_rollback(zfs_cmd_t *zc) /* Note, the dmu_objset_rollback() releases the objset for us. 
*/ return (error); +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } /* @@ -2727,7 +2760,9 @@ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; +#ifdef HAVE_ZPL objset_t *os; +#endif /* HAVE_ZPL */ dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int error, fd; @@ -2760,6 +2795,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } +#ifdef HAVE_ZPL if (props && dmu_objset_open(tofs, DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { /* @@ -2770,6 +2806,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) dmu_objset_close(os); } +#endif /* HAVE_ZPL */ if (zc->zc_string[0]) { error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, @@ -2801,6 +2838,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = dmu_recv_stream(&drc, fp->f_vnode, &off); if (error == 0) { +#ifdef HAVE_ZPL zfsvfs_t *zfsvfs = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { @@ -2827,6 +2865,9 @@ zfs_ioc_recv(zfs_cmd_t *zc) } else { error = dmu_recv_end(&drc); } +#else + error = dmu_recv_end(&drc); +#endif /* HAVE_ZPL */ } zc->zc_cookie = off - fp->f_offset; @@ -3057,6 +3098,7 @@ zfs_ioc_promote(zfs_cmd_t *zc) static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL zfsvfs_t *zfsvfs; int error; @@ -3072,6 +3114,9 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) zfsvfs_rele(zfsvfs, FTAG); return (error); +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } /* @@ -3088,6 +3133,7 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL zfsvfs_t *zfsvfs; int error; @@ -3110,6 +3156,9 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) zfsvfs_rele(zfsvfs, FTAG); return (error); +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } /* @@ -3122,6 +3171,7 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL objset_t *os; int error; zfsvfs_t *zfsvfs; @@ -3154,6 +3204,9 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) } return (error); +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } /* @@ -3163,6 +3216,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. 
*/ +#ifdef HAVE_ZPL int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); @@ -3194,10 +3248,12 @@ zfs_init_sharefs() } return (0); } +#endif /* HAVE_ZPL */ static int zfs_ioc_share(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL int error; int opcode; @@ -3287,7 +3343,9 @@ zfs_ioc_share(zfs_cmd_t *zc) zc->zc_share.z_sharemax); return (error); - +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } ace_t full_access[] = { @@ -3297,6 +3355,7 @@ ace_t full_access[] = { /* * Remove all ACL files in shares dir */ +#ifdef HAVE_ZPL static int zfs_smb_acl_purge(znode_t *dzp) { @@ -3315,10 +3374,12 @@ zfs_smb_acl_purge(znode_t *dzp) zap_cursor_fini(&zc); return (error); } +#endif /* HAVE ZPL */ static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { +#ifdef HAVE_ZPL vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; @@ -3440,6 +3501,9 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) ZFS_EXIT(zfsvfs); return (error); +#else + return (ENOTSUP); +#endif /* HAVE_ZPL */ } /* @@ -3632,28 +3696,23 @@ pool_status_check(const char *name, zfs_ioc_namecheck_t type) return (error); } -static int -zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +static long +zfs_ioctl(struct file *filp, unsigned cmd, unsigned long arg) { zfs_cmd_t *zc; uint_t vec; - int error, rc; - - if (getminor(dev) != 0) - return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); + int error, rc, flag = 0; vec = cmd - ZFS_IOC; - ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); - if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (EINVAL); + return (-EINVAL); zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error == 0) - error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); + error = zfs_ioc_vec[vec].zvec_secpolicy(zc, NULL); /* * Ensure that all pool/dataset names are valid before we pass down to @@ -3695,121 +3754,59 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) } kmem_free(zc, sizeof (zfs_cmd_t)); - return (error); + return (-error); } -static int -zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +#ifdef CONFIG_COMPAT +static long +zfs_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) { - if (cmd != DDI_ATTACH) - return (DDI_FAILURE); - - if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, - DDI_PSEUDO, 0) == DDI_FAILURE) - return (DDI_FAILURE); - - zfs_dip = dip; - - ddi_report_dev(dip); - - return (DDI_SUCCESS); + return zfs_ioctl(filp, cmd, arg); } +#else +#define zfs_compat_ioctl NULL +#endif + +static const struct file_operations zfs_fops = { + .unlocked_ioctl = zfs_ioctl, + .compat_ioctl = zfs_compat_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice zfs_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = ZFS_DRIVER, + .fops = &zfs_fops, +}; static int -zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +zfs_attach(void) { - if (spa_busy() || zfs_busy() || zvol_busy()) - return (DDI_FAILURE); + int error; - if (cmd != DDI_DETACH) - return (DDI_FAILURE); - - zfs_dip = NULL; - - ddi_prop_remove_all(dip); - ddi_remove_minor_node(dip, NULL); - - return (DDI_SUCCESS); -} - -/*ARGSUSED*/ -static int -zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) -{ - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = zfs_dip; - return (DDI_SUCCESS); - - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)0; - return (DDI_SUCCESS); + error = misc_register(&zfs_misc); + if (error) { + 
printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); + return (error); } - return (DDI_FAILURE); + return (0); } -/* - * OK, so this is a little weird. - * - * /dev/zfs is the control node, i.e. minor 0. - * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. - * - * /dev/zfs has basically nothing to do except serve up ioctls, - * so most of the standard driver entry points are in zvol.c. - */ -static struct cb_ops zfs_cb_ops = { - zvol_open, /* open */ - zvol_close, /* close */ - zvol_strategy, /* strategy */ - nodev, /* print */ - zvol_dump, /* dump */ - zvol_read, /* read */ - zvol_write, /* write */ - zfsdev_ioctl, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* prop_op */ - NULL, /* streamtab */ - D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ - CB_REV, /* version */ - nodev, /* async read */ - nodev, /* async write */ -}; - -static struct dev_ops zfs_dev_ops = { - DEVO_REV, /* version */ - 0, /* refcnt */ - zfs_info, /* info */ - nulldev, /* identify */ - nulldev, /* probe */ - zfs_attach, /* attach */ - zfs_detach, /* detach */ - nodev, /* reset */ - &zfs_cb_ops, /* driver operations */ - NULL, /* no bus operations */ - NULL, /* power */ - ddi_quiesce_not_needed, /* quiesce */ -}; - -static struct modldrv zfs_modldrv = { - &mod_driverops, - "ZFS storage pool", - &zfs_dev_ops -}; - -static struct modlinkage modlinkage = { - MODREV_1, - (void *)&zfs_modlfs, - (void *)&zfs_modldrv, - NULL -}; +static void +zfs_detach(void) +{ + int error; + error = misc_deregister(&zfs_misc); + if (error) + printk(KERN_INFO "ZFS: misc_deregister() failed %d\n", error); +} +#ifdef HAVE_ZPL uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; +#endif int _init(void) @@ -3818,21 +3815,28 @@ _init(void) spa_init(FREAD | FWRITE); zfs_init(); - zvol_init(); - if ((error = mod_install(&modlinkage)) != 0) { - zvol_fini(); + if ((error = zvol_init()) != 0) { zfs_fini(); spa_fini(); return (error); } + if ((error = zfs_attach()) != 0) { + (void)zvol_fini(); + zfs_fini(); + spa_fini(); + return (error); + } + +#ifdef HAVE_ZPL tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, NULL); - error = ldi_ident_from_mod(&modlinkage, &zfs_li); - ASSERT(error == 0); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); +#endif /* HAVE_ZPL */ + + printk(KERN_INFO "ZFS: Loaded ZFS Filesystem v%s\n", ZFS_META_VERSION); return (0); } @@ -3840,17 +3844,11 @@ _init(void) int _fini(void) { - int error; - - if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) - return (EBUSY); - - if ((error = mod_remove(&modlinkage)) != 0) - return (error); - + zfs_detach(); zvol_fini(); zfs_fini(); spa_fini(); +#ifdef HAVE_ZPL if (zfs_nfsshare_inited) (void) ddi_modclose(nfs_mod); if (zfs_smbshare_inited) @@ -3858,16 +3856,18 @@ _fini(void) if (zfs_nfsshare_inited || zfs_smbshare_inited) (void) ddi_modclose(sharefs_mod); - tsd_destroy(&zfs_fsyncer_key); - ldi_ident_release(zfs_li); - zfs_li = NULL; mutex_destroy(&zfs_share_lock); + tsd_destroy(&zfs_fsyncer_key); +#endif /* HAVE_ZPL */ - return (error); + return (0); } -int -_info(struct modinfo *modinfop) -{ - return (mod_info(&modlinkage, modinfop)); -} +#ifdef HAVE_SPL +spl_module_init(_init); +spl_module_exit(_fini); + +MODULE_AUTHOR("Sun Microsystems, Inc"); +MODULE_DESCRIPTION("ZFS"); +MODULE_LICENSE("CDDL"); +#endif /* HAVE_SPL */ diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 3f0b6b0ed3..5bb9fb2b29 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c 
@@ -23,6 +23,8 @@ * Use is subject to license terms. */ +#ifdef HAVE_ZPL + #include #include #include @@ -704,3 +706,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, seq = zil_itx_assign(zilog, itx, tx); zp->z_last_itx = seq; } + +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 819ba2886c..7aeff072a4 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ - +#ifdef HAVE_ZPL #include #include @@ -876,3 +876,4 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ }; +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index d03f92ba00..c4fa8c12c8 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -61,6 +61,7 @@ #include #include +#ifdef HAVE_ZPL int zfsfstype; vfsops_t *zfs_vfsops = NULL; static major_t zfs_major; @@ -1957,10 +1958,12 @@ zfs_vfsinit(int fstype, char *name) return (0); } +#endif /* HAVE_ZPL */ void zfs_init(void) { +#ifdef HAVE_ZPL /* * Initialize .zfs directory structures */ @@ -1972,21 +1975,19 @@ zfs_init(void) zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); +#endif /* HAVE_ZPL */ } void zfs_fini(void) { +#ifdef HAVE_ZPL zfsctl_fini(); zfs_znode_fini(); +#endif /* HAVE_ZPL */ } -int -zfs_busy(void) -{ - return (zfs_active_fs_count != 0); -} - +#ifdef HAVE_ZPL int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { @@ -2029,6 +2030,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } +#endif /* HAVE_ZPL */ /* * Read a property stored within the master node. @@ -2072,6 +2074,7 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) return (error); } +#ifdef HAVE_ZPL static vfsdef_t vfw = { VFSDEF_VERSION, MNTTYPE_ZFS, @@ -2084,3 +2087,4 @@ static vfsdef_t vfw = { struct modlfs zfs_modlfs = { &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw }; +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 8eb4665aed..818d88cbff 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -25,6 +25,8 @@ /* Portions Copyright 2007 Jeremy Teo */ +#ifdef HAVE_ZPL + #include #include #include @@ -318,6 +320,7 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, return (ENOTTY); } +#if defined(_KERNEL) && defined(HAVE_UIO_RW) /* * Utility functions to map and unmap a single physical page. These * are used to manage the mappable copies of ZFS file data, and therefore @@ -342,6 +345,7 @@ zfs_unmap_page(page_t *pp, caddr_t addr) ppmapout(addr); } } +#endif /* _KERNEL && HAVE_UIO_RW */ /* * When a file is memory mapped, we must keep the IO data synchronized @@ -4695,3 +4699,4 @@ const fs_operation_def_t zfs_evnodeops_template[] = { VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, NULL, NULL }; +#endif /* HAVE_ZPL */ diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index ee27195a48..33bf6cba2a 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -87,6 +87,7 @@ * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL +#ifdef HAVE_ZPL /* * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to * be freed before it can be safely accessed. 
@@ -1473,21 +1474,28 @@ log: dmu_tx_commit(tx); return (0); } +#endif /* HAVE_ZPL */ void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { - zfsvfs_t zfsvfs; uint64_t moid, obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; +#ifdef HAVE_ZPL + zfsvfs_t zfsvfs; znode_t *rootzp = NULL; vnode_t *vp; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; +#else + timestruc_t now; + dmu_buf_t *db; + znode_phys_t *pzp; +#endif /* HAVE_ZPL */ /* * First attempt to create master node. @@ -1542,6 +1550,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); +#ifdef HAVE_ZPL /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. @@ -1596,14 +1605,46 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) dmu_buf_rele(rootzp->z_dbuf, NULL); rootzp->z_dbuf = NULL; kmem_cache_free(znode_cache, rootzp); + error = zfs_create_share_dir(&zfsvfs, tx); +#else + /* + * Create root znode with code free of VFS dependencies + */ + obj = zap_create_norm(os, norm, DMU_OT_DIRECTORY_CONTENTS, + DMU_OT_ZNODE, sizeof (znode_phys_t), tx); + + VERIFY(0 == dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); /* - * Create shares directory + * Initialize the znode physical data to zero. */ + ASSERT(db->db_size >= sizeof (znode_phys_t)); + bzero(db->db_data, db->db_size); + pzp = db->db_data; - error = zfs_create_share_dir(&zfsvfs, tx); + if (USE_FUIDS(version, os)) + pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + pzp->zp_size = 2; /* "." and ".." */ + pzp->zp_links = 2; + pzp->zp_parent = obj; + pzp->zp_gen = dmu_tx_get_txg(tx); + pzp->zp_mode = S_IFDIR | 0755; + pzp->zp_flags = ZFS_ACL_TRIVIAL; + + gethrestime(&now); + + ZFS_TIME_ENCODE(&now, pzp->zp_crtime); + ZFS_TIME_ENCODE(&now, pzp->zp_ctime); + ZFS_TIME_ENCODE(&now, pzp->zp_atime); + ZFS_TIME_ENCODE(&now, pzp->zp_mtime); + + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &obj, tx); ASSERT(error == 0); + + dmu_buf_rele(db, FTAG); +#endif /* HAVE_ZPL */ } #endif /* _KERNEL */ diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c new file mode 100644 index 0000000000..cc95bf2fc2 --- /dev/null +++ b/module/zfs/zvol.c @@ -0,0 +1,1199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev// + * + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int zvol_major = ZVOL_MAJOR; +unsigned int zvol_threads = 0; + +static taskq_t *zvol_taskq; +static kmutex_t zvol_state_lock; +static list_t zvol_state_list; + +/* + * The in-core state of each volume. + */ +typedef struct zvol_state { + uint64_t zv_volsize; /* advertised space */ + uint64_t zv_volblocksize;/* volume block size */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_mode; /* DS_MODE_* at open time */ + uint32_t zv_open_count; /* open counts */ + uint32_t zv_changed; /* disk changed */ + zilog_t *zv_zilog; /* ZIL handle */ + znode_t zv_znode; /* for range locking */ + dev_t zv_dev; /* device id */ + struct gendisk *zv_disk; /* generic disk */ + struct request_queue *zv_queue; /* request queue */ + spinlock_t zv_lock; /* request queue lock */ + list_node_t zv_next; /* next zvol_state_t linkage */ +} zvol_state_t; + +/* + * Find the next available range of ZVOL_MINORS minor numbers. The + * zvol_state_list is kept in ascending minor order so we simply need + * to scan the list for the first gap in the sequence. This allows us + * to recycle minor number as devices are created and removed. + */ +static int +zvol_find_minor(unsigned *minor) +{ + zvol_state_t *zv; + + *minor = 0; + ASSERT(MUTEX_HELD(&zvol_state_lock)); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv), *minor += ZVOL_MINORS) { + if (MINOR(zv->zv_dev) != MINOR(*minor)) + break; + } + + /* All minors are in use */ + if (*minor >= (1 << MINORBITS)) + return ENXIO; + + return 0; +} + +/* + * Find a zvol_state_t given the full major+minor dev_t. + */ +static zvol_state_t * +zvol_find_by_dev(dev_t dev) +{ + zvol_state_t *zv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + if (zv->zv_dev == dev) + return zv; + } + + return NULL; +} + +/* + * Find a zvol_state_t given the name provided at zvol_alloc() time. + */ +static zvol_state_t * +zvol_find_by_name(const char *name) +{ + zvol_state_t *zv; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + if (!strncmp(zv->zv_disk->disk_name, name, DISK_NAME_LEN)) + return zv; + } + + return NULL; +} + +/* + * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. + */ +void +zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + nvlist_t *nvprops = zct->zct_props; + int error; + uint64_t volblocksize, volsize; + + VERIFY(nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) + volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + + /* + * These properties must be removed from the list so the generic + * property setting step won't apply to them. 
+ */ + VERIFY(nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + (void) nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); + ASSERT(error == 0); +} + +/* + * ZFS_IOC_OBJSET_STATS entry point. + */ +int +zvol_get_stats(objset_t *os, nvlist_t *nv) +{ + int error; + dmu_object_info_t doi; + uint64_t val; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); + if (error) + return (error); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); + + error = dmu_object_info(os, ZVOL_OBJ, &doi); + + if (error == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, + doi.doi_data_block_size); + } + + return (error); +} + +/* + * Notification handler for objset readonly property changes. + */ +static void +zvol_readonly_changed_cb(void *arg, uint64_t value) +{ + set_disk_ro(((zvol_state_t *)arg)->zv_disk, !!value); +} + +/* + * Sanity check volume size. + */ +int +zvol_check_volsize(uint64_t volsize, uint64_t blocksize) +{ + if (volsize == 0) + return (EINVAL); + + if (volsize % blocksize != 0) + return (EINVAL); + +#ifdef _ILP32 + if (volsize - 1 > SPEC_MAXOFFSET_T) + return (EOVERFLOW); +#endif + return (0); +} + +/* + * Ensure the zap is flushed then inform the VFS of the capacity change. + */ +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + dmu_tx_t *tx; + int error; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, + &volsize, tx); + dmu_tx_commit(tx); + + if (error) + return (error); + + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, volsize, DMU_OBJECT_END); + if (error) + return (error); + + zv->zv_volsize = volsize; + zv->zv_changed = 1; + error = revalidate_disk(zv->zv_disk); + + return (error); +} + +/* + * Set ZFS_PROP_VOLSIZE set entry point. + */ +int +zvol_set_volsize(const char *name, uint64_t volsize) +{ + zvol_state_t *zv; + int error; + dmu_object_info_t doi; + uint64_t old_volsize = 0ULL; + zvol_state_t state = { 0 }; + + mutex_enter(&zvol_state_lock); + + zv = zvol_find_by_name(name); + if (zv == NULL) { + /* + * If we are doing a "zfs clone -o volsize=", then the + * minor node won't exist yet. + */ + error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER, + &state.zv_objset); + if (error != 0) + goto out; + zv = &state; + } + old_volsize = zv->zv_volsize; + + if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || + (error = zvol_check_volsize(volsize,doi.doi_data_block_size)) != 0) + goto out; + + if (get_disk_ro(zv->zv_disk) || (zv->zv_mode & DS_MODE_READONLY)) { + error = EROFS; + goto out; + } + + error = zvol_update_volsize(zv, volsize); +out: + if (state.zv_objset) + dmu_objset_close(state.zv_objset); + + mutex_exit(&zvol_state_lock); + + return (error); +} + +/* + * Sanity check volume block size. 
+ */ +int +zvol_check_volblocksize(uint64_t volblocksize) +{ + if (volblocksize < SPA_MINBLOCKSIZE || + volblocksize > SPA_MAXBLOCKSIZE || + !ISP2(volblocksize)) + return (EDOM); + + return (0); +} + +/* + * Set ZFS_PROP_VOLBLOCKSIZE set entry point. + */ +int +zvol_set_volblocksize(const char *name, uint64_t volblocksize) +{ + zvol_state_t *zv; + dmu_tx_t *tx; + int error; + + mutex_enter(&zvol_state_lock); + + zv = zvol_find_by_name(name); + if (zv == NULL) + return (ENXIO); + + if (get_disk_ro(zv->zv_disk) || (zv->zv_mode & DS_MODE_READONLY)) + return (EROFS); + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, + volblocksize, 0, tx); + if (error == ENOTSUP) + error = EBUSY; + dmu_tx_commit(tx); + if (error == 0) + zv->zv_volblocksize = volblocksize; + } + + return (error); +} + +/* + * Replay a TX_WRITE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) +{ + objset_t *os = zv->zv_objset; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t off = lr->lr_offset; + uint64_t len = lr->lr_length; + dmu_tx_t *tx; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, len, data, tx); + dmu_tx_commit(tx); + } + + return (error); +} + +static int +zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) +{ + return (ENOTSUP); +} + +/* + * Callback vectors for replaying records. + * Only TX_WRITE is needed for zvol. + */ +zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { + (zil_replay_func_t *)zvol_replay_err, /* no such transaction type */ + (zil_replay_func_t *)zvol_replay_err, /* TX_CREATE */ + (zil_replay_func_t *)zvol_replay_err, /* TX_MKDIR */ + (zil_replay_func_t *)zvol_replay_err, /* TX_MKXATTR */ + (zil_replay_func_t *)zvol_replay_err, /* TX_SYMLINK */ + (zil_replay_func_t *)zvol_replay_err, /* TX_REMOVE */ + (zil_replay_func_t *)zvol_replay_err, /* TX_RMDIR */ + (zil_replay_func_t *)zvol_replay_err, /* TX_LINK */ + (zil_replay_func_t *)zvol_replay_err, /* TX_RENAME */ + (zil_replay_func_t *)zvol_replay_write, /* TX_WRITE */ + (zil_replay_func_t *)zvol_replay_err, /* TX_TRUNCATE */ + (zil_replay_func_t *)zvol_replay_err, /* TX_SETATTR */ + (zil_replay_func_t *)zvol_replay_err, /* TX_ACL */ +}; + +/* + * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. + * + * We store data in the log buffers if it's small enough. + * Otherwise we will later flush the data out via dmu_sync(). 
+ */ +ssize_t zvol_immediate_write_sz = 32768; + +static void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, + uint64_t offset, uint64_t size, int sync) +{ + uint32_t blocksize = zv->zv_volblocksize; + zilog_t *zilog = zv->zv_zilog; + boolean_t slogging; + + if (zil_disable) + return; + + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return; + } + + slogging = spa_has_slogs(zilog->zl_spa); + + while (size) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + itx_wr_state_t write_state; + + /* + * Unlike zfs_log_write() we can be called with + * up to DMU_MAX_ACCESS/2 (5MB) writes. + */ + if (blocksize > zvol_immediate_write_sz && !slogging && + size >= blocksize && offset % blocksize == 0) { + write_state = WR_INDIRECT; /* uses dmu_sync */ + len = blocksize; + } else if (sync) { + write_state = WR_COPIED; + len = MIN(ZIL_MAX_LOG_DATA, size); + } else { + write_state = WR_NEED_COPY; + len = MIN(ZIL_MAX_LOG_DATA, size); + } + + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zv->zv_objset, + ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { + kmem_free(itx, offsetof(itx_t, itx_lr) + + itx->itx_lr.lrc_reclen); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = offset; + lr->lr_length = len; + lr->lr_blkoff = offset - + P2ALIGN_TYPED(offset, blocksize, uint64_t); + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zv; + itx->itx_sync = sync; + + (void) zil_itx_assign(zilog, itx, tx); + + offset += len; + size -= len; + } +} + +/* + * Common write path running under the zvol taskq context. This function + * is responsible for copying the request structure data in to the DMU and + * signaling the request queue with the result of the copy. + */ +static void +zvol_write(void *arg) +{ + struct request *req = (struct request *)arg; + struct request_queue *q = req->q; + zvol_state_t *zv = q->queuedata; + uint64_t offset = blk_rq_pos(req) << 9; + uint64_t size = blk_rq_bytes(req); + int sync = 0, error = 0; + dmu_tx_t *tx; + rl_t *rl; + + if (rq_is_sync(req) && !zil_disable) + sync = 1; + + rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); + + /* This will only fail for ENOSPC */ + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_range_unlock(rl); + blk_end_request(req, -error, size); + return; + } + + dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); + zvol_log_write(zv, tx, offset, size, sync); + + dmu_tx_commit(tx); + zfs_range_unlock(rl); + + if (sync) + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + + blk_end_request(req, -error, size); +} + +/* + * Common read path running under the zvol taskq context. This function + * is responsible for copying the requested data out of the DMU and in to + * a linux request structure. It then must signal the request queue with + * an error code describing the result of the copy. 
+ */
+static void
+zvol_read(void *arg)
+{
+	struct request *req = (struct request *)arg;
+	struct request_queue *q = req->q;
+	zvol_state_t *zv = q->queuedata;
+	uint64_t offset = blk_rq_pos(req) << 9;
+	uint64_t size = blk_rq_bytes(req);
+	int error;
+	rl_t *rl;
+
+	rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
+
+	error = dmu_read_req(zv->zv_objset, ZVOL_OBJ, req);
+
+	zfs_range_unlock(rl);
+
+	/* convert checksum errors into IO errors */
+	if (error == ECKSUM)
+		error = EIO;
+
+	blk_end_request(req, -error, size);
+}
+
+/*
+ * The request will be added back to the request queue and retried if
+ * it cannot be immediately dispatched to the taskq for handling.
+ */
+static inline void
+zvol_dispatch(task_func_t func, struct request *req)
+{
+	if (!taskq_dispatch(zvol_taskq, func, (void *)req, TQ_NOSLEEP))
+		blk_requeue_request(req->q, req);
+}
+
+/*
+ * Common request path.  Rather than registering a custom make_request()
+ * function we use the generic Linux version.  This is done because it allows
+ * us to easily merge read requests which would otherwise be performed
+ * synchronously by the DMU.  This is less critical in the write case where
+ * the DMU will perform the correct merging within a transaction group.  Using
+ * the generic make_request() also lets us leverage the fact that the
+ * elevator will ensure correct ordering with respect to barrier IOs.  On
+ * the downside it means that in the write case we end up doing request
+ * merging twice, once in the elevator and once in the DMU.
+ *
+ * The request handler is called under a spin lock so all the real work
+ * is handed off to be done in the context of the zvol taskq.  This function
+ * simply performs basic request sanity checking and hands off the request.
+ */
+static void
+zvol_request(struct request_queue *q)
+{
+	zvol_state_t *zv = q->queuedata;
+	struct request *req;
+	unsigned int size;
+
+	while ((req = blk_fetch_request(q)) != NULL) {
+		size = blk_rq_bytes(req);
+
+		if (blk_rq_pos(req) + blk_rq_sectors(req) >
+		    get_capacity(zv->zv_disk)) {
+			printk(KERN_INFO
+			    "%s: bad access: block=%llu, count=%lu\n",
+			    req->rq_disk->disk_name,
+			    (long long unsigned)blk_rq_pos(req),
+			    (long unsigned)blk_rq_sectors(req));
+			blk_end_request(req, -EIO, size);
+			continue;
+		}
+
+		if (!blk_fs_request(req)) {
+			printk(KERN_INFO "%s: non-fs cmd\n",
+			    req->rq_disk->disk_name);
+			blk_end_request(req, -EIO, size);
+			continue;
+		}
+
+		switch (rq_data_dir(req)) {
+		case READ:
+			zvol_dispatch(zvol_read, req);
+			break;
+		case WRITE:
+			if (unlikely(get_disk_ro(zv->zv_disk))) {
+				blk_end_request(req, -EROFS, size);
+				break;
+			}
+
+			zvol_dispatch(zvol_write, req);
+			break;
+		default:
+			printk(KERN_INFO "%s: unknown cmd: %d\n",
+			    req->rq_disk->disk_name, (int)rq_data_dir(req));
+			blk_end_request(req, -EIO, size);
+			break;
+		}
+	}
+}
+
+/*
+ * The zvol_state_t's are inserted in increasing MINOR(dev_t) order.
+ */
+static void
+zvol_insert(zvol_state_t *zv_insert)
+{
+	zvol_state_t *zv = NULL;
+
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	ASSERT3U(MINOR(zv_insert->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
+	for (zv = list_head(&zvol_state_list); zv != NULL;
+	    zv = list_next(&zvol_state_list, zv)) {
+		if (MINOR(zv->zv_dev) > MINOR(zv_insert->zv_dev))
+			break;
+	}
+
+	list_insert_before(&zvol_state_list, zv, zv_insert);
+}
+
+/*
+ * Simply remove the zvol from the list of zvols.
+ */ +static void +zvol_remove(zvol_state_t *zv_remove) +{ + ASSERT(MUTEX_HELD(&zvol_state_lock)); + list_remove(&zvol_state_list, zv_remove); +} + +#ifndef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS +typedef unsigned __bitwise__ fmode_t; +#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ + +static int +zvol_open(struct block_device *bdev, fmode_t flag) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + + mutex_enter(&zvol_state_lock); + ASSERT3P(zv, !=, NULL); + ASSERT3P(zv->zv_objset, !=, NULL); + + if ((flag & FMODE_WRITE) && + (get_disk_ro(zv->zv_disk) || (zv->zv_mode & DS_MODE_READONLY))) { + mutex_exit(&zvol_state_lock); + return (-EROFS); + } + + zv->zv_open_count++; + mutex_exit(&zvol_state_lock); + + check_disk_change(bdev); + + return (0); +} + +static int +zvol_release(struct gendisk *disk, fmode_t mode) +{ + zvol_state_t *zv = disk->private_data; + + mutex_enter(&zvol_state_lock); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + zv->zv_open_count--; + mutex_exit(&zvol_state_lock); + + return (0); +} + +static void +zvol_get_done(dmu_buf_t *db, void *vzgd) +{ + zgd_t *zgd = (zgd_t *)vzgd; + rl_t *rl = zgd->zgd_rl; + + dmu_buf_rele(db, vzgd); + zfs_range_unlock(rl); + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + kmem_free(zgd, sizeof (zgd_t)); +} + +/* + * Get data to generate a TX_WRITE intent log record. + */ +static int +zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + zvol_state_t *zv = arg; + objset_t *os = zv->zv_objset; + dmu_buf_t *db; + rl_t *rl; + zgd_t *zgd; + uint64_t boff; /* block starting offset */ + int dlen = lr->lr_length; /* length of user data */ + int error; + + ASSERT(zio); + ASSERT(dlen != 0); + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) /* immediate write */ + return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf, + DMU_READ_NO_PREFETCH)); + + zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zv->zv_zilog; + zgd->zgd_bp = &lr->lr_blkptr; + + /* + * Lock the range of the block to ensure that when the data is + * written out and its checksum is being calculated that no other + * thread can change the block. + */ + boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); + rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, + RL_READER); + zgd->zgd_rl = rl; + + VERIFY3S(dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db), ==, 0); + error = dmu_sync(zio, db, &lr->lr_blkptr, + lr->lr_common.lrc_txg, zvol_get_done, zgd); + if (error == 0) + zil_add_block(zv->zv_zilog, &lr->lr_blkptr); + /* + * If we get EINPROGRESS, then we need to wait for a + * write IO initiated by dmu_sync() to complete before + * we can release this dbuf. We will finish everything + * up in the zvol_get_done() callback. 
+ */ + if (error == EINPROGRESS) + return (0); + + dmu_buf_rele(db, zgd); + zfs_range_unlock(rl); + kmem_free(zgd, sizeof (zgd_t)); + + return (error); +} + + +static int +zvol_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + int error = 0; + + if (zv == NULL) + return (-ENXIO); + + switch (cmd) { + case BLKFLSBUF: + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + break; + + default: + error = -ENOTTY; + break; + + } + + return (error); +} + +#ifdef CONFIG_COMPAT +static int +zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + return zvol_ioctl(bdev, mode, cmd, arg); +} +#else +#define zvol_compat_ioctl NULL +#endif + +static int zvol_media_changed(struct gendisk *disk) +{ + zvol_state_t *zv = disk->private_data; + + return zv->zv_changed; +} + +static int zvol_revalidate_disk(struct gendisk *disk) +{ + zvol_state_t *zv = disk->private_data; + + zv->zv_changed = 0; + set_capacity(zv->zv_disk, zv->zv_volsize >> 9); + + return 0; +} + +/* + * Provide a simple virtual geometry for legacy compatibility. For devices + * smaller than 1 MiB a small head and sector count is used to allow very + * tiny devices. For devices over 1 Mib a standard head and sector count + * is used to keep the cylinders count reasonable. + */ +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + sector_t sectors = get_capacity(zv->zv_disk); + + if (sectors > 2048) { + geo->heads = 16; + geo->sectors = 63; + } else { + geo->heads = 2; + geo->sectors = 4; + } + + geo->start = 0; + geo->cylinders = sectors / (geo->heads * geo->sectors); + + return 0; +} + +static struct kobject * +zvol_probe(dev_t dev, int *part, void *arg) +{ + zvol_state_t *zv; + struct kobject *kobj; + + mutex_enter(&zvol_state_lock); + zv = zvol_find_by_dev(dev); + kobj = zv ? 
get_disk(zv->zv_disk) : ERR_PTR(-ENOENT); + mutex_exit(&zvol_state_lock); + + return kobj; +} + +#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS +static struct block_device_operations zvol_ops = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .media_changed = zvol_media_changed, + .revalidate_disk = zvol_revalidate_disk, + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; + +#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ + +static int +zvol_open_by_inode(struct inode *inode, struct file *file) +{ + return zvol_open(inode->i_bdev, file->f_mode); +} + +static int +zvol_release_by_inode(struct inode *inode, struct file *file) +{ + return zvol_release(inode->i_bdev->bd_disk, file->f_mode); +} + +static int +zvol_ioctl_by_inode(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + return zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg); +} + +# ifdef CONFIG_COMPAT +static long +zvol_compat_ioctl_by_inode(struct file *file, + unsigned int cmd, unsigned long arg) +{ + return zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, + file->f_mode, cmd, arg); +} +# else +# define zvol_compat_ioctl_by_inode NULL +# endif + +static struct block_device_operations zvol_ops = { + .open = zvol_open_by_inode, + .release = zvol_release_by_inode, + .ioctl = zvol_ioctl_by_inode, + .compat_ioctl = zvol_compat_ioctl_by_inode, + .media_changed = zvol_media_changed, + .revalidate_disk = zvol_revalidate_disk, + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +}; +#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ + +/* + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. + */ +static zvol_state_t * +zvol_alloc(dev_t dev, const char *name) +{ + zvol_state_t *zv; + + zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); + if (zv == NULL) + goto out; + + zv->zv_queue = blk_init_queue(zvol_request, &zv->zv_lock); + if (zv->zv_queue == NULL) + goto out_kmem; + + zv->zv_disk = alloc_disk(ZVOL_MINORS); + if (zv->zv_disk == NULL) + goto out_queue; + + zv->zv_queue->queuedata = zv; + zv->zv_dev = dev; + zv->zv_open_count = 0; + + mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, + sizeof (rl_t), offsetof(rl_t, r_node)); + spin_lock_init(&zv->zv_lock); + list_link_init(&zv->zv_next); + + zv->zv_disk->major = zvol_major; + zv->zv_disk->first_minor = (dev & MINORMASK); + zv->zv_disk->fops = &zvol_ops; + zv->zv_disk->private_data = zv; + zv->zv_disk->queue = zv->zv_queue; + strlcpy(zv->zv_disk->disk_name, name, DISK_NAME_LEN); + + return zv; + +out_queue: + blk_cleanup_queue(zv->zv_queue); +out_kmem: + kmem_free(zv, sizeof (zvol_state_t)); +out: + return NULL; +} + +/* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + */ +static void +zvol_free(zvol_state_t *zv) +{ + avl_destroy(&zv->zv_znode.z_range_avl); + mutex_destroy(&zv->zv_znode.z_range_lock); + + del_gendisk(zv->zv_disk); + blk_cleanup_queue(zv->zv_queue); + put_disk(zv->zv_disk); + + kmem_free(zv, sizeof (zvol_state_t)); +} + +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. 
+ */ +int +zvol_create_minor(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t doi; + uint64_t volsize; + unsigned minor = 0; + int ds_mode = DS_MODE_OWNER; + int error = 0; + + mutex_enter(&zvol_state_lock); + + zv = zvol_find_by_name(name); + if (zv) { + error = EEXIST; + goto out; + } + + /* Snapshot may only be read-only */ + if (strchr(name, '@') != 0) + ds_mode |= DS_MODE_READONLY; + + error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); + if (error) + goto out; + + error = dmu_object_info(os, ZVOL_OBJ, &doi); + if (error) + goto out_dmu_objset_close; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_close; + + error = zvol_find_minor(&minor); + if (error) + goto out_dmu_objset_close; + + zv = zvol_alloc(MKDEV(zvol_major, minor), name); + if (zv == NULL) { + error = EAGAIN; + goto out_dmu_objset_close; + } + + set_disk_ro(zv->zv_disk, !!(ds_mode & DS_MODE_READONLY)); + set_capacity(zv->zv_disk, volsize >> 9); + + zv->zv_volsize = volsize; + zv->zv_volblocksize = doi.doi_data_block_size; + zv->zv_objset = os; + zv->zv_mode = ds_mode; + zv->zv_zilog = zil_open(os, zvol_get_data); + zil_replay(os, zv, zvol_replay_vector); + + error = dsl_prop_register(dmu_objset_ds(zv->zv_objset), "readonly", + zvol_readonly_changed_cb, zv); + if (error) + goto out_zvol_alloc; + + zvol_insert(zv); + mutex_exit(&zvol_state_lock); + add_disk(zv->zv_disk); + + return 0; + +out_zvol_alloc: + zvol_free(zv); +out_dmu_objset_close: + dmu_objset_close(os); +out: + mutex_exit(&zvol_state_lock); + + return (-error); +} + +/* + * Remove a block device minor node for the specified volume. + */ +int +zvol_remove_minor(const char *name) +{ + zvol_state_t *zv; + int error = 0; + + mutex_enter(&zvol_state_lock); + + zv = zvol_find_by_name(name); + if (zv == NULL) { + error = ENXIO; + goto out; + } + + if (zv->zv_open_count > 0) { + error = EBUSY; + goto out; + } + + error = dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), + "readonly", zvol_readonly_changed_cb, zv); + if (error) + goto out; + + zil_close(zv->zv_zilog); + dmu_objset_close(zv->zv_objset); + + zvol_remove(zv); + zvol_free(zv); +out: + mutex_exit(&zvol_state_lock); + + return (-error); +} + +/* + * Remove all minors from the system. This is only called from + * zvol_fini() which means the module reference count must have + * dropped to zero and none of the zvol devices may be open. 
+ */
+static void
+zvol_remove_minors(void)
+{
+	zvol_state_t *zv;
+
+	mutex_enter(&zvol_state_lock);
+	while ((zv = list_head(&zvol_state_list)) != NULL) {
+		ASSERT3U(zv->zv_open_count, ==, 0);
+
+		(void) dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
+		    "readonly", zvol_readonly_changed_cb, zv);
+		zil_close(zv->zv_zilog);
+		dmu_objset_close(zv->zv_objset);
+
+		zvol_remove(zv);
+		zvol_free(zv);
+	}
+	mutex_exit(&zvol_state_lock);
+}
+
+int
+zvol_init(void)
+{
+	int error;
+
+	if (!zvol_threads)
+		zvol_threads = num_online_cpus();
+
+	zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_threads, maxclsyspri,
+	    zvol_threads, INT_MAX, TASKQ_PREPOPULATE);
+	if (zvol_taskq == NULL) {
+		printk(KERN_INFO "ZFS: taskq_create() failed\n");
+		return (-ENOMEM);
+	}
+
+	error = register_blkdev(zvol_major, ZVOL_DRIVER);
+	if (error) {
+		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+		taskq_destroy(zvol_taskq);
+		return (error);
+	}
+
+	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
+	    THIS_MODULE, zvol_probe, NULL, NULL);
+
+	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zvol_state_list, sizeof (zvol_state_t),
+	    offsetof(zvol_state_t, zv_next));
+
+	return (0);
+}
+
+void
+zvol_fini(void)
+{
+	zvol_remove_minors();
+	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
+	unregister_blkdev(zvol_major, ZVOL_DRIVER);
+	taskq_destroy(zvol_taskq);
+
+	mutex_destroy(&zvol_state_lock);
+	list_destroy(&zvol_state_list);
+}
+
+module_param(zvol_major, uint, 0);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
+
+module_param(zvol_threads, uint, 0);
+MODULE_PARM_DESC(zvol_threads, "Number of threads for zvol device");
From aebe6818a9a7454b0da5dcf63d45f8fa83c36ae7 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Fri, 20 Nov 2009 12:00:08 -0800
Subject: [PATCH 2/2] Linux ZVOL implementation; user-side changes

At last a useful user space interface for the Linux ZFS port arrives. With
the addition of the ZVOL, real ZFS based block devices are available and can
be compared head to head with Linux's MD and LVM block drivers. The Linux
ZVOL has not yet had any performance work done but from a user perspective
it should be functionally complete and behave like any other Linux block
device.

The ZVOL has so far been tested using zconfig.sh on the following x86_64
based platforms: FC11, CHAOS4, RHEL5, RHEL6, and SLES11. However, more
testing is required to ensure everything is working as designed.

What follows is a somewhat detailed list of the changes included in this
commit to make ZVOLs possible. A few other issues were addressed in the
context of these changes which will also be mentioned.

* zvol_create_link_common() was simplified to issue the ioctl to create
the device and then wait up to 10 seconds for it to appear. The device
will be created within a few milliseconds by udev under /dev//. Note this
naming convention is slightly different than on Solaris but I feel it is
more Linuxy (see the sketch after this list).

* Removed support for dump vdevs. This concept is specific to Solaris and
does not map cleanly to Linux. Under Linux generating system cores is
preferably done over the network via netdump, or alternately to a block
device via O_DIRECT.
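For illustration only, the user space wait for the device node amounts to
something like the sketch below. The wait_for_devnode() helper and the
once-per-second stat() poll are illustrative, not the code added by this
patch; the patch itself reuses the existing zpool_label_disk_wait() helper
with a millisecond timeout.

#include <sys/stat.h>
#include <stdio.h>
#include <unistd.h>

/* Poll for a device node created asynchronously by udev. */
static int
wait_for_devnode(const char *path, int timeout_sec)
{
	struct stat st;
	int i;

	for (i = 0; i < timeout_sec; i++) {
		if (stat(path, &st) == 0)
			return (0);	/* node exists, udev has finished */
		(void) sleep(1);
	}

	(void) fprintf(stderr, "%s may not be immediately available\n", path);
	return (-1);
}

A caller would pass the full /dev path of the volume and a 10 second
timeout, mirroring the behavior described above.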
--- cmd/zfs/zfs_main.c | 32 ++++++++++ cmd/zinject/zinject.c | 5 +- cmd/zpool/zpool_main.c | 10 +++ lib/libzfs/include/libzfs.h | 3 - lib/libzfs/libzfs_changelist.c | 12 ++++ lib/libzfs/libzfs_dataset.c | 64 +++++-------------- lib/libzfs/libzfs_mount.c | 52 +++++++++++++++- lib/libzfs/libzfs_pool.c | 110 --------------------------------- lib/libzfs/libzfs_sendrecv.c | 2 + lib/libzfs/libzfs_util.c | 2 + 10 files changed, 128 insertions(+), 164 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 91b85ed6b8..618ae10225 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -554,6 +554,7 @@ zfs_do_clone(int argc, char **argv) ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ +#ifdef HAVE_ZPL if (ret == 0) { zfs_handle_t *clone; @@ -564,6 +565,7 @@ zfs_do_clone(int argc, char **argv) zfs_close(clone); } } +#endif /* HAVE_ZPL */ zfs_close(zhp); nvlist_free(props); @@ -761,6 +763,7 @@ zfs_do_create(int argc, char **argv) * in fact created, even if we failed to mount or share it. */ ret = 0; +#ifdef HAVE_ZPL if (canmount == ZFS_CANMOUNT_ON) { if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " @@ -772,6 +775,7 @@ zfs_do_create(int argc, char **argv) ret = 1; } } +#endif /* HAVE_ZPL */ error: if (zhp) @@ -2787,6 +2791,7 @@ typedef struct get_all_cbdata { #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 5 /* seconds */ +#ifdef HAVE_ZPL static int get_one_dataset(zfs_handle_t *zhp, void *data) { @@ -3338,6 +3343,7 @@ share_mount(int op, int argc, char **argv) return (ret); } +#endif /* HAVE_ZPL */ /* * zfs mount -a [nfs | iscsi] @@ -3348,7 +3354,11 @@ share_mount(int op, int argc, char **argv) static int zfs_do_mount(int argc, char **argv) { +#ifdef HAVE_ZPL return (share_mount(OP_MOUNT, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } /* @@ -3360,9 +3370,14 @@ zfs_do_mount(int argc, char **argv) static int zfs_do_share(int argc, char **argv) { +#ifdef HAVE_ZPL return (share_mount(OP_SHARE, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } +#ifdef HAVE_ZPL typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; @@ -3815,6 +3830,7 @@ unshare_unmount(int op, int argc, char **argv) return (ret); } +#endif /* HAVE_ZPL */ /* * zfs unmount -a @@ -3825,7 +3841,11 @@ unshare_unmount(int op, int argc, char **argv) static int zfs_do_unmount(int argc, char **argv) { +#ifdef HAVE_ZPL return (unshare_unmount(OP_MOUNT, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } /* @@ -3837,7 +3857,11 @@ zfs_do_unmount(int argc, char **argv) static int zfs_do_unshare(int argc, char **argv) { +#ifdef HAVE_ZPL return (unshare_unmount(OP_SHARE, argc, argv)); +#else + return ENOSYS; +#endif /* HAVE_ZPL */ } /* ARGSUSED */ @@ -3853,6 +3877,7 @@ zfs_do_python(int argc, char **argv) * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. 
*/ +#ifdef HAVE_ZPL static int manual_mount(int argc, char **argv) { @@ -3983,6 +4008,7 @@ manual_unmount(int argc, char **argv) return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE)); } +#endif /* HAVE_ZPL */ static int volcheck(zpool_handle_t *zhp, void *data) @@ -4027,7 +4053,9 @@ main(int argc, char **argv) { int ret; int i = 0; +#ifdef HAVE_ZPL char *progname; +#endif char *cmdname; (void) setlocale(LC_ALL, ""); @@ -4052,6 +4080,7 @@ main(int argc, char **argv) return (1); } +#ifdef HAVE_ZPL /* * This command also doubles as the /etc/fs mount and unmount program. * Determine if we should take this behavior based on argv[0]. @@ -4062,6 +4091,9 @@ main(int argc, char **argv) } else if (strcmp(progname, "umount") == 0) { ret = manual_unmount(argc, argv); } else { +#else + { +#endif /* HAVE_ZPL */ /* * Make sure the user has specified some command. */ diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index 09c377ef8d..0ad8549b24 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -751,17 +751,20 @@ main(int argc, char **argv) if (dataset[0] != '\0' && domount) { if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) return (1); - +#ifdef HAVE_ZPL if (zfs_unmount(zhp, NULL, 0) != 0) return (1); +#endif /* HAVE_ZPL */ } record.zi_error = error; ret = register_handler(pool, flags, &record, quiet); +#ifdef HAVE_ZPL if (dataset[0] != '\0' && domount) ret = (zfs_mount(zhp, NULL, 0) != 0); +#endif /* HAVE_ZPL */ libzfs_fini(g_zfs); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b6c454d24b..ca3f37b900 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -696,7 +696,9 @@ zpool_do_create(int argc, char **argv) (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; +#ifdef HAVE_ZPL DIR *dirp; +#endif if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " @@ -721,6 +723,7 @@ zpool_do_create(int argc, char **argv) mountpoint); } +#ifdef HAVE_ZPL if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); @@ -743,6 +746,7 @@ zpool_do_create(int argc, char **argv) goto errout; } } +#endif /* HAVE_ZPL */ } if (dryrun) { @@ -773,8 +777,12 @@ zpool_do_create(int argc, char **argv) zfs_prop_to_name( ZFS_PROP_MOUNTPOINT), mountpoint) == 0); +#ifdef HAVE_ZPL if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); +#else + ret = 0; +#endif /* HAVE_ZPL */ zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { @@ -1531,11 +1539,13 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); +#if HAVE_ZPL if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } +#endif /* HAVE_ZPL */ zpool_close(zhp); return (error); diff --git a/lib/libzfs/include/libzfs.h b/lib/libzfs/include/libzfs.h index 21e0fdc5c6..f08356b49f 100644 --- a/lib/libzfs/include/libzfs.h +++ b/lib/libzfs/include/libzfs.h @@ -602,9 +602,6 @@ extern int zpool_read_label(int, nvlist_t **); extern int zpool_create_zvol_links(zpool_handle_t *); extern int zpool_remove_zvol_links(zpool_handle_t *); -/* is this zvol valid for use as a dump device? 
*/ -extern int zvol_check_dump_config(char *); - /* * Management interfaces for SMB ACL files */ diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index 7eedffa53c..fb162cb5aa 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -93,6 +93,7 @@ struct prop_changelist { int changelist_prefix(prop_changelist_t *clp) { +#ifdef HAVE_ZPL prop_changenode_t *cn; int ret = 0; @@ -168,6 +169,9 @@ changelist_prefix(prop_changelist_t *clp) (void) changelist_postfix(clp); return (ret); +#else + return 0; +#endif /* HAVE_ZPL */ } /* @@ -182,6 +186,7 @@ changelist_prefix(prop_changelist_t *clp) int changelist_postfix(prop_changelist_t *clp) { +#ifdef HAVE_ZPL prop_changenode_t *cn; char shareopts[ZFS_MAXPROPLEN]; int errors = 0; @@ -306,6 +311,9 @@ changelist_postfix(prop_changelist_t *clp) } return (errors ? -1 : 0); +#else + return 0; +#endif /* HAVE_ZPL */ } /* @@ -368,6 +376,7 @@ changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) int changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) { +#ifdef HAVE_ZPL prop_changenode_t *cn; int ret = 0; @@ -382,6 +391,9 @@ changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) } return (ret); +#else + return 0; +#endif } /* diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 899ffdaaed..ce1da79638 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -965,6 +965,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, /*FALLTHRU*/ +#ifdef HAVE_ZPL case ZFS_PROP_SHARESMB: case ZFS_PROP_SHARENFS: /* @@ -1075,6 +1076,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } break; +#endif /* HAVE_ZPL */ case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; @@ -2522,6 +2524,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) goto ancestorerr; } +#ifdef HAVE_ZPL if (zfs_mount(h, NULL, 0) != 0) { opname = dgettext(TEXT_DOMAIN, "mount"); goto ancestorerr; @@ -2531,6 +2534,7 @@ create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) opname = dgettext(TEXT_DOMAIN, "share"); goto ancestorerr; } +#endif /* HAVE_ZPL */ zfs_close(h); } @@ -3620,7 +3624,7 @@ error: /* * Given a zvol dataset, issue the ioctl to create the appropriate minor node, - * poke devfsadm to create the /dev link, and then wait for the link to appear. + * and wait briefly for udev to create the /dev link. */ int zvol_create_link(libzfs_handle_t *hdl, const char *dataset) @@ -3632,9 +3636,8 @@ static int zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) { zfs_cmd_t zc = { "\0", "\0", "\0", 0 }; - di_devlink_handle_t dhdl; - priv_set_t *priv_effective; - int privileged; + char path[MAXPATHLEN]; + int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); @@ -3671,52 +3674,13 @@ zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) } /* - * If privileged call devfsadm and wait for the links to - * magically appear. - * Otherwise, print out an informational message. + * Wait up to 10 seconds for udev to create the device. 
*/ - - priv_effective = priv_allocset(); - (void) getppriv(PRIV_EFFECTIVE, priv_effective); - privileged = (priv_isfullset(priv_effective) == B_TRUE); - priv_freeset(priv_effective); - - if (privileged) { - if ((dhdl = di_devlink_init(ZFS_DRIVER, - DI_MAKE_LINK)) == NULL) { - zfs_error_aux(hdl, strerror(errno)); - (void) zfs_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot create device links " - "for '%s'"), dataset); - (void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc); - return (-1); - } else { - (void) di_devlink_fini(&dhdl); - } - } else { - char pathname[MAXPATHLEN]; - struct stat64 statbuf; - int i; - -#define MAX_WAIT 10 - - /* - * This is the poor mans way of waiting for the link - * to show up. If after 10 seconds we still don't - * have it, then print out a message. - */ - (void) snprintf(pathname, sizeof (pathname), "/dev/zvol/dsk/%s", - dataset); - - for (i = 0; i != MAX_WAIT; i++) { - if (stat64(pathname, &statbuf) == 0) - break; - (void) sleep(1); - } - if (i == MAX_WAIT) - (void) printf(gettext("%s may not be immediately " - "available\n"), pathname); - } + (void) snprintf(path, sizeof (path), "/dev/%s", dataset); + error = zpool_label_disk_wait(path, 10000); + if (error) + (void) printf(gettext("%s may not be immediately " + "available\n"), path); return (0); } @@ -3852,6 +3816,7 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp) return (0); } +#ifdef HAVE_ZPL int zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred) { @@ -3917,6 +3882,7 @@ zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); return (error); } +#endif /* HAVE_ZPL */ void zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 1dd345a275..055b42a5f5 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -85,6 +85,7 @@ #include #define MAXISALEN 257 /* based on sysinfo(2) man page */ +#ifdef HAVE_ZPL static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, zfs_share_proto_t); @@ -1225,7 +1226,6 @@ out: return (ret); } - static int zvol_cb(const char *dataset, void *data) { @@ -1398,3 +1398,53 @@ out: return (ret); } + +#else /* HAVE_ZPL */ + +int +zfs_unshare_iscsi(zfs_handle_t *zhp) +{ + return 0; +} + +int +zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) +{ + return 0; +} + +void +remove_mountpoint(zfs_handle_t *zhp) { + return; +} + +boolean_t +is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) +{ + return B_FALSE; +} + +boolean_t +zfs_is_mounted(zfs_handle_t *zhp, char **where) +{ + return is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where); +} + +boolean_t +zfs_is_shared(zfs_handle_t *zhp) +{ + return B_FALSE; +} + +int +zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) +{ + return B_FALSE; +} + +int +zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) +{ + return B_FALSE; +} +#endif /* HAVE_ZPL */ diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 38cc627fcb..e8c0e7e273 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3285,113 +3285,3 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) return 0; } - -static boolean_t -supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) -{ - char *type; - nvlist_t **child; - uint_t children, c; - - verify(nvlist_lookup_string(config, 
ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(type, VDEV_TYPE_FILE) == 0 || - strcmp(type, VDEV_TYPE_LOG) == 0 || - strcmp(type, VDEV_TYPE_MISSING) == 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "vdev type '%s' is not supported"), type); - (void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf); - return (B_FALSE); - } - if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) { - if (!supported_dump_vdev_type(hdl, child[c], errbuf)) - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* - * check if this zvol is allowable for use as a dump device; zero if - * it is, > 0 if it isn't, < 0 if it isn't a zvol - */ -int -zvol_check_dump_config(char *arg) -{ - zpool_handle_t *zhp = NULL; - nvlist_t *config, *nvroot; - char *p, *volname; - nvlist_t **top; - uint_t toplevels; - libzfs_handle_t *hdl; - char errbuf[1024]; - char poolname[ZPOOL_MAXNAMELEN]; - int pathlen = strlen(ZVOL_FULL_DEV_DIR); - int ret = 1; - - if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) { - return (-1); - } - - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "dump is not supported on device '%s'"), arg); - - if ((hdl = libzfs_init()) == NULL) - return (1); - libzfs_print_on_error(hdl, B_TRUE); - - volname = arg + pathlen; - - /* check the configuration of the pool */ - if ((p = strchr(volname, '/')) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "malformed dataset name")); - (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); - return (1); - } else if (p - volname >= ZFS_MAXNAMELEN) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "dataset name is too long")); - (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf); - return (1); - } else { - (void) strncpy(poolname, volname, p - volname); - poolname[p - volname] = '\0'; - } - - if ((zhp = zpool_open(hdl, poolname)) == NULL) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not open pool '%s'"), poolname); - (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); - goto out; - } - config = zpool_get_config(zhp, NULL); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "could not obtain vdev configuration for '%s'"), poolname); - (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); - goto out; - } - - verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &top, &toplevels) == 0); - if (toplevels != 1) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' has multiple top level vdevs"), poolname); - (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); - goto out; - } - - if (!supported_dump_vdev_type(hdl, top[0], errbuf)) { - goto out; - } - ret = 0; - -out: - if (zhp) - zpool_close(zhp); - libzfs_fini(hdl); - return (ret); -} diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index be5b3949f7..d28a4f9f8e 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1974,6 +1974,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * if we did a replication receive (indicated by stream_avl * being non-NULL). 
*/ +#ifdef HAVE_ZPL cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { zfs_handle_t *h; @@ -2000,6 +2001,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } *cp = '@'; } +#endif /* HAVE_ZPL */ if (clp) { err |= changelist_postfix(clp); diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 8d0c47e301..91a48bfd10 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -605,7 +605,9 @@ libzfs_fini(libzfs_handle_t *hdl) (void) fclose(hdl->libzfs_mnttab); if (hdl->libzfs_sharetab) (void) fclose(hdl->libzfs_sharetab); +#ifdef HAVE_ZPL zfs_uninit_libshare(hdl); +#endif if (hdl->libzfs_log_str) (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl);