Merge commit 'refs/remotes/origin/linux-kernel-disk' into HEAD

Brian Behlendorf 2009-06-26 14:35:08 -07:00
commit a5041b8a4d
5 changed files with 597 additions and 18 deletions


@@ -1,3 +1 @@
-gcc-branch
-fix-branch
-feature-branch
+zfs-branch

.topmsg

@@ -1,19 +1,6 @@
 From: Brian Behlendorf <behlendorf1@llnl.gov>
-Subject: [PATCH] zfs branch
+Subject: [PATCH] linux kernel disk
 
-Merged result of all changes which are relevant to both Solaris
-and Linux builds of the ZFS code. These are changes where there
-is a reasonable chance they will be accepted upstream.
-
-Additionally, since this is effectively the root of the linux
-ZFS tree the core linux build system is added here. This
-includes autogen.sh, configure.ac, m4 macros, some scripts/*,
-and makefiles for all the core ZFS components. Linux-only
-features which require tweaks to the build system should appear
-on the relevant topic branches. All autotools products which
-result from autogen.sh are commited to the linux-configure-branch.
-
-This branch also contains the META, ChangeLog, AUTHORS, TODO,
-and README, files.
+Native Linux vdev disk interfaces
 
 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>


@@ -47,6 +47,7 @@ ${MODULE}-objs += uberblock.o
 ${MODULE}-objs += unique.o
 ${MODULE}-objs += vdev.o
 ${MODULE}-objs += vdev_cache.o
+${MODULE}-objs += vdev_disk.o
 ${MODULE}-objs += vdev_file.o
 ${MODULE}-objs += vdev_label.o
 ${MODULE}-objs += vdev_mirror.o


@@ -0,0 +1,28 @@
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _KERNEL
#include <sys/vdev.h>
#include <sys/ddi.h>
#include <sys/sunldi.h>
#include <sys/sunddi.h>

typedef struct vdev_disk {
	ddi_devid_t vd_devid;
	char *vd_minor;
	ldi_handle_t vd_lh;
} vdev_disk_t;

extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
#endif /* _KERNEL */

#ifdef __cplusplus
}
#endif

#endif /* _SYS_VDEV_DISK_H */

module/zfs/vdev_disk.c (new file, 565 lines)

@@ -0,0 +1,565 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <zfs_config.h>
/*
* Virtual device vector for disks.
*/
typedef struct dio_request {
struct completion dr_comp; /* Completion for sync IO */
spinlock_t dr_lock; /* Completion lock */
zio_t *dr_zio; /* Parent ZIO */
int dr_ref; /* Outstanding bio count */
int dr_rw; /* Read/Write */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[0]; /* Attached bio's */
} dio_request_t;
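
The zero-length dr_bio[] member is the C flexible-array idiom: the request
header and its trailing bio pointers come from a single allocation whose
size is computed at run time (see the kmem_zalloc() sizing further down).
A minimal userspace sketch of the same pattern, with hypothetical names,
not part of this commit:

#include <stdio.h>
#include <stdlib.h>

/* Sketch only: one calloc() covers the fixed header plus 'count'
 * trailing pointer slots, mirroring how dio_request_t is sized. */
typedef struct sketch_req {
	int nslots;
	void *slot[0];		/* trailing array, sized at allocation */
} sketch_req_t;

int main(void)
{
	int count = 4;
	sketch_req_t *r = calloc(1, sizeof (sketch_req_t) +
	    sizeof (void *) * count);

	if (r == NULL)
		return (1);

	r->nslots = count;
	printf("%zu bytes for %d slots\n",
	    sizeof (sketch_req_t) + sizeof (void *) * count, r->nslots);
	free(r);

	return (0);
}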
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
{
struct block_device *vd_lh;
vdev_disk_t *dvd;
/* Must have a pathname and it must be absolute. */
if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
return EINVAL;
}
dvd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
if (dvd == NULL)
return ENOMEM;
/* XXX: Since we do not have devid support like Solaris we
 * currently can't be as clever about opening the right device.
 * For now we will simply open the device name provided and
 * fail when it doesn't exist.  If your devices get reordered
 * you're going to be screwed; use udev for now to prevent this.
 *
 * XXX: The mode here could be the global spa_mode with a little
 * munging of the flags to make them more agreeable to Linux.
 * However, simply passing 0 for now gets us read/write behavior.
 */
vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd);
if (IS_ERR(vd_lh)) {
kmem_free(dvd, sizeof(vdev_disk_t));
return -PTR_ERR(vd_lh);
}
/* XXX: Long term, validate the stored dvd->vd_devid against a unique
 * identifier read from the disk, likely via EFI label support.
 */
vd->vdev_tsd = dvd;
dvd->vd_lh = vd_lh;
/* Check if this is a whole device. When vd_lh->bd_contains ==
* vd_lh we have a whole device and not simply a partition. */
vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh);
/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
vd->vdev_nowritecache = B_FALSE;
/* Determine the actual size of the device (in bytes).
 *
 * XXX: SECTOR_SIZE is defined as 512 bytes, which may not be true
 * for your device; we should use the actual hardware sector size.
 */
*psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
/* Based on the minimum sector size set the block size */
*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;
return 0;
}
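
The ashift computation above is just log2 of the effective minimum block
size: highbit() returns the 1-based index of the highest set bit, so
highbit(512) - 1 == 9. A standalone sketch of the arithmetic (the
512-byte values are illustrative assumptions):

#include <stdio.h>

/* 1-based index of the highest set bit, as in the Solaris highbit() */
static int highbit(unsigned long v)
{
	int h = 0;

	while (v) {
		h++;
		v >>= 1;
	}
	return (h);
}

int main(void)
{
	unsigned long sector_size = 512;	/* SECTOR_SIZE assumption */
	unsigned long min_block = 512;		/* SPA_MINBLOCKSIZE assumption */
	unsigned long max = sector_size > min_block ? sector_size : min_block;

	printf("ashift = %d\n", highbit(max) - 1);	/* 9, since 2^9 == 512 */
	return (0);
}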
static void
vdev_disk_close(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
if (dvd == NULL)
return;
if (dvd->vd_lh != NULL)
close_bdev_excl(dvd->vd_lh);
kmem_free(dvd, sizeof(vdev_disk_t));
vd->vdev_tsd = NULL;
}
#ifdef HAVE_2ARGS_BIO_END_IO_T
static void
vdev_disk_physio_completion(struct bio *bio, int rc)
#else
static int
vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc)
#endif /* HAVE_2ARGS_BIO_END_IO_T */
{
dio_request_t *dr = bio->bi_private;
zio_t *zio;
int i, error;
/* Fatal error but print some useful debugging before asserting */
if (dr == NULL) {
printk("FATAL: bio->bi_private == NULL\n"
"bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
"bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n",
bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
bio->bi_idx, bio->bi_size, bio->bi_end_io,
atomic_read(&bio->bi_cnt));
SBUG();
}
/* Incomplete */
if (bio->bi_size) {
rc = 1;
goto out;
}
error = rc;
if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
error = EIO;
spin_lock(&dr->dr_lock);
dr->dr_ref--;
if (dr->dr_error == 0)
dr->dr_error = error;
/*
 * Once dr_ref drops to zero all bios attached to this dio request
 * have completed.  It is then safe to access the dio outside the
 * spin lock; we are assured there will be no racing accesses.
 */
if (dr->dr_ref == 0) {
zio = dr->dr_zio;
spin_unlock(&dr->dr_lock);
/* Synchronous dio cleanup is handled by the waiter */
if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
complete(&dr->dr_comp);
} else {
for (i = 0; i < dr->dr_bio_count; i++)
bio_put(dr->dr_bio[i]);
kmem_free(dr, sizeof(dio_request_t) +
sizeof(struct bio *) * dr->dr_bio_count);
}
if (zio) {
zio->io_error = dr->dr_error;
zio_interrupt(zio);
}
} else {
spin_unlock(&dr->dr_lock);
}
rc = 0;
out:
#ifdef HAVE_2ARGS_BIO_END_IO_T
return;
#else
return rc;
#endif /* HAVE_2ARGS_BIO_END_IO_T */
}
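
The completion handler implements a hand-rolled reference count: every bio
drops one dr_ref under dr_lock, and whichever drop reaches zero either
wakes the synchronous waiter or frees the request itself. A compact
userspace analogue of that last-one-out pattern (a sketch using pthreads
in place of the kernel primitives):

#include <pthread.h>
#include <stdio.h>

/* Sketch of the dr_ref/dr_comp idiom: N completions each drop one
 * reference and the final drop signals the waiting submitter. */
typedef struct refwait {
	pthread_mutex_t lock;
	pthread_cond_t done;
	int ref;
} refwait_t;

static refwait_t rw = { PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_COND_INITIALIZER, 3 };

static void *completion(void *arg)
{
	pthread_mutex_lock(&rw.lock);
	if (--rw.ref == 0)		/* last one out wakes the waiter */
		pthread_cond_signal(&rw.done);
	pthread_mutex_unlock(&rw.lock);
	return (NULL);
}

int main(void)
{
	pthread_t t[3];
	int i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, completion, NULL);

	pthread_mutex_lock(&rw.lock);
	while (rw.ref > 0)		/* wait_for_completion() analogue */
		pthread_cond_wait(&rw.done, &rw.lock);
	pthread_mutex_unlock(&rw.lock);

	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	printf("all completions ran\n");
	return (0);
}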
static struct bio *
bio_map_virt(struct request_queue *q, void *data,
unsigned int len, gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
unsigned int offset, i, data_len = len;
const int nr_pages = end - start;
struct page *page;
struct bio *bio;
bio = bio_alloc(gfp_mask, nr_pages);
if (!bio)
return ERR_PTR(-ENOMEM);
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
VERIFY3P(page = vmalloc_to_page(data), !=, NULL);
VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes);
data += bytes;
len -= bytes;
offset = 0;
bytes = PAGE_SIZE;
}
VERIFY3U(bio->bi_size, ==, data_len);
return bio;
}
static struct bio *
bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
{
struct bio *bio;
/* Cleanly map the buffer we are passed to a bio, regardless of
 * whether the buffer is a virtual or physical address. */
if (kmem_virt(data))
bio = bio_map_virt(q, data, len, gfp_mask);
else
bio = bio_map_kern(q, data, len, gfp_mask);
return bio;
}
static int
__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr,
size_t kbuf_size, uint64_t kbuf_offset, int flags)
{
struct request_queue *q = vd_lh->bd_disk->queue;
dio_request_t *dr;
caddr_t bio_ptr;
uint64_t bio_offset;
int i, j, error = 0, bio_count, bio_size, dio_size;
ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0);
ASSERT3S(flags &
~((1 << BIO_RW) |
(1 << BIO_RW_SYNC) |
(1 << BIO_RW_FAILFAST)), ==, 0);
bio_count = (kbuf_size / (q->max_hw_sectors << 9)) + 1;
dio_size = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count;
dr = kmem_zalloc(dio_size, KM_SLEEP);
if (dr == NULL)
return ENOMEM;
init_completion(&dr->dr_comp);
spin_lock_init(&dr->dr_lock);
dr->dr_ref = 0;
dr->dr_zio = zio;
dr->dr_rw = READ;
dr->dr_error = 0;
dr->dr_bio_count = bio_count;
if (flags & (1 << BIO_RW))
dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;
if (flags & (1 << BIO_RW_FAILFAST))
dr->dr_rw |= 1 << BIO_RW_FAILFAST;
/*
 * When the IO size exceeds the maximum bio size for the request
 * queue we are forced to split the IO into multiple bios and wait
 * for them all to complete.  Ideally, all pool users will set
 * their volume block size to match the maximum request size and
 * the common case will be one bio per vdev IO request.
 */
bio_ptr = kbuf_ptr;
bio_offset = kbuf_offset;
for (i = 0; i < dr->dr_bio_count; i++) {
bio_size = MIN(kbuf_size, q->max_hw_sectors << 9);
dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO);
if (IS_ERR(dr->dr_bio[i])) {
for (j = 0; j < i; j++)
bio_put(dr->dr_bio[j]);
error = -PTR_ERR(dr->dr_bio[i]);
kmem_free(dr, dio_size);
return error;
}
dr->dr_bio[i]->bi_bdev = vd_lh;
dr->dr_bio[i]->bi_sector = bio_offset >> 9;
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
dr->dr_bio[i]->bi_private = dr;
dr->dr_ref++;
bio_ptr += bio_size;
bio_offset += bio_size;
kbuf_size -= bio_size;
}
for (i = 0; i < dr->dr_bio_count; i++)
submit_bio(dr->dr_rw, dr->dr_bio[i]);
/*
 * On synchronous blocking requests we wait for all of the bio
 * completion callbacks to run.  We will be woken when the last
 * callback runs for this dio.  We are responsible for freeing the
 * dio_request_t as well as the final reference on all attached bios.
 */
if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
wait_for_completion(&dr->dr_comp);
ASSERT(dr->dr_ref == 0);
error = dr->dr_error;
for (i = 0; i < dr->dr_bio_count; i++)
bio_put(dr->dr_bio[i]);
kmem_free(dr, dio_size);
}
return error;
}
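
The bio_count sizing above allocates one bio slot per max_hw_sectors-sized
chunk plus one for any remainder, and the submission loop then carves the
buffer into MIN(kbuf_size, q->max_hw_sectors << 9) pieces. A worked
example of the arithmetic, with a hypothetical queue limit:

#include <stdio.h>

int main(void)
{
	unsigned long kbuf_size = 320 * 1024;		/* 320 KiB IO */
	unsigned long max_hw_sectors = 128;		/* queue limit, in sectors */
	unsigned long chunk = max_hw_sectors << 9;	/* 64 KiB per bio */
	int bio_count = (kbuf_size / chunk) + 1;
	int i;

	printf("%d bio slots of up to %lu bytes\n", bio_count, chunk);

	/* the final slot covers whatever remains (zero here) */
	for (i = 0; i < bio_count; i++) {
		unsigned long bio_size = kbuf_size < chunk ? kbuf_size : chunk;
		printf("bio[%d]: %lu bytes\n", i, bio_size);
		kbuf_size -= bio_size;
	}
	return (0);
}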
int
vdev_disk_physio(ldi_handle_t vd_lh, caddr_t kbuf,
size_t size, uint64_t offset, int flags)
{
return __vdev_disk_physio(vd_lh, NULL, kbuf, size, offset, flags);
}
#if 0
/* XXX: Not yet supported */
static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
zio_t *zio = zio_arg;
zio->io_error = error;
zio_interrupt(zio);
}
#endif
static int
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_disk_t *dvd = vd->vdev_tsd;
int flags, error;
if (zio->io_type == ZIO_TYPE_IOCTL) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
return ZIO_PIPELINE_CONTINUE;
}
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
if (zfs_nocacheflush)
break;
if (vd->vdev_nowritecache) {
zio->io_error = ENOTSUP;
break;
}
#if 0
/* XXX: Not yet supported */
vdev_disk_t *dvd = vd->vdev_tsd;
zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
zio->io_dk_callback.dkc_cookie = zio;
error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
(uintptr_t)&zio->io_dk_callback,
FKIOCTL, kcred, NULL);
if (error == 0) {
/*
 * The ioctl will be done asynchronously,
 * and will call vdev_disk_ioctl_done()
 * upon completion.
 */
return ZIO_PIPELINE_STOP;
}
#else
error = ENOTSUP;
#endif
if (error == ENOTSUP || error == ENOTTY) {
/*
* If we get ENOTSUP or ENOTTY, we know that
* no future attempts will ever succeed.
* In this case we set a persistent bit so
* that we don't bother with the ioctl in the
* future.
*/
vd->vdev_nowritecache = B_TRUE;
}
zio->io_error = error;
break;
default:
zio->io_error = ENOTSUP;
}
return ZIO_PIPELINE_CONTINUE;
}
/*
* B_BUSY XXX: Not supported
* B_NOCACHE XXX: Not supported
*/
flags = ((zio->io_type == ZIO_TYPE_READ) ? READ : WRITE);
if (zio->io_flags & ZIO_FLAG_IO_RETRY)
flags |= (1 << BIO_RW_FAILFAST);
error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data,
zio->io_size, zio->io_offset, flags);
if (error) {
zio->io_error = error;
return ZIO_PIPELINE_CONTINUE;
}
return ZIO_PIPELINE_STOP;
}
static void
vdev_disk_io_done(zio_t *zio)
{
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
* asynchronous removal of the device. Otherwise, probe the device and
* make sure it's still accessible.
*/
VERIFY3S(zio->io_error, ==, 0);
#if 0
vdev_disk_t *dvd = vd->vdev_tsd;
int state = DKIO_NONE;
if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
vd->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
}
#endif
}
vdev_ops_t vdev_disk_ops = {
vdev_disk_open,
vdev_disk_close,
vdev_default_asize,
vdev_disk_io_start,
vdev_disk_io_done,
NULL,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
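
For readability, the positional initializer above corresponds to the
following designated-initializer form (a sketch; the field names are taken
from the vdev_ops_t definition in vdev_impl.h of this era and are shown
here only for clarity):

vdev_ops_t vdev_disk_ops = {
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,	/* name of this vdev type */
	.vdev_op_leaf = B_TRUE		/* leaf vdev */
};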
/*
* Given the root disk device devid or pathname, read the label from
* the device, and construct a configuration nvlist.
*/
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
struct block_device *vd_lh;
vdev_label_t *label;
uint64_t s, size;
int i;
/*
* Read the device label and build the nvlist.
* XXX: Not yet supported
*/
#if 0
if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
&minor_name) == 0) {
error = ldi_open_by_devid(tmpdevid, minor_name, spa_mode,
kcred, &vd_lh, zfs_li);
ddi_devid_free(tmpdevid);
ddi_devid_str_free(minor_name);
}
#endif
vd_lh = open_bdev_excl(devpath, MS_RDONLY, NULL);
if (IS_ERR(vd_lh))
return -PTR_ERR(vd_lh);
s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
if (s == 0) {
close_bdev_excl(vd_lh);
return EIO;
}
size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);
for (i = 0; i < VDEV_LABELS; i++) {
uint64_t offset, state, txg = 0;
/* read vdev label */
offset = vdev_label_offset(size, i, 0);
if (vdev_disk_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE +
VDEV_PHYS_SIZE, offset, READ) != 0)
continue;
if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
*config = NULL;
continue;
}
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
&state) != 0 || state >= POOL_STATE_DESTROYED) {
nvlist_free(*config);
*config = NULL;
continue;
}
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0 || txg == 0) {
nvlist_free(*config);
*config = NULL;
continue;
}
break;
}
vmem_free(label, sizeof(vdev_label_t));
close_bdev_excl(vd_lh);
return 0;
}
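
The label loop above probes the four standard vdev label locations: two
labels at the front of the device and two at the end. A self-contained
sketch of the offsets vdev_label_offset() yields (the device size is
illustrative; the 256 KiB label size matches sizeof (vdev_label_t)):

#include <stdio.h>
#include <stdint.h>

#define	VDEV_LABELS	4

/* sketch of vdev_label_offset(): labels 0,1 lead the device,
 * labels 2,3 trail it */
static uint64_t
label_offset(uint64_t psize, int l, uint64_t label_size)
{
	return (l * label_size + (l < VDEV_LABELS / 2 ?
	    0 : psize - VDEV_LABELS * label_size));
}

int main(void)
{
	uint64_t psize = 1ULL << 30;	/* 1 GiB device, label-aligned */
	uint64_t lsize = 256 * 1024;	/* 256 KiB per label */
	int l;

	for (l = 0; l < VDEV_LABELS; l++)
		printf("label %d at byte %llu\n", l,
		    (unsigned long long)label_offset(psize, l, lsize));
	return (0);
}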