Refresh linux-kernel-disk

2008-12-05 11:16:18 -08:00 · 2008-12-05 11:16:18 -08:00 · a5e3d71fd4
parent 9baaa468ac
commit a5e3d71fd4
4 changed files with 690 additions and 18 deletions
--- a/.topdeps
+++ b/.topdeps
@ -1,3 +1 @@
-gcc-branch
+zfs-branch
 fix-branch
 feature-branch
--- a/.topmsg
+++ b/.topmsg
@ -1,19 +1,6 @@
 From: Brian Behlendorf <behlendorf1@llnl.gov>
-Subject: [PATCH] zfs branch
+Subject: [PATCH] linux kernel disk
-Merged result of all changes which are relevant to both Solaris
+Native Linux vdev disk interfaces
 and Linux builds of the ZFS code.  These are changes where there
 is a reasonable chance they will be accepted upstream.
 Additionally, since this is effectively the root of the linux
 ZFS tree the core linux build system is added here.  This
 includes autogen.sh, configure.ac, m4 macros, some scripts/*,
 and makefiles for all the core ZFS components.  Linux-only
 features which require tweaks to the build system should appear
 on the relevant topic branches.  All autotools products which
 result from autogen.sh are commited to the linux-configure-branch.
 This branch also contains the META, ChangeLog, AUTHORS,
 README, and GIT files.
 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
--- a/zfs/lib/libzpool/include/sys/vdev_disk.h
+++ b/zfs/lib/libzpool/include/sys/vdev_disk.h
@ -0,0 +1,29 @@
 #ifndef _SYS_VDEV_DISK_H
 #define _SYS_VDEV_DISK_H
 #ifdef	__cplusplus
 extern "C" {
 #endif
 #ifdef _KERNEL
 #include <sys/vdev.h>
 #include <sys/ddi.h>
 #include <sys/sunldi.h>
 #include <sys/sunddi.h>
 typedef struct vdev_disk {
 	ddi_devid_t	vd_devid;
 	char		*vd_minor;
 	ldi_handle_t	vd_lh;
 } vdev_disk_t;
 extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
 extern nvlist_t *vdev_disk_read_rootlabel(char *devpath);
 #endif
 #ifdef	__cplusplus
 }
 #endif
 #endif	/* _SYS_VDEV_DISK_H */
--- a/zfs/lib/libzpool/vdev_disk.c
+++ b/zfs/lib/libzpool/vdev_disk.c
@ -0,0 +1,658 @@
 /*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
 /*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 /*
 * Virtual device vector for disks.
 */
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* XXX: A slab entry for these would probably be good */
 typedef struct dio_request {
 	struct completion dr_comp;
 	atomic_t dr_ref;
 	vdev_t *dr_vd;
 	zio_t *dr_zio;
 	int dr_rc;
 } dio_request_t;
 static int
 vdev_disk_open_common(vdev_t *vd)
 {
 	vdev_disk_t *dvd;
 	struct block_device *bdev;
 	int mode = 0;
 	// dprintf("vd=%p\n", vd);
 	/* Must have a pathname and it must be absolute. */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return EINVAL;
 	}
 	dvd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
 	if (dvd == NULL)
 		return ENOMEM;
 	/* XXX: Since we do not have devid support like Solaris we
 	 * currently can't be as clever about opening the right device.
 	 * For now we will simple open the device name provided and
 	 * fail when it doesn't exist.  If your devices get reordered
 	 * your going to be screwed, use udev for now to prevent this.
 	 *
 	 * XXX: mode here could be the global spa_mode with a little
 	 * munging of the flags to make then more agreeable to linux.
 	 * However, simply passing a 0 for now gets us W/R behavior.
 	 */
 	bdev = open_bdev_excl(vd->vdev_path, mode, dvd);
 	if (IS_ERR(bdev)) {
 		kmem_free(dvd, sizeof(vdev_disk_t));
 		return -PTR_ERR(bdev);
 	}
 	/* XXX: Long term validate stored dvd->vd_devid with
 	 * a unique identifier read from the disk.
 	 */
 	dvd->vd_lh = bdev;
 	vd->vdev_tsd = dvd;
 	return 0;
 }
 static int
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
 {
 	vdev_disk_t *dvd;
 	struct block_device *bdev;
 	int rc;
 	// dprintf("vd=%p, psize=%p, ashift=%p\n", vd, psize, ashift);
 	dprintf("adding disk %s\n",
 	        vd->vdev_path ? vd->vdev_path : "<none>");
 	rc = vdev_disk_open_common(vd);
 	if (rc)
 		return rc;
 	dvd = vd->vdev_tsd;
 	bdev = dvd->vd_lh;
 	/* Determine the actual size of the device (in bytes) */
 	*psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE;
 	/* Check if this is a whole device and if it is try and
 	 * enable the write cache, it is OK if this fails.
 	 *
 	 * XXX: This behavior should probably be configurable.
 	 */
 	if (bdev->bd_contains == bdev) {
 		int wce = 1;
 		vd->vdev_wholedisk = 1ULL;
 		/* Different methods are needed for an IDE vs SCSI disk.
 		 * Since we're not sure what type of disk this is try IDE,
 		 * if that fails try SCSI. */
 		rc = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce);
 		if (rc)
 			dprintf("Unable to enable IDE WCE and SCSI WCE "
 				"not yet supported: %d\n", rc);
 		/* XXX: To implement the scsi WCE enable we are going to need
 		 * to use the SG_IO ioctl.  But that means fully forming the
 		 * SCSI command as the ioctl arg.  To get this right I need
 		 * to look at the sdparm source which does this.
 		 */
 		rc = 0;
 	} else {
 		/* Must be a partition, that's fine. */
 		vd->vdev_wholedisk = 0;
 	}
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	vd->vdev_nowritecache = B_FALSE;
 	return rc;
 }
 static void
 vdev_disk_close(vdev_t *vd)
 {
 	vdev_disk_t *dvd = vd->vdev_tsd;
 	// dprintf("vd=%p\n", vd);
 	dprintf("removing disk %s\n",
 	        vd->vdev_path ? vd->vdev_path : "<none>");
 	if (dvd == NULL)
 		return;
 	close_bdev_excl(dvd->vd_lh);
 	kmem_free(dvd, sizeof(vdev_disk_t));
 	vd->vdev_tsd = NULL;
 }
 #ifdef HAVE_2ARGS_BIO_END_IO_T
 static void
 vdev_disk_probe_io_completion(struct bio *bio, int rc)
 #else
 static int
 vdev_disk_probe_io_completion(struct bio *bio, unsigned int size, int rc)
 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 {
        dio_request_t *dr = bio->bi_private;
 	zio_t *zio;
 	int error;
 	/* Fatal error but print some useful debugging before asserting */
 	if (dr == NULL) {
 		printk("FATAL: bio->bi_private == NULL\n"
 		       "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
                       "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n",
                       bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
 		       bio->bi_idx, bio->bi_size, bio->bi_end_io,
 		       atomic_read(&bio->bi_cnt));
 		SBUG();
 	}
 	/* Incomplete */
 	if (bio->bi_size) {
 		rc = 1;
 		goto out;
 	}
 	error = rc;
 	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = EIO;
 	zio = dr->dr_zio;
 	if (zio) {
 		zio->io_error = error;
 		zio_interrupt(zio);
 	}
 	dr->dr_rc = error;
        atomic_dec(&dr->dr_ref);
        if (bio_sync(bio)) {
 		complete(&dr->dr_comp);
 	} else {
 		kmem_free(dr, sizeof(dio_request_t));
 	        bio_put(bio);
 	}
 	rc = 0;
 out:
 #ifdef HAVE_2ARGS_BIO_END_IO_T
 	return;
 #else
 	return rc;
 #endif /* HAVE_2ARGS_BIO_END_IO_T */
 }
 static struct bio *
 __bio_map_vmem(struct request_queue *q, void *data,
               unsigned int len, gfp_t gfp_mask)
 {
        unsigned long kaddr = (unsigned long)data;
        unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long start = kaddr >> PAGE_SHIFT;
        const int nr_pages = end - start;
        int offset, i;
 	struct page *page;
        struct bio *bio;
        bio = bio_alloc(gfp_mask, nr_pages);
        if (!bio)
                return ERR_PTR(-ENOMEM);
        offset = offset_in_page(kaddr);
        for (i = 0; i < nr_pages; i++) {
                unsigned int bytes = PAGE_SIZE - offset;
                if (len <= 0)
                        break;
                if (bytes > len)
                        bytes = len;
 		page = vmalloc_to_page(data);
 		ASSERT(page); /* Expecting virtual linear address */
                if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
                        break;
                data += bytes;
                len -= bytes;
                offset = 0;
 		bytes = PAGE_SIZE;
        }
        return bio;
 }
 static struct bio *
 bio_map_vmem(struct request_queue *q, void *data,
             unsigned int len, gfp_t gfp_mask)
 {
 	struct bio *bio;
 	bio = __bio_map_vmem(q, data, len, gfp_mask);
 	if (IS_ERR(bio))
 		return bio;
 	if (bio->bi_size != len) {
 		bio_put(bio);
 		return ERR_PTR(-EINVAL);
 	}
 	return bio;
 }
 static struct bio *
 bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
 {
 	struct bio *bio;
 	/* Cleanly map buffer we are passed in to a bio regardless
 	 * of if the buffer is a virtual or physical address. */
 	if (kmem_virt(data))
 		bio = bio_map_vmem(q, data, len, gfp_mask);
 	else
 		bio = bio_map_kern(q, data, len, gfp_mask);
 	return bio;
 }
 static int
 vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size,
             uint64_t offset, int flags)
 {
 	struct bio *bio;
        dio_request_t *dr;
 	int rw, rc = 0;
 	struct block_device *bdev;
 	struct request_queue *q;
 	// dprintf("vd=%p, zio=%p, kbuf=%p, size=%ld, offset=%lu, flag=%lx\n",
 	//        vd, zio, kbuf, size, offset, flags);
 	ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */
 	if (vd == NULL || vd->vdev_tsd == NULL)
 		return EINVAL;
 	dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP);
 	if (dr == NULL)
 		return ENOMEM;
 	atomic_set(&dr->dr_ref, 0);
 	dr->dr_vd = vd;
 	dr->dr_zio = zio;
 	dr->dr_rc = 0;
 	bdev = ((vdev_disk_t *)(vd->vdev_tsd))->vd_lh;
 	q = bdev->bd_disk->queue;
 	bio = bio_map(q, kbuf, size, GFP_NOIO);
 	if (IS_ERR(bio)) {
 		kmem_free(dr, sizeof(dio_request_t));
 		return -PTR_ERR(bio);
 	}
 	bio->bi_bdev = bdev;
 	bio->bi_sector = offset / SECTOR_SIZE;
 	bio->bi_end_io = vdev_disk_probe_io_completion;
 	bio->bi_private = dr;
 	init_completion(&dr->dr_comp);
 	atomic_inc(&dr->dr_ref);
 	if (flags & (1 << BIO_RW))
 		rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;
 	else
 		rw = READ;
 	if (flags & (1 << BIO_RW_FAILFAST))
 		rw |= 1 << BIO_RW_FAILFAST;
 	ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) |
 	    (1 << BIO_RW_FAILFAST)), ==, 0);
 	submit_bio(rw, bio);
 	/*
 	 * On syncronous blocking requests we wait for the completion
 	 * callback to wake us.  Then we are responsible for freeing
 	 * the dio_request_t as well as dropping the final bio reference.
 	 */
 	if (bio_sync(bio)) {
 		wait_for_completion(&dr->dr_comp);
 		ASSERT(atomic_read(&dr->dr_ref) == 0);
 		rc = dr->dr_rc;
 		kmem_free(dr, sizeof(dio_request_t));
 	        bio_put(bio);
 	}
 	if (zio_injection_enabled && rc == 0)
 		rc = zio_handle_device_injection(vd, EIO);
 	return rc;
 }
 static int
 vdev_disk_probe_io(vdev_t *vd, caddr_t kbuf, size_t size,
 		   uint64_t offset, int flags)
 {
 	int rc;
 	// dprintf("vd=%p, kbuf=%p, size=%ld, offset=%lu, flag=%d\n",
 	//        vd, kbuf, size, offset, flags);
 	flags |= (1 << BIO_RW_SYNC);
 	flags |= (1 << BIO_RW_FAILFAST);
 	/* XXX: offset must be block aligned or we need to take
 	 * care of it */
 	rc = vdev_disk_io(vd, NULL, kbuf, size, offset, flags);
 	return rc;
 }
 /*
 * Determine if the underlying device is accessible by reading and writing
 * to a known location. We must be able to do this during syncing context
 * and thus we cannot set the vdev state directly.
 */
 static int
 vdev_disk_probe(vdev_t *vd)
 {
 	vdev_t *nvd;
 	int label_idx, rc = 0, retries = 0;
 	uint64_t offset;
        char *vl_pad;
 	// dprintf("vd=%p\n", vd);
 	if (vd == NULL)
 		return EINVAL;
 	/* Hijack the current vdev */
 	nvd = vd;
 	/* Pick a random label to rewrite */
 	label_idx = spa_get_random(VDEV_LABELS);
 	ASSERT(label_idx < VDEV_LABELS);
 	offset = vdev_label_offset(vd->vdev_psize, label_idx,
 	                           offsetof(vdev_label_t, vl_pad));
        vl_pad = vmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP);
 	if (vl_pad == NULL)
 		return ENOMEM;
 	/*
 	 * Try to read and write to a special location on the
 	 * label. We use the existing vdev initially and only
 	 * try to create and reopen it if we encounter a failure.
 	 */
 	while ((rc = vdev_disk_probe_io(nvd, vl_pad,
 	        VDEV_SKIP_SIZE, offset, READ)) != 0 && retries == 0) {
 		nvd = kmem_zalloc(sizeof(vdev_t), KM_SLEEP);
 		if (vd->vdev_path)
 			nvd->vdev_path = spa_strdup(vd->vdev_path);
 		if (vd->vdev_physpath)
 			nvd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 		if (vd->vdev_devid)
 			nvd->vdev_devid = spa_strdup(vd->vdev_devid);
 		nvd->vdev_wholedisk = vd->vdev_wholedisk;
 		nvd->vdev_guid = vd->vdev_guid;
 		retries++;
 		rc = vdev_disk_open_common(nvd);
 		if (rc)
 			break;
 	}
 	if (!rc)
 		rc = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
 		                        offset, WRITE);
 	/* Clean up if we allocated a new vdev */
 	if (retries) {
 		vdev_disk_close(nvd);
 		if (nvd->vdev_path)
 			spa_strfree(nvd->vdev_path);
 		if (nvd->vdev_physpath)
 			spa_strfree(nvd->vdev_physpath);
 		if (nvd->vdev_devid)
 			spa_strfree(nvd->vdev_devid);
 		kmem_free(nvd, sizeof(vdev_t));
 	}
 	vmem_free(vl_pad, VDEV_SKIP_SIZE);
 	/* Reset the failing flag */
 	if (!rc)
 		vd->vdev_is_failing = B_FALSE;
 	return rc;
 }
 #if 0
 static void
 vdev_disk_ioctl_done(void *zio_arg, int rc)
 {
 	zio_t *zio = zio_arg;
 	zio->io_error = rc;
 	zio_interrupt(zio);
 }
 #endif
 static int
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 //	vdev_disk_t *dvd = vd->vdev_tsd;
 	int flags, rc;
 	// dprintf("zio=%p\n", zio);
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		zio_vdev_io_bypass(zio);
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = ENXIO;
 			return ZIO_PIPELINE_CONTINUE;
 		}
 		switch (zio->io_cmd) {
 		case DKIOCFLUSHWRITECACHE:
 			if (zfs_nocacheflush)
 				break;
 			if (vd->vdev_nowritecache) {
 				zio->io_error = ENOTSUP;
 				break;
 			}
 #if 0
 			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
 			zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
 			zio->io_dk_callback.dkc_cookie = zio;
 			rc = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
 			    (uintptr_t)&zio->io_dk_callback,
 			    FKIOCTL, kcred, NULL);
 			if (rc == 0) {
 				/*
 				 * The ioctl will be done asychronously,
 				 * and will call vdev_disk_ioctl_done()
 				 * upon completion.
 				 */
 				return ZIO_PIPELINE_STOP;
 			}
 #else
 			rc = ENOTSUP;
 #endif
 			if (rc == ENOTSUP || rc == ENOTTY) {
 				/*
 				 * If we get ENOTSUP or ENOTTY, we know that
 				 * no future attempts will ever succeed.
 				 * In this case we set a persistent bit so
 				 * that we don't bother with the ioctl in the
 				 * future.
 				 */
 				vd->vdev_nowritecache = B_TRUE;
 			}
 			zio->io_error = rc;
 			break;
 		default:
 			zio->io_error = ENOTSUP;
 		}
 		return ZIO_PIPELINE_CONTINUE;
 	}
 	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
 		return ZIO_PIPELINE_STOP;
 	if ((zio = vdev_queue_io(zio)) == NULL)
 		return ZIO_PIPELINE_STOP;
 	if (zio->io_type == ZIO_TYPE_WRITE)
 		rc = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
 	else
 		rc = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
 	rc = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : rc;
 	if (rc) {
 		zio->io_error = rc;
 		zio_interrupt(zio);
 		return ZIO_PIPELINE_STOP;
 	}
 	flags = ((zio->io_type == ZIO_TYPE_READ) ? READ : WRITE);
 	/* flags |= B_BUSY | B_NOCACHE; XXX : Not supported */
 	if (zio->io_flags & ZIO_FLAG_FAILFAST)
 		flags |= (1 << BIO_RW_FAILFAST);
 	vdev_disk_io(vd, zio, zio->io_data, zio->io_size,
 	             zio->io_offset, flags);
 	return ZIO_PIPELINE_STOP;
 }
 static int
 vdev_disk_io_done(zio_t *zio)
 {
 	// dprintf("zio=%p\n", zio);
 	vdev_queue_io_done(zio);
 	if (zio->io_type == ZIO_TYPE_WRITE)
 		vdev_cache_write(zio);
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
 	/*
 	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
 	 * the device has been removed.  If this is the case, then we trigger an
 	 * asynchronous removal of the device. Otherwise, probe the device and
 	 * make sure it's still accessible.
 	 */
 	if (zio->io_error == EIO) {
 		ASSERT(0); /* XXX: Not yet supported */
 #if 0
 		vdev_t *vd = zio->io_vd;
 		vdev_disk_t *dvd = vd->vdev_tsd;
 		int state;
 		state = DKIO_NONE;
 		if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
 		    FKIOCTL, kcred, NULL) == 0 &&
 		    state != DKIO_INSERTED) {
 			vd->vdev_remove_wanted = B_TRUE;
 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 		} else if (vdev_probe(vd) != 0) {
 			ASSERT(vd->vdev_ops->vdev_op_leaf);
 			vd->vdev_is_failing = B_TRUE;
 		}
 #endif
 	}
 	return ZIO_PIPELINE_CONTINUE;
 }
 nvlist_t *
 vdev_disk_read_rootlabel(char *devpath)
 {
 	return NULL;
 }
 vdev_ops_t vdev_disk_ops = {
 	vdev_disk_open,
 	vdev_disk_close,
 	vdev_disk_probe,
 	vdev_default_asize,
 	vdev_disk_io_start,
 	vdev_disk_io_done,
 	NULL,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
 #endif /* defined(_KERNEL) && defined(HAVE_SPL) */