From a5e3d71fd49941b69ac341a5320cd81ff79d77f6 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 5 Dec 2008 11:16:18 -0800 Subject: [PATCH 01/11] Refresh linux-kernel-disk --- .topdeps | 4 +- .topmsg | 17 +- zfs/lib/libzpool/include/sys/vdev_disk.h | 29 + zfs/lib/libzpool/vdev_disk.c | 658 +++++++++++++++++++++++ 4 files changed, 690 insertions(+), 18 deletions(-) create mode 100644 zfs/lib/libzpool/include/sys/vdev_disk.h create mode 100644 zfs/lib/libzpool/vdev_disk.c diff --git a/.topdeps b/.topdeps index 607c231780..7f16cbcdd5 100644 --- a/.topdeps +++ b/.topdeps @@ -1,3 +1 @@ -gcc-branch -fix-branch -feature-branch +zfs-branch diff --git a/.topmsg b/.topmsg index e9722e1075..7e907446b4 100644 --- a/.topmsg +++ b/.topmsg @@ -1,19 +1,6 @@ From: Brian Behlendorf -Subject: [PATCH] zfs branch +Subject: [PATCH] linux kernel disk -Merged result of all changes which are relevant to both Solaris -and Linux builds of the ZFS code. These are changes where there -is a reasonable chance they will be accepted upstream. - -Additionally, since this is effectively the root of the linux -ZFS tree the core linux build system is added here. This -includes autogen.sh, configure.ac, m4 macros, some scripts/*, -and makefiles for all the core ZFS components. Linux-only -features which require tweaks to the build system should appear -on the relevant topic branches. All autotools products which -result from autogen.sh are commited to the linux-configure-branch. - -This branch also contains the META, ChangeLog, AUTHORS, -README, and GIT files. 
+Native Linux vdev disk interfaces Signed-off-by: Brian Behlendorf diff --git a/zfs/lib/libzpool/include/sys/vdev_disk.h b/zfs/lib/libzpool/include/sys/vdev_disk.h new file mode 100644 index 0000000000..38abf2e419 --- /dev/null +++ b/zfs/lib/libzpool/include/sys/vdev_disk.h @@ -0,0 +1,29 @@ +#ifndef _SYS_VDEV_DISK_H +#define _SYS_VDEV_DISK_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +#include +#include +#include +#include + +typedef struct vdev_disk { + ddi_devid_t vd_devid; + char *vd_minor; + ldi_handle_t vd_lh; +} vdev_disk_t; + +extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +extern nvlist_t *vdev_disk_read_rootlabel(char *devpath); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DISK_H */ diff --git a/zfs/lib/libzpool/vdev_disk.c b/zfs/lib/libzpool/vdev_disk.c new file mode 100644 index 0000000000..bc9a22934f --- /dev/null +++ b/zfs/lib/libzpool/vdev_disk.c @@ -0,0 +1,658 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for disks. + */ +#if defined(_KERNEL) && defined(HAVE_SPL) + +/* XXX: A slab entry for these would probably be good */ +typedef struct dio_request { + struct completion dr_comp; + atomic_t dr_ref; + vdev_t *dr_vd; + zio_t *dr_zio; + int dr_rc; +} dio_request_t; + +static int +vdev_disk_open_common(vdev_t *vd) +{ + vdev_disk_t *dvd; + struct block_device *bdev; + int mode = 0; + + // dprintf("vd=%p\n", vd); + + /* Must have a pathname and it must be absolute. */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return EINVAL; + } + + dvd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP); + if (dvd == NULL) + return ENOMEM; + + /* XXX: Since we do not have devid support like Solaris we + * currently can't be as clever about opening the right device. + * For now we will simple open the device name provided and + * fail when it doesn't exist. If your devices get reordered + * your going to be screwed, use udev for now to prevent this. + * + * XXX: mode here could be the global spa_mode with a little + * munging of the flags to make then more agreeable to linux. + * However, simply passing a 0 for now gets us W/R behavior. + */ + bdev = open_bdev_excl(vd->vdev_path, mode, dvd); + if (IS_ERR(bdev)) { + kmem_free(dvd, sizeof(vdev_disk_t)); + return -PTR_ERR(bdev); + } + + /* XXX: Long term validate stored dvd->vd_devid with + * a unique identifier read from the disk. + */ + + dvd->vd_lh = bdev; + vd->vdev_tsd = dvd; + + return 0; +} + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +{ + vdev_disk_t *dvd; + struct block_device *bdev; + int rc; + + // dprintf("vd=%p, psize=%p, ashift=%p\n", vd, psize, ashift); + dprintf("adding disk %s\n", + vd->vdev_path ? 
vd->vdev_path : ""); + + rc = vdev_disk_open_common(vd); + if (rc) + return rc; + + dvd = vd->vdev_tsd; + bdev = dvd->vd_lh; + + /* Determine the actual size of the device (in bytes) */ + *psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; + + /* Check if this is a whole device and if it is try and + * enable the write cache, it is OK if this fails. + * + * XXX: This behavior should probably be configurable. + */ + if (bdev->bd_contains == bdev) { + int wce = 1; + + vd->vdev_wholedisk = 1ULL; + + /* Different methods are needed for an IDE vs SCSI disk. + * Since we're not sure what type of disk this is try IDE, + * if that fails try SCSI. */ + rc = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce); + if (rc) + dprintf("Unable to enable IDE WCE and SCSI WCE " + "not yet supported: %d\n", rc); + + /* XXX: To implement the scsi WCE enable we are going to need + * to use the SG_IO ioctl. But that means fully forming the + * SCSI command as the ioctl arg. To get this right I need + * to look at the sdparm source which does this. + */ + rc = 0; + } else { + /* Must be a partition, that's fine. */ + vd->vdev_wholedisk = 0; + } + + /* Based on the minimum sector size set the block size */ + *ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1; + + /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ + vd->vdev_nowritecache = B_FALSE; + + return rc; +} + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + // dprintf("vd=%p\n", vd); + dprintf("removing disk %s\n", + vd->vdev_path ? 
vd->vdev_path : ""); + + if (dvd == NULL) + return; + + close_bdev_excl(dvd->vd_lh); + + kmem_free(dvd, sizeof(vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +#ifdef HAVE_2ARGS_BIO_END_IO_T +static void +vdev_disk_probe_io_completion(struct bio *bio, int rc) +#else +static int +vdev_disk_probe_io_completion(struct bio *bio, unsigned int size, int rc) +#endif /* HAVE_2ARGS_BIO_END_IO_T */ +{ + dio_request_t *dr = bio->bi_private; + zio_t *zio; + int error; + + + /* Fatal error but print some useful debugging before asserting */ + if (dr == NULL) { + printk("FATAL: bio->bi_private == NULL\n" + "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n" + "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n", + bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, + bio->bi_idx, bio->bi_size, bio->bi_end_io, + atomic_read(&bio->bi_cnt)); + SBUG(); + } + + /* Incomplete */ + if (bio->bi_size) { + rc = 1; + goto out; + } + + error = rc; + if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = EIO; + + zio = dr->dr_zio; + if (zio) { + zio->io_error = error; + zio_interrupt(zio); + } + + dr->dr_rc = error; + atomic_dec(&dr->dr_ref); + + if (bio_sync(bio)) { + complete(&dr->dr_comp); + } else { + kmem_free(dr, sizeof(dio_request_t)); + bio_put(bio); + } + + rc = 0; +out: +#ifdef HAVE_2ARGS_BIO_END_IO_T + return; +#else + return rc; +#endif /* HAVE_2ARGS_BIO_END_IO_T */ +} + +static struct bio * +__bio_map_vmem(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct page *page; + struct bio *bio; + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > 
len) + bytes = len; + + page = vmalloc_to_page(data); + ASSERT(page); /* Expecting virtual linear address */ + + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + break; + + data += bytes; + len -= bytes; + offset = 0; + bytes = PAGE_SIZE; + } + + return bio; +} + +static struct bio * +bio_map_vmem(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + struct bio *bio; + + bio = __bio_map_vmem(q, data, len, gfp_mask); + if (IS_ERR(bio)) + return bio; + + if (bio->bi_size != len) { + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + return bio; +} + +static struct bio * +bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) +{ + struct bio *bio; + + /* Cleanly map buffer we are passed in to a bio regardless + * of if the buffer is a virtual or physical address. */ + if (kmem_virt(data)) + bio = bio_map_vmem(q, data, len, gfp_mask); + else + bio = bio_map_kern(q, data, len, gfp_mask); + + return bio; +} + +static int +vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size, + uint64_t offset, int flags) +{ + struct bio *bio; + dio_request_t *dr; + int rw, rc = 0; + struct block_device *bdev; + struct request_queue *q; + + // dprintf("vd=%p, zio=%p, kbuf=%p, size=%ld, offset=%lu, flag=%lx\n", + // vd, zio, kbuf, size, offset, flags); + + ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */ + + if (vd == NULL || vd->vdev_tsd == NULL) + return EINVAL; + + dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP); + if (dr == NULL) + return ENOMEM; + + atomic_set(&dr->dr_ref, 0); + dr->dr_vd = vd; + dr->dr_zio = zio; + dr->dr_rc = 0; + + bdev = ((vdev_disk_t *)(vd->vdev_tsd))->vd_lh; + q = bdev->bd_disk->queue; + + bio = bio_map(q, kbuf, size, GFP_NOIO); + if (IS_ERR(bio)) { + kmem_free(dr, sizeof(dio_request_t)); + return -PTR_ERR(bio); + } + + bio->bi_bdev = bdev; + bio->bi_sector = offset / SECTOR_SIZE; + bio->bi_end_io = vdev_disk_probe_io_completion; + bio->bi_private = dr; + + 
init_completion(&dr->dr_comp); + atomic_inc(&dr->dr_ref); + + if (flags & (1 << BIO_RW)) + rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; + else + rw = READ; + + if (flags & (1 << BIO_RW_FAILFAST)) + rw |= 1 << BIO_RW_FAILFAST; + + ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) | + (1 << BIO_RW_FAILFAST)), ==, 0); + + submit_bio(rw, bio); + + /* + * On syncronous blocking requests we wait for the completion + * callback to wake us. Then we are responsible for freeing + * the dio_request_t as well as dropping the final bio reference. + */ + if (bio_sync(bio)) { + wait_for_completion(&dr->dr_comp); + ASSERT(atomic_read(&dr->dr_ref) == 0); + rc = dr->dr_rc; + kmem_free(dr, sizeof(dio_request_t)); + bio_put(bio); + } + + if (zio_injection_enabled && rc == 0) + rc = zio_handle_device_injection(vd, EIO); + + return rc; +} + +static int +vdev_disk_probe_io(vdev_t *vd, caddr_t kbuf, size_t size, + uint64_t offset, int flags) +{ + int rc; + + // dprintf("vd=%p, kbuf=%p, size=%ld, offset=%lu, flag=%d\n", + // vd, kbuf, size, offset, flags); + + flags |= (1 << BIO_RW_SYNC); + flags |= (1 << BIO_RW_FAILFAST); + + /* XXX: offset must be block aligned or we need to take + * care of it */ + + rc = vdev_disk_io(vd, NULL, kbuf, size, offset, flags); + + return rc; +} + +/* + * Determine if the underlying device is accessible by reading and writing + * to a known location. We must be able to do this during syncing context + * and thus we cannot set the vdev state directly. 
+ */ +static int +vdev_disk_probe(vdev_t *vd) +{ + vdev_t *nvd; + int label_idx, rc = 0, retries = 0; + uint64_t offset; + char *vl_pad; + + // dprintf("vd=%p\n", vd); + + if (vd == NULL) + return EINVAL; + + /* Hijack the current vdev */ + nvd = vd; + + /* Pick a random label to rewrite */ + label_idx = spa_get_random(VDEV_LABELS); + ASSERT(label_idx < VDEV_LABELS); + + offset = vdev_label_offset(vd->vdev_psize, label_idx, + offsetof(vdev_label_t, vl_pad)); + + vl_pad = vmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP); + if (vl_pad == NULL) + return ENOMEM; + + /* + * Try to read and write to a special location on the + * label. We use the existing vdev initially and only + * try to create and reopen it if we encounter a failure. + */ + while ((rc = vdev_disk_probe_io(nvd, vl_pad, + VDEV_SKIP_SIZE, offset, READ)) != 0 && retries == 0) { + + nvd = kmem_zalloc(sizeof(vdev_t), KM_SLEEP); + + if (vd->vdev_path) + nvd->vdev_path = spa_strdup(vd->vdev_path); + if (vd->vdev_physpath) + nvd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (vd->vdev_devid) + nvd->vdev_devid = spa_strdup(vd->vdev_devid); + + nvd->vdev_wholedisk = vd->vdev_wholedisk; + nvd->vdev_guid = vd->vdev_guid; + retries++; + + rc = vdev_disk_open_common(nvd); + if (rc) + break; + } + + if (!rc) + rc = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, + offset, WRITE); + + /* Clean up if we allocated a new vdev */ + if (retries) { + vdev_disk_close(nvd); + if (nvd->vdev_path) + spa_strfree(nvd->vdev_path); + if (nvd->vdev_physpath) + spa_strfree(nvd->vdev_physpath); + if (nvd->vdev_devid) + spa_strfree(nvd->vdev_devid); + kmem_free(nvd, sizeof(vdev_t)); + } + + vmem_free(vl_pad, VDEV_SKIP_SIZE); + + /* Reset the failing flag */ + if (!rc) + vd->vdev_is_failing = B_FALSE; + + return rc; +} + +#if 0 +static void +vdev_disk_ioctl_done(void *zio_arg, int rc) +{ + zio_t *zio = zio_arg; + + zio->io_error = rc; + + zio_interrupt(zio); +} +#endif + +static int +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *vd = 
zio->io_vd; +// vdev_disk_t *dvd = vd->vdev_tsd; + int flags, rc; + + // dprintf("zio=%p\n", zio); + + if (zio->io_type == ZIO_TYPE_IOCTL) { + zio_vdev_io_bypass(zio); + + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = ENXIO; + return ZIO_PIPELINE_CONTINUE; + } + + switch (zio->io_cmd) { + + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (vd->vdev_nowritecache) { + zio->io_error = ENOTSUP; + break; + } + +#if 0 + zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; + zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE; + zio->io_dk_callback.dkc_cookie = zio; + + rc = ldi_ioctl(dvd->vd_lh, zio->io_cmd, + (uintptr_t)&zio->io_dk_callback, + FKIOCTL, kcred, NULL); + + if (rc == 0) { + /* + * The ioctl will be done asychronously, + * and will call vdev_disk_ioctl_done() + * upon completion. + */ + return ZIO_PIPELINE_STOP; + } +#else + rc = ENOTSUP; +#endif + + if (rc == ENOTSUP || rc == ENOTTY) { + /* + * If we get ENOTSUP or ENOTTY, we know that + * no future attempts will ever succeed. + * In this case we set a persistent bit so + * that we don't bother with the ioctl in the + * future. + */ + vd->vdev_nowritecache = B_TRUE; + } + zio->io_error = rc; + + break; + + default: + zio->io_error = ENOTSUP; + } + + return ZIO_PIPELINE_CONTINUE; + } + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) + return ZIO_PIPELINE_STOP; + + if ((zio = vdev_queue_io(zio)) == NULL) + return ZIO_PIPELINE_STOP; + + if (zio->io_type == ZIO_TYPE_WRITE) + rc = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + else + rc = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO; + + rc = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : rc; + + if (rc) { + zio->io_error = rc; + zio_interrupt(zio); + return ZIO_PIPELINE_STOP; + } + + flags = ((zio->io_type == ZIO_TYPE_READ) ? 
READ : WRITE); + /* flags |= B_BUSY | B_NOCACHE; XXX : Not supported */ + + if (zio->io_flags & ZIO_FLAG_FAILFAST) + flags |= (1 << BIO_RW_FAILFAST); + + + vdev_disk_io(vd, zio, zio->io_data, zio->io_size, + zio->io_offset, flags); + + return ZIO_PIPELINE_STOP; +} + +static int +vdev_disk_io_done(zio_t *zio) +{ + // dprintf("zio=%p\n", zio); + + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO) { + ASSERT(0); /* XXX: Not yet supported */ +#if 0 + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + int state; + + state = DKIO_NONE; + if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && + state != DKIO_INSERTED) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (vdev_probe(vd) != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + vd->vdev_is_failing = B_TRUE; + } +#endif + } + + return ZIO_PIPELINE_CONTINUE; +} + +nvlist_t * +vdev_disk_read_rootlabel(char *devpath) +{ + return NULL; +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_disk_probe, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif /* defined(_KERNEL) && defined(HAVE_SPL) */ From a2d1d32c17917321346dad21a5f4aed9c4e6b245 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 11 Dec 2008 15:26:36 -0800 Subject: [PATCH 02/11] move vdev_disk to it's new home --- {zfs/lib/libzpool => module/zfs}/include/sys/vdev_disk.h | 0 
{zfs/lib/libzpool => module/zfs}/vdev_disk.c | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {zfs/lib/libzpool => module/zfs}/include/sys/vdev_disk.h (100%) rename {zfs/lib/libzpool => module/zfs}/vdev_disk.c (100%) diff --git a/zfs/lib/libzpool/include/sys/vdev_disk.h b/module/zfs/include/sys/vdev_disk.h similarity index 100% rename from zfs/lib/libzpool/include/sys/vdev_disk.h rename to module/zfs/include/sys/vdev_disk.h diff --git a/zfs/lib/libzpool/vdev_disk.c b/module/zfs/vdev_disk.c similarity index 100% rename from zfs/lib/libzpool/vdev_disk.c rename to module/zfs/vdev_disk.c From add6c31eed922a5e90776849a606eadd6d0c88f5 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 19 Dec 2008 13:34:38 -0800 Subject: [PATCH 03/11] Update vdev_disk for in-kernel use --- module/zfs/vdev_disk.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index bc9a22934f..4c241ec212 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -23,6 +23,8 @@ * Use is subject to license terms. */ +#if defined(_KERNEL) + #include #include #include @@ -34,9 +36,8 @@ /* * Virtual device vector for disks. */ -#if defined(_KERNEL) && defined(HAVE_SPL) -/* XXX: A slab entry for these would probably be good */ +/* FIXME: A slab entry for these would probably be good */ typedef struct dio_request { struct completion dr_comp; atomic_t dr_ref; @@ -64,13 +65,13 @@ vdev_disk_open_common(vdev_t *vd) if (dvd == NULL) return ENOMEM; - /* XXX: Since we do not have devid support like Solaris we + /* FIXME: Since we do not have devid support like Solaris we * currently can't be as clever about opening the right device. * For now we will simple open the device name provided and * fail when it doesn't exist. If your devices get reordered * your going to be screwed, use udev for now to prevent this. 
* - * XXX: mode here could be the global spa_mode with a little + * FIXME: mode here could be the global spa_mode with a little * munging of the flags to make then more agreeable to linux. * However, simply passing a 0 for now gets us W/R behavior. */ @@ -80,7 +81,7 @@ vdev_disk_open_common(vdev_t *vd) return -PTR_ERR(bdev); } - /* XXX: Long term validate stored dvd->vd_devid with + /* FIXME: Long term validate stored dvd->vd_devid with * a unique identifier read from the disk. */ @@ -114,7 +115,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) /* Check if this is a whole device and if it is try and * enable the write cache, it is OK if this fails. * - * XXX: This behavior should probably be configurable. + * FIXME: This behavior should probably be configurable. */ if (bdev->bd_contains == bdev) { int wce = 1; @@ -129,7 +130,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) dprintf("Unable to enable IDE WCE and SCSI WCE " "not yet supported: %d\n", rc); - /* XXX: To implement the scsi WCE enable we are going to need + /* FIXME: To implement the scsi WCE enable we are going to need * to use the SG_IO ioctl. But that means fully forming the * SCSI command as the ioctl arg. To get this right I need * to look at the sdparm source which does this. @@ -388,7 +389,7 @@ vdev_disk_probe_io(vdev_t *vd, caddr_t kbuf, size_t size, flags |= (1 << BIO_RW_SYNC); flags |= (1 << BIO_RW_FAILFAST); - /* XXX: offset must be block aligned or we need to take + /* FIXME: offset must be block aligned or we need to take * care of it */ rc = vdev_disk_io(vd, NULL, kbuf, size, offset, flags); @@ -583,7 +584,7 @@ vdev_disk_io_start(zio_t *zio) } flags = ((zio->io_type == ZIO_TYPE_READ) ? 
READ : WRITE); - /* flags |= B_BUSY | B_NOCACHE; XXX : Not supported */ + /* flags |= B_BUSY | B_NOCACHE; FIXME : Not supported */ if (zio->io_flags & ZIO_FLAG_FAILFAST) flags |= (1 << BIO_RW_FAILFAST); @@ -615,7 +616,7 @@ vdev_disk_io_done(zio_t *zio) * make sure it's still accessible. */ if (zio->io_error == EIO) { - ASSERT(0); /* XXX: Not yet supported */ + ASSERT(0); /* FIXME: Not yet supported */ #if 0 vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; @@ -655,4 +656,6 @@ vdev_ops_t vdev_disk_ops = { B_TRUE /* leaf vdev */ }; -#endif /* defined(_KERNEL) && defined(HAVE_SPL) */ +#else +#error "vdev_disk.c is only required for an in-kernel builds" +#endif /* _KERNEL */ From b17c1f4123646c2e628b1aabb1c8c4f748e44174 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 22 Dec 2008 13:32:19 -0800 Subject: [PATCH 04/11] Don't make this fatal for userspace --- module/zfs/vdev_disk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 4c241ec212..19cef7339f 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -656,6 +656,4 @@ vdev_ops_t vdev_disk_ops = { B_TRUE /* leaf vdev */ }; -#else -#error "vdev_disk.c is only required for an in-kernel builds" #endif /* _KERNEL */ From 3a1f0dcde1357685c0fd5208969de23027310265 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 5 Jan 2009 16:53:23 -0800 Subject: [PATCH 05/11] Refresh prototype due to upstream changes --- module/zfs/include/sys/vdev_disk.h | 1 - module/zfs/vdev_disk.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/include/sys/vdev_disk.h b/module/zfs/include/sys/vdev_disk.h index 38abf2e419..3d5ec0eb89 100644 --- a/module/zfs/include/sys/vdev_disk.h +++ b/module/zfs/include/sys/vdev_disk.h @@ -18,7 +18,6 @@ typedef struct vdev_disk { } vdev_disk_t; extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); -extern nvlist_t *vdev_disk_read_rootlabel(char *devpath); #endif diff --git 
a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 19cef7339f..3e451525fa 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -639,7 +639,7 @@ vdev_disk_io_done(zio_t *zio) } nvlist_t * -vdev_disk_read_rootlabel(char *devpath) +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) { return NULL; } From 4522c570e8eda9e827aea63b224589d49cda4472 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 6 Jan 2009 08:39:45 -0800 Subject: [PATCH 06/11] Add missing prototype --- module/zfs/include/sys/vdev_disk.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/include/sys/vdev_disk.h b/module/zfs/include/sys/vdev_disk.h index 3d5ec0eb89..520db0103d 100644 --- a/module/zfs/include/sys/vdev_disk.h +++ b/module/zfs/include/sys/vdev_disk.h @@ -18,8 +18,8 @@ typedef struct vdev_disk { } vdev_disk_t; extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); - -#endif +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); +#endif /* _KERNEL */ #ifdef __cplusplus } From b38c50ac45d12c3e5535ae5ed9273a1f8ec7441f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 8 Jan 2009 10:25:23 -0800 Subject: [PATCH 07/11] Update vdev_disk.c implementation to be compatible with b103 API --- module/zfs/Makefile.in | 1 + module/zfs/vdev_disk.c | 376 ++++++++++++++++------------------------- 2 files changed, 146 insertions(+), 231 deletions(-) diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 3f090a2a68..b5e7a0b6ab 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -49,6 +49,7 @@ ${MODULE}-objs += uberblock.o ${MODULE}-objs += unique.o ${MODULE}-objs += vdev.o ${MODULE}-objs += vdev_cache.o +${MODULE}-objs += vdev_disk.o ${MODULE}-objs += vdev_file.o ${MODULE}-objs += vdev_label.o ${MODULE}-objs += vdev_mirror.o diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 3e451525fa..6ffa4b511b 100644 --- a/module/zfs/vdev_disk.c +++ 
b/module/zfs/vdev_disk.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#if defined(_KERNEL) - #include #include #include @@ -36,14 +34,11 @@ /* * Virtual device vector for disks. */ - -/* FIXME: A slab entry for these would probably be good */ typedef struct dio_request { struct completion dr_comp; atomic_t dr_ref; - vdev_t *dr_vd; zio_t *dr_zio; - int dr_rc; + int dr_error; } dio_request_t; static int @@ -53,8 +48,6 @@ vdev_disk_open_common(vdev_t *vd) struct block_device *bdev; int mode = 0; - // dprintf("vd=%p\n", vd); - /* Must have a pathname and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; @@ -65,13 +58,13 @@ vdev_disk_open_common(vdev_t *vd) if (dvd == NULL) return ENOMEM; - /* FIXME: Since we do not have devid support like Solaris we + /* XXX: Since we do not have devid support like Solaris we * currently can't be as clever about opening the right device. - * For now we will simple open the device name provided and + * For now we will simply open the device name provided and * fail when it doesn't exist. If your devices get reordered * your going to be screwed, use udev for now to prevent this. * - * FIXME: mode here could be the global spa_mode with a little + * XXX: mode here could be the global spa_mode with a little * munging of the flags to make then more agreeable to linux. * However, simply passing a 0 for now gets us W/R behavior. */ @@ -81,8 +74,8 @@ vdev_disk_open_common(vdev_t *vd) return -PTR_ERR(bdev); } - /* FIXME: Long term validate stored dvd->vd_devid with - * a unique identifier read from the disk. + /* XXX: Long term validate stored dvd->vd_devid with a unique + * identifier read from the disk. 
*/ dvd->vd_lh = bdev; @@ -96,46 +89,44 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { vdev_disk_t *dvd; struct block_device *bdev; - int rc; + int error; - // dprintf("vd=%p, psize=%p, ashift=%p\n", vd, psize, ashift); - dprintf("adding disk %s\n", - vd->vdev_path ? vd->vdev_path : ""); - - rc = vdev_disk_open_common(vd); - if (rc) - return rc; + error = vdev_disk_open_common(vd); + if (error) + return error; dvd = vd->vdev_tsd; bdev = dvd->vd_lh; - /* Determine the actual size of the device (in bytes) */ + /* Determine the actual size of the device (in bytes) + * + * XXX: SECTOR_SIZE is defined to 512b which may not be true for + * your device, we must use the actual hardware sector size. + */ *psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; /* Check if this is a whole device and if it is try and - * enable the write cache, it is OK if this fails. - * - * FIXME: This behavior should probably be configurable. - */ + * enable the write cache, it is OK if this fails. */ if (bdev->bd_contains == bdev) { int wce = 1; vd->vdev_wholedisk = 1ULL; - /* Different methods are needed for an IDE vs SCSI disk. + /* XXX: Different methods are needed for an IDE vs SCSI disk. * Since we're not sure what type of disk this is try IDE, - * if that fails try SCSI. */ - rc = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce); - if (rc) + * if that fails try SCSI. + */ + error = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce); + if (error) dprintf("Unable to enable IDE WCE and SCSI WCE " - "not yet supported: %d\n", rc); + "not yet supported: %d\n", error); - /* FIXME: To implement the scsi WCE enable we are going to need + /* XXX: To implement the scsi WCE enable we are going to need * to use the SG_IO ioctl. But that means fully forming the * SCSI command as the ioctl arg. To get this right I need * to look at the sdparm source which does this. */ - rc = 0; + error = 0; } else { /* Must be a partition, that's fine. 
*/ vd->vdev_wholedisk = 0; @@ -147,7 +138,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ vd->vdev_nowritecache = B_FALSE; - return rc; + return error; } static void @@ -155,14 +146,11 @@ vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; - // dprintf("vd=%p\n", vd); - dprintf("removing disk %s\n", - vd->vdev_path ? vd->vdev_path : ""); - if (dvd == NULL) return; - close_bdev_excl(dvd->vd_lh); + if (dvd->vd_lh != NULL) + close_bdev_excl(dvd->vd_lh); kmem_free(dvd, sizeof(vdev_disk_t)); vd->vdev_tsd = NULL; @@ -170,17 +158,16 @@ vdev_disk_close(vdev_t *vd) #ifdef HAVE_2ARGS_BIO_END_IO_T static void -vdev_disk_probe_io_completion(struct bio *bio, int rc) +vdev_disk_physio_completion(struct bio *bio, int rc) #else static int -vdev_disk_probe_io_completion(struct bio *bio, unsigned int size, int rc) +vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) #endif /* HAVE_2ARGS_BIO_END_IO_T */ { dio_request_t *dr = bio->bi_private; zio_t *zio; int error; - /* Fatal error but print some useful debugging before asserting */ if (dr == NULL) { printk("FATAL: bio->bi_private == NULL\n" @@ -208,7 +195,7 @@ vdev_disk_probe_io_completion(struct bio *bio, unsigned int size, int rc) zio_interrupt(zio); } - dr->dr_rc = error; + dr->dr_error = error; atomic_dec(&dr->dr_ref); if (bio_sync(bio)) { @@ -302,34 +289,24 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) } static int -vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size, - uint64_t offset, int flags) +__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf, + size_t size, uint64_t offset, int flags) { struct bio *bio; dio_request_t *dr; - int rw, rc = 0; - struct block_device *bdev; + int rw, error = 0; struct request_queue *q; - // dprintf("vd=%p, zio=%p, kbuf=%p, size=%ld, offset=%lu, flag=%lx\n", - // vd, zio, kbuf, size, offset, flags); - 
ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */ - if (vd == NULL || vd->vdev_tsd == NULL) - return EINVAL; - dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP); if (dr == NULL) return ENOMEM; atomic_set(&dr->dr_ref, 0); - dr->dr_vd = vd; dr->dr_zio = zio; - dr->dr_rc = 0; - - bdev = ((vdev_disk_t *)(vd->vdev_tsd))->vd_lh; - q = bdev->bd_disk->queue; + dr->dr_error = 0; + q = vd_lh->bd_disk->queue; bio = bio_map(q, kbuf, size, GFP_NOIO); if (IS_ERR(bio)) { @@ -337,9 +314,9 @@ vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size, return -PTR_ERR(bio); } - bio->bi_bdev = bdev; + bio->bi_bdev = vd_lh; bio->bi_sector = offset / SECTOR_SIZE; - bio->bi_end_io = vdev_disk_probe_io_completion; + bio->bi_end_io = vdev_disk_physio_completion; bio->bi_private = dr; init_completion(&dr->dr_comp); @@ -354,7 +331,7 @@ vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size, rw |= 1 << BIO_RW_FAILFAST; ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) | - (1 << BIO_RW_FAILFAST)), ==, 0); + (1 << BIO_RW_FAILFAST)), ==, 0); submit_bio(rw, bio); @@ -366,127 +343,29 @@ vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size, if (bio_sync(bio)) { wait_for_completion(&dr->dr_comp); ASSERT(atomic_read(&dr->dr_ref) == 0); - rc = dr->dr_rc; + error = dr->dr_error; kmem_free(dr, sizeof(dio_request_t)); bio_put(bio); } - if (zio_injection_enabled && rc == 0) - rc = zio_handle_device_injection(vd, EIO); - - return rc; + return error; } -static int -vdev_disk_probe_io(vdev_t *vd, caddr_t kbuf, size_t size, - uint64_t offset, int flags) +int +vdev_disk_physio(ldi_handle_t vd_lh, caddr_t kbuf, + size_t size, uint64_t offset, int flags) { - int rc; - - // dprintf("vd=%p, kbuf=%p, size=%ld, offset=%lu, flag=%d\n", - // vd, kbuf, size, offset, flags); - - flags |= (1 << BIO_RW_SYNC); - flags |= (1 << BIO_RW_FAILFAST); - - /* FIXME: offset must be block aligned or we need to take - * care of it */ - - rc = vdev_disk_io(vd, NULL, kbuf, size, offset, flags); - - 
return rc; -} - -/* - * Determine if the underlying device is accessible by reading and writing - * to a known location. We must be able to do this during syncing context - * and thus we cannot set the vdev state directly. - */ -static int -vdev_disk_probe(vdev_t *vd) -{ - vdev_t *nvd; - int label_idx, rc = 0, retries = 0; - uint64_t offset; - char *vl_pad; - - // dprintf("vd=%p\n", vd); - - if (vd == NULL) - return EINVAL; - - /* Hijack the current vdev */ - nvd = vd; - - /* Pick a random label to rewrite */ - label_idx = spa_get_random(VDEV_LABELS); - ASSERT(label_idx < VDEV_LABELS); - - offset = vdev_label_offset(vd->vdev_psize, label_idx, - offsetof(vdev_label_t, vl_pad)); - - vl_pad = vmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP); - if (vl_pad == NULL) - return ENOMEM; - - /* - * Try to read and write to a special location on the - * label. We use the existing vdev initially and only - * try to create and reopen it if we encounter a failure. - */ - while ((rc = vdev_disk_probe_io(nvd, vl_pad, - VDEV_SKIP_SIZE, offset, READ)) != 0 && retries == 0) { - - nvd = kmem_zalloc(sizeof(vdev_t), KM_SLEEP); - - if (vd->vdev_path) - nvd->vdev_path = spa_strdup(vd->vdev_path); - if (vd->vdev_physpath) - nvd->vdev_physpath = spa_strdup(vd->vdev_physpath); - if (vd->vdev_devid) - nvd->vdev_devid = spa_strdup(vd->vdev_devid); - - nvd->vdev_wholedisk = vd->vdev_wholedisk; - nvd->vdev_guid = vd->vdev_guid; - retries++; - - rc = vdev_disk_open_common(nvd); - if (rc) - break; - } - - if (!rc) - rc = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE, - offset, WRITE); - - /* Clean up if we allocated a new vdev */ - if (retries) { - vdev_disk_close(nvd); - if (nvd->vdev_path) - spa_strfree(nvd->vdev_path); - if (nvd->vdev_physpath) - spa_strfree(nvd->vdev_physpath); - if (nvd->vdev_devid) - spa_strfree(nvd->vdev_devid); - kmem_free(nvd, sizeof(vdev_t)); - } - - vmem_free(vl_pad, VDEV_SKIP_SIZE); - - /* Reset the failing flag */ - if (!rc) - vd->vdev_is_failing = B_FALSE; - - return rc; + 
return __vdev_disk_physio(vd_lh, NULL, kbuf, size, offset, flags); } #if 0 +/* XXX: Not yet supported */ static void -vdev_disk_ioctl_done(void *zio_arg, int rc) +vdev_disk_ioctl_done(void *zio_arg, int error) { zio_t *zio = zio_arg; - zio->io_error = rc; + zio->io_error = error; zio_interrupt(zio); } @@ -496,10 +375,8 @@ static int vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; -// vdev_disk_t *dvd = vd->vdev_tsd; - int flags, rc; - - // dprintf("zio=%p\n", zio); + vdev_disk_t *dvd = vd->vdev_tsd; + int flags, error; if (zio->io_type == ZIO_TYPE_IOCTL) { zio_vdev_io_bypass(zio); @@ -523,15 +400,18 @@ vdev_disk_io_start(zio_t *zio) } #if 0 + /* XXX: Not yet supported */ + vdev_disk_t *dvd = vd->vdev_tsd; + zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE; zio->io_dk_callback.dkc_cookie = zio; - rc = ldi_ioctl(dvd->vd_lh, zio->io_cmd, + error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, (uintptr_t)&zio->io_dk_callback, FKIOCTL, kcred, NULL); - if (rc == 0) { + if (error == 0) { /* * The ioctl will be done asychronously, * and will call vdev_disk_ioctl_done() @@ -540,10 +420,10 @@ vdev_disk_io_start(zio_t *zio) return ZIO_PIPELINE_STOP; } #else - rc = ENOTSUP; + error = ENOTSUP; #endif - if (rc == ENOTSUP || rc == ENOTTY) { + if (error == ENOTSUP || error == ENOTTY) { /* * If we get ENOTSUP or ENOTTY, we know that * no future attempts will ever succeed. @@ -553,7 +433,7 @@ vdev_disk_io_start(zio_t *zio) */ vd->vdev_nowritecache = B_TRUE; } - zio->io_error = rc; + zio->io_error = error; break; @@ -564,51 +444,24 @@ vdev_disk_io_start(zio_t *zio) return ZIO_PIPELINE_CONTINUE; } - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return ZIO_PIPELINE_STOP; - - if ((zio = vdev_queue_io(zio)) == NULL) - return ZIO_PIPELINE_STOP; - - if (zio->io_type == ZIO_TYPE_WRITE) - rc = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO; - else - rc = vdev_readable(vd) ? 
vdev_error_inject(vd, zio) : ENXIO; - - rc = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : rc; - - if (rc) { - zio->io_error = rc; - zio_interrupt(zio); - return ZIO_PIPELINE_STOP; - } - + /* + * B_BUSY XXX: Not supported + * B_NOCACHE XXX: Not supported + */ flags = ((zio->io_type == ZIO_TYPE_READ) ? READ : WRITE); - /* flags |= B_BUSY | B_NOCACHE; FIXME : Not supported */ - if (zio->io_flags & ZIO_FLAG_FAILFAST) + if (zio->io_flags & ZIO_FLAG_IO_RETRY) flags |= (1 << BIO_RW_FAILFAST); - - vdev_disk_io(vd, zio, zio->io_data, zio->io_size, - zio->io_offset, flags); + __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, + zio->io_size, zio->io_offset, flags); return ZIO_PIPELINE_STOP; } -static int +static void vdev_disk_io_done(zio_t *zio) { - // dprintf("zio=%p\n", zio); - - vdev_queue_io_done(zio); - - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - - if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); - /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an @@ -616,38 +469,24 @@ vdev_disk_io_done(zio_t *zio) * make sure it's still accessible. 
*/ if (zio->io_error == EIO) { - ASSERT(0); /* FIXME: Not yet supported */ + ASSERT(0); /* XXX: Not yet supported */ #if 0 vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; - int state; + int state = DKIO_NONE; - state = DKIO_NONE; - if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, - FKIOCTL, kcred, NULL) == 0 && - state != DKIO_INSERTED) { + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); - } else if (vdev_probe(vd) != 0) { - ASSERT(vd->vdev_ops->vdev_op_leaf); - vd->vdev_is_failing = B_TRUE; } #endif } - - return ZIO_PIPELINE_CONTINUE; -} - -nvlist_t * -vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) -{ - return NULL; } vdev_ops_t vdev_disk_ops = { vdev_disk_open, vdev_disk_close, - vdev_disk_probe, vdev_default_asize, vdev_disk_io_start, vdev_disk_io_done, @@ -656,4 +495,79 @@ vdev_ops_t vdev_disk_ops = { B_TRUE /* leaf vdev */ }; -#endif /* _KERNEL */ +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + struct block_device *vd_lh; + vdev_label_t *label; + uint64_t s, size; + int i; + + /* + * Read the device label and build the nvlist. 
+ * XXX: Not yet supported + */ +#if 0 + if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, + &minor_name) == 0) { + error = ldi_open_by_devid(tmpdevid, minor_name, spa_mode, + kcred, &vd_lh, zfs_li); + ddi_devid_free(tmpdevid); + ddi_devid_str_free(minor_name); + } +#endif + + vd_lh = open_bdev_excl(devpath, MS_RDONLY, NULL); + if (IS_ERR(vd_lh)) + return -PTR_ERR(vd_lh); + + if ((s = i_size_read(vd_lh->bd_inode)) == 0) { + close_bdev_excl(vd_lh); + return EIO; + } + + size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t); + label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP); + + for (i = 0; i < VDEV_LABELS; i++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, i, 0); + if (vdev_disk_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + + VDEV_PHYS_SIZE, offset, READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + vmem_free(label, sizeof(vdev_label_t)); + close_bdev_excl(vd_lh); + + return 0; +} From ab8f4ca43f70a1c575cb38518b0b616086edff89 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 21 Jan 2009 10:59:06 -0800 Subject: [PATCH 08/11] Convert ASSERT() to VERIFY() for better coverage --- module/zfs/vdev_disk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 6ffa4b511b..94686d2cfd 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -468,8 +468,7 @@ vdev_disk_io_done(zio_t *zio) * asynchronous removal of the device. 
Otherwise, probe the device and * make sure it's still accessible. */ - if (zio->io_error == EIO) { - ASSERT(0); /* XXX: Not yet supported */ + VERIFY3S(zio->io_error, ==, 0); #if 0 vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; @@ -481,7 +480,6 @@ vdev_disk_io_done(zio_t *zio) spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } #endif - } } vdev_ops_t vdev_disk_ops = { From 3657ada547e51772282ea79f2496ca91f5937d2d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 26 Jan 2009 16:46:50 -0800 Subject: [PATCH 09/11] Update linux vdev_disk interfaces to issue multiple bios if needed due to the maximum request size being smaller than the request size passed down from the spa --- module/zfs/vdev_disk.c | 330 ++++++++++++++++++++--------------------- 1 file changed, 162 insertions(+), 168 deletions(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 94686d2cfd..1c4c37a62a 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -35,18 +35,21 @@ * Virtual device vector for disks. */ typedef struct dio_request { - struct completion dr_comp; - atomic_t dr_ref; - zio_t *dr_zio; - int dr_error; + struct completion dr_comp; /* Completion for sync IO */ + spinlock_t dr_lock; /* Completion lock */ + zio_t *dr_zio; /* Parent ZIO */ + int dr_ref; /* Outstanding bio count */ + int dr_rw; /* Read/Write */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[0]; /* Attached bio's */ } dio_request_t; static int -vdev_disk_open_common(vdev_t *vd) +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { + struct block_device *vd_lh; vdev_disk_t *dvd; - struct block_device *bdev; - int mode = 0; /* Must have a pathname and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { @@ -68,77 +71,37 @@ vdev_disk_open_common(vdev_t *vd) * munging of the flags to make then more agreeable to linux. * However, simply passing a 0 for now gets us W/R behavior. 
*/ - bdev = open_bdev_excl(vd->vdev_path, mode, dvd); - if (IS_ERR(bdev)) { + vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd); + if (IS_ERR(vd_lh)) { kmem_free(dvd, sizeof(vdev_disk_t)); - return -PTR_ERR(bdev); + return -PTR_ERR(vd_lh); } /* XXX: Long term validate stored dvd->vd_devid with a unique - * identifier read from the disk. + * identifier read from the disk, likely EFI support. */ - dvd->vd_lh = bdev; vd->vdev_tsd = dvd; + dvd->vd_lh = vd_lh; - return 0; -} + /* Check if this is a whole device. When vd_lh->bd_contains == + * vd_lh we have a whole device and not simply a partition. */ + vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh); -static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) -{ - vdev_disk_t *dvd; - struct block_device *bdev; - int error; - - error = vdev_disk_open_common(vd); - if (error) - return error; - - dvd = vd->vdev_tsd; - bdev = dvd->vd_lh; + /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ + vd->vdev_nowritecache = B_FALSE; /* Determine the actual size of the device (in bytes) * * XXX: SECTOR_SIZE is defined to 512b which may not be true for * your device, we must use the actual hardware sector size. */ - *psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; - - /* Check if this is a whole device and if it is try and - * enable the write cache, it is OK if this fails. */ - if (bdev->bd_contains == bdev) { - int wce = 1; - - vd->vdev_wholedisk = 1ULL; - - /* XXX: Different methods are needed for an IDE vs SCSI disk. - * Since we're not sure what type of disk this is try IDE, - * if that fails try SCSI. - */ - error = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce); - if (error) - dprintf("Unable to enable IDE WCE and SCSI WCE " - "not yet supported: %d\n", error); - - /* XXX: To implement the scsi WCE enable we are going to need - * to use the SG_IO ioctl. But that means fully forming the - * SCSI command as the ioctl arg. 
To get this right I need - * to look at the sdparm source which does this. - */ - error = 0; - } else { - /* Must be a partition, that's fine. */ - vd->vdev_wholedisk = 0; - } + *psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE; /* Based on the minimum sector size set the block size */ *ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1; - /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - vd->vdev_nowritecache = B_FALSE; - - return error; + return 0; } static void @@ -164,16 +127,16 @@ static int vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) #endif /* HAVE_2ARGS_BIO_END_IO_T */ { - dio_request_t *dr = bio->bi_private; + dio_request_t *dr = bio->bi_private; zio_t *zio; - int error; + int i, error; /* Fatal error but print some useful debugging before asserting */ if (dr == NULL) { printk("FATAL: bio->bi_private == NULL\n" "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n" - "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n", - bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, + "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n", + bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, bio->bi_idx, bio->bi_size, bio->bi_end_io, atomic_read(&bio->bi_cnt)); SBUG(); @@ -189,20 +152,38 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) error = EIO; - zio = dr->dr_zio; - if (zio) { - zio->io_error = error; - zio_interrupt(zio); - } + spin_lock(&dr->dr_lock); - dr->dr_error = error; - atomic_dec(&dr->dr_ref); + dr->dr_ref--; + if (dr->dr_error == 0) + dr->dr_error = error; - if (bio_sync(bio)) { - complete(&dr->dr_comp); + /* + * All bio's attached to this dio request have completed. This + * means it is safe to access the dio outside the spin lock, we + * are assured there will be no racing accesses. 
+ */ + if (dr->dr_ref == 0) { + zio = dr->dr_zio; + spin_unlock(&dr->dr_lock); + + /* Synchronous dio cleanup handled by waiter */ + if (dr->dr_rw & (1 << BIO_RW_SYNC)) { + complete(&dr->dr_comp); + } else { + for (i = 0; i < dr->dr_bio_count; i++) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof(dio_request_t) + + sizeof(struct bio *) * dr->dr_bio_count); + } + + if (zio) { + zio->io_error = dr->dr_error; + zio_interrupt(zio); + } } else { - kmem_free(dr, sizeof(dio_request_t)); - bio_put(bio); + spin_unlock(&dr->dr_lock); } rc = 0; @@ -215,62 +196,43 @@ out: } static struct bio * -__bio_map_vmem(struct request_queue *q, void *data, +bio_map_virt(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + unsigned int offset, i, data_len = len; + const int nr_pages = end - start; struct page *page; - struct bio *bio; - - bio = bio_alloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - page = vmalloc_to_page(data); - ASSERT(page); /* Expecting virtual linear address */ - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) - break; - - data += bytes; - len -= bytes; - offset = 0; - bytes = PAGE_SIZE; - } - - return bio; -} - -static struct bio * -bio_map_vmem(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) -{ struct bio *bio; + int rc; - bio = __bio_map_vmem(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) +
return ERR_PTR(-ENOMEM); - if (bio->bi_size != len) { - bio_put(bio); - return ERR_PTR(-EINVAL); + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + VERIFY3P(page = vmalloc_to_page(data), !=, NULL); + VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes); + + data += bytes; + len -= bytes; + offset = 0; + bytes = PAGE_SIZE; } - return bio; + VERIFY3U(bio->bi_size, ==, data_len); + return bio; } static struct bio * @@ -281,7 +243,7 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) /* Cleanly map buffer we are passed in to a bio regardless * of if the buffer is a virtual or physical address. */ if (kmem_virt(data)) - bio = bio_map_vmem(q, data, len, gfp_mask); + bio = bio_map_virt(q, data, len, gfp_mask); else bio = bio_map_kern(q, data, len, gfp_mask); @@ -289,63 +251,92 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) } static int -__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf, - size_t size, uint64_t offset, int flags) +__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, + size_t kbuf_size, uint64_t kbuf_offset, int flags) { - struct bio *bio; + struct request_queue *q = vd_lh->bd_disk->queue; dio_request_t *dr; - int rw, error = 0; - struct request_queue *q; + caddr_t bio_ptr; + uint64_t bio_offset; + int i, j, error = 0, bio_count, bio_size, dio_size; - ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */ + ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0); + ASSERT3S(flags & + ~((1 << BIO_RW) | + (1 << BIO_RW_SYNC) | + (1 << BIO_RW_FAILFAST)), ==, 0); - dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP); + bio_count = (kbuf_size / (q->max_hw_sectors << 9)) + 1; + dio_size = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count; + dr = kmem_zalloc(dio_size, KM_SLEEP); if (dr == NULL) return ENOMEM; - 
atomic_set(&dr->dr_ref, 0); - dr->dr_zio = zio; - dr->dr_error = 0; - q = vd_lh->bd_disk->queue; - - bio = bio_map(q, kbuf, size, GFP_NOIO); - if (IS_ERR(bio)) { - kmem_free(dr, sizeof(dio_request_t)); - return -PTR_ERR(bio); - } - - bio->bi_bdev = vd_lh; - bio->bi_sector = offset / SECTOR_SIZE; - bio->bi_end_io = vdev_disk_physio_completion; - bio->bi_private = dr; - init_completion(&dr->dr_comp); - atomic_inc(&dr->dr_ref); + spin_lock_init(&dr->dr_lock); + dr->dr_ref = 0; + dr->dr_zio = zio; + dr->dr_rw = READ; + dr->dr_error = 0; + dr->dr_bio_count = bio_count; if (flags & (1 << BIO_RW)) - rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; - else - rw = READ; + dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; if (flags & (1 << BIO_RW_FAILFAST)) - rw |= 1 << BIO_RW_FAILFAST; - - ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) | - (1 << BIO_RW_FAILFAST)), ==, 0); - - submit_bio(rw, bio); + dr->dr_rw |= 1 << BIO_RW_FAILFAST; /* - * On syncronous blocking requests we wait for the completion - * callback to wake us. Then we are responsible for freeing - * the dio_request_t as well as dropping the final bio reference. + * When the IO size exceeds the maximum bio size for the request + * queue we are forced to break the IO in multiple bio's and wait + * for them all to complete. Ideally, all pool users will set + * their volume block size to match the maximum request size and + * the common case will be one bio per vdev IO request. 
*/ - if (bio_sync(bio)) { + bio_ptr = kbuf_ptr; + bio_offset = kbuf_offset; + for (i = 0; i < dr->dr_bio_count; i++) { + bio_size = MIN(kbuf_size, q->max_hw_sectors << 9); + + dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO); + if (IS_ERR(dr->dr_bio[i])) { + for (j = 0; j < i; j++) + bio_put(dr->dr_bio[j]); + + error = -PTR_ERR(dr->dr_bio[i]); + kmem_free(dr, dio_size); + return error; + } + + dr->dr_bio[i]->bi_bdev = vd_lh; + dr->dr_bio[i]->bi_sector = bio_offset >> 9; + dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_private = dr; + dr->dr_ref++; + + bio_ptr += bio_size; + bio_offset += bio_size; + kbuf_size -= bio_size; + } + + for (i = 0; i < dr->dr_bio_count; i++) + submit_bio(dr->dr_rw, dr->dr_bio[i]); + + /* + * On syncronous blocking requests we wait for all bio the completion + * callbacks to run. We will be woken when the last callback runs + * for this dio. We are responsible for freeing the dio_request_t as + * well as the final reference on all attached bios. 
+ */ + if (dr->dr_rw & (1 << BIO_RW_SYNC)) { wait_for_completion(&dr->dr_comp); - ASSERT(atomic_read(&dr->dr_ref) == 0); + ASSERT(dr->dr_ref == 0); error = dr->dr_error; - kmem_free(dr, sizeof(dio_request_t)); - bio_put(bio); + + for (i = 0; i < dr->dr_bio_count; i++) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, dio_size); } return error; @@ -379,7 +370,6 @@ vdev_disk_io_start(zio_t *zio) int flags, error; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); /* XXPOLICY */ if (!vdev_readable(vd)) { @@ -453,8 +443,12 @@ vdev_disk_io_start(zio_t *zio) if (zio->io_flags & ZIO_FLAG_IO_RETRY) flags |= (1 << BIO_RW_FAILFAST); - __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, - zio->io_size, zio->io_offset, flags); + error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, + zio->io_size, zio->io_offset, flags); + if (error) { + zio->io_error = error; + return ZIO_PIPELINE_CONTINUE; + } return ZIO_PIPELINE_STOP; } @@ -470,7 +464,6 @@ vdev_disk_io_done(zio_t *zio) */ VERIFY3S(zio->io_error, ==, 0); #if 0 - vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; @@ -523,7 +516,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) if (IS_ERR(vd_lh)) return -PTR_ERR(vd_lh); - if ((s = i_size_read(vd_lh->bd_inode)) == 0) { + s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE; + if (s == 0) { close_bdev_excl(vd_lh); return EIO; } From 25c88fda18b1dd72783aba72a770bbf2fe6fbc5d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 11 Mar 2009 22:15:36 -0700 Subject: [PATCH 10/11] Remove unused variable --- module/zfs/vdev_disk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 1c4c37a62a..7b4cf8e43a 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -206,7 +206,6 @@ bio_map_virt(struct request_queue *q, void *data, const int nr_pages = end - start; struct page *page; struct bio *bio; - int rc; bio = bio_alloc(gfp_mask, nr_pages); if (!bio) From 
23c544c88431d167556fe8a72670d99d6bcf4d13 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 17 Mar 2009 15:14:02 -0700 Subject: [PATCH 11/11] Add zfs_config.h include for HAVE_2ARGS_BIO_END_IO_T define --- module/zfs/vdev_disk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 7b4cf8e43a..bae4cdcdc6 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Virtual device vector for disks.