From a5e3d71fd49941b69ac341a5320cd81ff79d77f6 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Fri, 5 Dec 2008 11:16:18 -0800
Subject: [PATCH] Refresh linux-kernel-disk

---
 .topdeps                                 |    4 +-
 .topmsg                                  |   17 +-
 zfs/lib/libzpool/include/sys/vdev_disk.h |   29 +
 zfs/lib/libzpool/vdev_disk.c             |  658 +++++++++++++++++++++++
 4 files changed, 690 insertions(+), 18 deletions(-)
 create mode 100644 zfs/lib/libzpool/include/sys/vdev_disk.h
 create mode 100644 zfs/lib/libzpool/vdev_disk.c

diff --git a/.topdeps b/.topdeps
index 607c231780..7f16cbcdd5 100644
--- a/.topdeps
+++ b/.topdeps
@@ -1,3 +1 @@
-gcc-branch
-fix-branch
-feature-branch
+zfs-branch
diff --git a/.topmsg b/.topmsg
index e9722e1075..7e907446b4 100644
--- a/.topmsg
+++ b/.topmsg
@@ -1,19 +1,6 @@
 From: Brian Behlendorf
-Subject: [PATCH] zfs branch
+Subject: [PATCH] linux kernel disk
 
-Merged result of all changes which are relevant to both Solaris
-and Linux builds of the ZFS code. These are changes where there
-is a reasonable chance they will be accepted upstream.
-
-Additionally, since this is effectively the root of the linux
-ZFS tree the core linux build system is added here. This
-includes autogen.sh, configure.ac, m4 macros, some scripts/*,
-and makefiles for all the core ZFS components. Linux-only
-features which require tweaks to the build system should appear
-on the relevant topic branches. All autotools products which
-result from autogen.sh are commited to the linux-configure-branch.
-
-This branch also contains the META, ChangeLog, AUTHORS,
-README, and GIT files.
+Native Linux vdev disk interfaces
 
 Signed-off-by: Brian Behlendorf
diff --git a/zfs/lib/libzpool/include/sys/vdev_disk.h b/zfs/lib/libzpool/include/sys/vdev_disk.h
new file mode 100644
index 0000000000..38abf2e419
--- /dev/null
+++ b/zfs/lib/libzpool/include/sys/vdev_disk.h
@@ -0,0 +1,29 @@
+#ifndef _SYS_VDEV_DISK_H
+#define	_SYS_VDEV_DISK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+
+typedef struct vdev_disk {
+	ddi_devid_t	vd_devid;
+	char		*vd_minor;
+	ldi_handle_t	vd_lh;
+} vdev_disk_t;
+
+extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern nvlist_t *vdev_disk_read_rootlabel(char *devpath);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/zfs/lib/libzpool/vdev_disk.c b/zfs/lib/libzpool/vdev_disk.c
new file mode 100644
index 0000000000..bc9a22934f
--- /dev/null
+++ b/zfs/lib/libzpool/vdev_disk.c
@@ -0,0 +1,658 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Virtual device vector for disks.
+ */
+#if defined(_KERNEL) && defined(HAVE_SPL)
+
+/* XXX: A slab entry for these would probably be good */
+typedef struct dio_request {
+	struct completion	dr_comp;
+	atomic_t		dr_ref;
+	vdev_t			*dr_vd;
+	zio_t			*dr_zio;
+	int			dr_rc;
+} dio_request_t;
+
+static int
+vdev_disk_open_common(vdev_t *vd)
+{
+	vdev_disk_t *dvd;
+	struct block_device *bdev;
+	int mode = 0;
+
+	// dprintf("vd=%p\n", vd);
+
+	/* Must have a pathname and it must be absolute. */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return EINVAL;
+	}
+
+	dvd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
+	if (dvd == NULL)
+		return ENOMEM;
+
+	/* XXX: Since we do not have devid support like Solaris we
+	 * currently can't be as clever about opening the right device.
+	 * For now we will simply open the device name provided and
+	 * fail when it doesn't exist.  If your devices get reordered
+	 * you're going to be screwed, use udev for now to prevent this.
+	 *
+	 * XXX: mode here could be the global spa_mode with a little
+	 * munging of the flags to make them more agreeable to Linux.
+	 * However, simply passing a 0 for now gets us R/W behavior.
+	 */
+	bdev = open_bdev_excl(vd->vdev_path, mode, dvd);
+	if (IS_ERR(bdev)) {
+		kmem_free(dvd, sizeof(vdev_disk_t));
+		return -PTR_ERR(bdev);
+	}
+
+	/* XXX: Long term validate stored dvd->vd_devid with
+	 * a unique identifier read from the disk.
+	 */
+
+	dvd->vd_lh = bdev;
+	vd->vdev_tsd = dvd;
+
+	return 0;
+}
+
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	vdev_disk_t *dvd;
+	struct block_device *bdev;
+	int rc;
+
+	// dprintf("vd=%p, psize=%p, ashift=%p\n", vd, psize, ashift);
+	dprintf("adding disk %s\n",
+	    vd->vdev_path ? vd->vdev_path : "");
+
+	rc = vdev_disk_open_common(vd);
+	if (rc)
+		return rc;
+
+	dvd = vd->vdev_tsd;
+	bdev = dvd->vd_lh;
+
+	/* Determine the actual size of the device (in bytes) */
+	*psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE;
+
+	/* Check if this is a whole device and if it is, try to
+	 * enable the write cache; it is OK if this fails.
+	 *
+	 * XXX: This behavior should probably be configurable.
+	 */
+	if (bdev->bd_contains == bdev) {
+		int wce = 1;
+
+		vd->vdev_wholedisk = 1ULL;
+
+		/* Different methods are needed for an IDE vs SCSI disk.
+		 * Since we're not sure what type of disk this is, try IDE;
+		 * if that fails, try SCSI. */
+		rc = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce);
+		if (rc)
+			dprintf("Unable to enable IDE WCE and SCSI WCE "
+			    "not yet supported: %d\n", rc);
+
+		/* XXX: To implement the scsi WCE enable we are going to need
+		 * to use the SG_IO ioctl.  But that means fully forming the
+		 * SCSI command as the ioctl arg.  To get this right I need
+		 * to look at the sdparm source which does this.
+		 */
+		rc = 0;
+	} else {
+		/* Must be a partition, that's fine. */
+		vd->vdev_wholedisk = 0;
+	}
+
+	/* Based on the minimum sector size set the block size */
+	*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;
+
+	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+	vd->vdev_nowritecache = B_FALSE;
+
+	return rc;
+}
+
+static void
+vdev_disk_close(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	// dprintf("vd=%p\n", vd);
+	dprintf("removing disk %s\n", vd->vdev_path ?
+	    vd->vdev_path : "");
+
+	if (dvd == NULL)
+		return;
+
+	close_bdev_excl(dvd->vd_lh);
+
+	kmem_free(dvd, sizeof(vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+
+#ifdef HAVE_2ARGS_BIO_END_IO_T
+static void
+vdev_disk_probe_io_completion(struct bio *bio, int rc)
+#else
+static int
+vdev_disk_probe_io_completion(struct bio *bio, unsigned int size, int rc)
+#endif /* HAVE_2ARGS_BIO_END_IO_T */
+{
+	dio_request_t *dr = bio->bi_private;
+	zio_t *zio;
+	int error;
+
+
+	/* Fatal error but print some useful debugging before asserting */
+	if (dr == NULL) {
+		printk("FATAL: bio->bi_private == NULL\n"
+		    "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
+		    "bi_idx: %d, bi_size: %d, bi_end_io: %p, bi_cnt: %d\n",
+		    bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
+		    bio->bi_idx, bio->bi_size, bio->bi_end_io,
+		    atomic_read(&bio->bi_cnt));
+		SBUG();
+	}
+
+	/* Incomplete */
+	if (bio->bi_size) {
+		rc = 1;
+		goto out;
+	}
+
+	error = rc;
+	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = EIO;
+
+	zio = dr->dr_zio;
+	if (zio) {
+		zio->io_error = error;
+		zio_interrupt(zio);
+	}
+
+	dr->dr_rc = error;
+	atomic_dec(&dr->dr_ref);
+
+	if (bio_sync(bio)) {
+		complete(&dr->dr_comp);
+	} else {
+		kmem_free(dr, sizeof(dio_request_t));
+		bio_put(bio);
+	}
+
+	rc = 0;
+out:
+#ifdef HAVE_2ARGS_BIO_END_IO_T
+	return;
+#else
+	return rc;
+#endif /* HAVE_2ARGS_BIO_END_IO_T */
+}
+
+static struct bio *
+__bio_map_vmem(struct request_queue *q, void *data,
+    unsigned int len, gfp_t gfp_mask)
+{
+	unsigned long kaddr = (unsigned long)data;
+	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned long start = kaddr >> PAGE_SHIFT;
+	const int nr_pages = end - start;
+	int offset, i;
+	struct page *page;
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_mask, nr_pages);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);
+
+	offset = offset_in_page(kaddr);
+	for (i = 0; i < nr_pages; i++) {
+		unsigned int bytes = PAGE_SIZE - offset;
+
+		if (len <= 0)
+			break;
+
+		if (bytes > len)
+			bytes = len;
+
+		page = vmalloc_to_page(data);
+		ASSERT(page); /* Expecting virtual linear address */
+
+		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
+			break;
+
+		data += bytes;
+		len -= bytes;
+		offset = 0;
+		bytes = PAGE_SIZE;
+	}
+
+	return bio;
+}
+
+static struct bio *
+bio_map_vmem(struct request_queue *q, void *data,
+    unsigned int len, gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	bio = __bio_map_vmem(q, data, len, gfp_mask);
+	if (IS_ERR(bio))
+		return bio;
+
+	if (bio->bi_size != len) {
+		bio_put(bio);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return bio;
+}
+
+static struct bio *
+bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	/* Cleanly map the buffer we are passed in to a bio regardless
+	 * of whether the buffer is a virtual or physical address.
+	 */
+	if (kmem_virt(data))
+		bio = bio_map_vmem(q, data, len, gfp_mask);
+	else
+		bio = bio_map_kern(q, data, len, gfp_mask);
+
+	return bio;
+}
+
+static int
+vdev_disk_io(vdev_t *vd, zio_t *zio, caddr_t kbuf, size_t size,
+    uint64_t offset, int flags)
+{
+	struct bio *bio;
+	dio_request_t *dr;
+	int rw, rc = 0;
+	struct block_device *bdev;
+	struct request_queue *q;
+
+	// dprintf("vd=%p, zio=%p, kbuf=%p, size=%ld, offset=%lu, flag=%lx\n",
+	//	vd, zio, kbuf, size, offset, flags);
+
+	ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */
+
+	if (vd == NULL || vd->vdev_tsd == NULL)
+		return EINVAL;
+
+	dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP);
+	if (dr == NULL)
+		return ENOMEM;
+
+	atomic_set(&dr->dr_ref, 0);
+	dr->dr_vd = vd;
+	dr->dr_zio = zio;
+	dr->dr_rc = 0;
+
+	bdev = ((vdev_disk_t *)(vd->vdev_tsd))->vd_lh;
+	q = bdev->bd_disk->queue;
+
+	bio = bio_map(q, kbuf, size, GFP_NOIO);
+	if (IS_ERR(bio)) {
+		kmem_free(dr, sizeof(dio_request_t));
+		return -PTR_ERR(bio);
+	}
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = offset / SECTOR_SIZE;
+	bio->bi_end_io = vdev_disk_probe_io_completion;
+	bio->bi_private = dr;
+
+	init_completion(&dr->dr_comp);
+	atomic_inc(&dr->dr_ref);
+
+	if (flags & (1 << BIO_RW))
+		rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;
+	else
+		rw = READ;
+
+	if (flags & (1 << BIO_RW_FAILFAST))
+		rw |= 1 << BIO_RW_FAILFAST;
+
+	ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) |
+	    (1 << BIO_RW_FAILFAST)), ==, 0);
+
+	submit_bio(rw, bio);
+
+	/*
+	 * On synchronous blocking requests we wait for the completion
+	 * callback to wake us.  Then we are responsible for freeing
+	 * the dio_request_t as well as dropping the final bio reference.
+	 */
+	if (bio_sync(bio)) {
+		wait_for_completion(&dr->dr_comp);
+		ASSERT(atomic_read(&dr->dr_ref) == 0);
+		rc = dr->dr_rc;
+		kmem_free(dr, sizeof(dio_request_t));
+		bio_put(bio);
+	}
+
+	if (zio_injection_enabled && rc == 0)
+		rc = zio_handle_device_injection(vd, EIO);
+
+	return rc;
+}
+
+static int
+vdev_disk_probe_io(vdev_t *vd, caddr_t kbuf, size_t size,
+    uint64_t offset, int flags)
+{
+	int rc;
+
+	// dprintf("vd=%p, kbuf=%p, size=%ld, offset=%lu, flag=%d\n",
+	//	vd, kbuf, size, offset, flags);
+
+	flags |= (1 << BIO_RW_SYNC);
+	flags |= (1 << BIO_RW_FAILFAST);
+
+	/* XXX: offset must be block aligned or we need to take
+	 * care of it */
+
+	rc = vdev_disk_io(vd, NULL, kbuf, size, offset, flags);
+
+	return rc;
+}
+
+/*
+ * Determine if the underlying device is accessible by reading and writing
+ * to a known location. We must be able to do this during syncing context
+ * and thus we cannot set the vdev state directly.
+ */
+static int
+vdev_disk_probe(vdev_t *vd)
+{
+	vdev_t *nvd;
+	int label_idx, rc = 0, retries = 0;
+	uint64_t offset;
+	char *vl_pad;
+
+	// dprintf("vd=%p\n", vd);
+
+	if (vd == NULL)
+		return EINVAL;
+
+	/* Hijack the current vdev */
+	nvd = vd;
+
+	/* Pick a random label to rewrite */
+	label_idx = spa_get_random(VDEV_LABELS);
+	ASSERT(label_idx < VDEV_LABELS);
+
+	offset = vdev_label_offset(vd->vdev_psize, label_idx,
+	    offsetof(vdev_label_t, vl_pad));
+
+	vl_pad = vmem_alloc(VDEV_SKIP_SIZE, KM_SLEEP);
+	if (vl_pad == NULL)
+		return ENOMEM;
+
+	/*
+	 * Try to read and write to a special location on the
+	 * label. We use the existing vdev initially and only
+	 * try to create and reopen it if we encounter a failure.
+	 */
+	while ((rc = vdev_disk_probe_io(nvd, vl_pad,
+	    VDEV_SKIP_SIZE, offset, READ)) != 0 && retries == 0) {
+
+		nvd = kmem_zalloc(sizeof(vdev_t), KM_SLEEP);
+
+		if (vd->vdev_path)
+			nvd->vdev_path = spa_strdup(vd->vdev_path);
+		if (vd->vdev_physpath)
+			nvd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+		if (vd->vdev_devid)
+			nvd->vdev_devid = spa_strdup(vd->vdev_devid);
+
+		nvd->vdev_wholedisk = vd->vdev_wholedisk;
+		nvd->vdev_guid = vd->vdev_guid;
+		retries++;
+
+		rc = vdev_disk_open_common(nvd);
+		if (rc)
+			break;
+	}
+
+	if (!rc)
+		rc = vdev_disk_probe_io(nvd, vl_pad, VDEV_SKIP_SIZE,
+		    offset, WRITE);
+
+	/* Clean up if we allocated a new vdev */
+	if (retries) {
+		vdev_disk_close(nvd);
+		if (nvd->vdev_path)
+			spa_strfree(nvd->vdev_path);
+		if (nvd->vdev_physpath)
+			spa_strfree(nvd->vdev_physpath);
+		if (nvd->vdev_devid)
+			spa_strfree(nvd->vdev_devid);
+		kmem_free(nvd, sizeof(vdev_t));
+	}
+
+	vmem_free(vl_pad, VDEV_SKIP_SIZE);
+
+	/* Reset the failing flag */
+	if (!rc)
+		vd->vdev_is_failing = B_FALSE;
+
+	return rc;
+}
+
+#if 0
+static void
+vdev_disk_ioctl_done(void *zio_arg, int rc)
+{
+	zio_t *zio = zio_arg;
+
+	zio->io_error = rc;
+
+	zio_interrupt(zio);
+}
+#endif
+
+static int
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+//	vdev_disk_t *dvd = vd->vdev_tsd;
+	int flags, rc;
+
+	// dprintf("zio=%p\n", zio);
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+		zio_vdev_io_bypass(zio);
+
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = ENXIO;
+			return ZIO_PIPELINE_CONTINUE;
+		}
+
+		switch (zio->io_cmd) {
+
+		case DKIOCFLUSHWRITECACHE:
+
+			if (zfs_nocacheflush)
+				break;
+
+			if (vd->vdev_nowritecache) {
+				zio->io_error = ENOTSUP;
+				break;
+			}
+
+#if 0
+			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+			zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
+			zio->io_dk_callback.dkc_cookie = zio;
+
+			rc = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+			    (uintptr_t)&zio->io_dk_callback,
+			    FKIOCTL, kcred, NULL);
+
+			if (rc == 0) {
+				/*
+				 * The ioctl will be done asynchronously,
+				 * and will call vdev_disk_ioctl_done()
+				 * upon completion.
+				 */
+				return ZIO_PIPELINE_STOP;
+			}
+#else
+			rc = ENOTSUP;
+#endif
+
+			if (rc == ENOTSUP || rc == ENOTTY) {
+				/*
+				 * If we get ENOTSUP or ENOTTY, we know that
+				 * no future attempts will ever succeed.
+				 * In this case we set a persistent bit so
+				 * that we don't bother with the ioctl in the
+				 * future.
+				 */
+				vd->vdev_nowritecache = B_TRUE;
+			}
+			zio->io_error = rc;
+
+			break;
+
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		return ZIO_PIPELINE_CONTINUE;
+	}
+
+	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
+		return ZIO_PIPELINE_STOP;
+
+	if ((zio = vdev_queue_io(zio)) == NULL)
+		return ZIO_PIPELINE_STOP;
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		rc = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+	else
+		rc = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
+
+	rc = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : rc;
+
+	if (rc) {
+		zio->io_error = rc;
+		zio_interrupt(zio);
+		return ZIO_PIPELINE_STOP;
+	}
+
+	flags = ((zio->io_type == ZIO_TYPE_READ) ?
+	    READ : WRITE);
+	/* flags |= B_BUSY | B_NOCACHE; XXX: Not supported */
+
+	if (zio->io_flags & ZIO_FLAG_FAILFAST)
+		flags |= (1 << BIO_RW_FAILFAST);
+
+
+	vdev_disk_io(vd, zio, zio->io_data, zio->io_size,
+	    zio->io_offset, flags);
+
+	return ZIO_PIPELINE_STOP;
+}
+
+static int
+vdev_disk_io_done(zio_t *zio)
+{
+	// dprintf("zio=%p\n", zio);
+
+	vdev_queue_io_done(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE)
+		vdev_cache_write(zio);
+
+	if (zio_injection_enabled && zio->io_error == 0)
+		zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
+	/*
+	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
+	 * the device has been removed.  If this is the case, then we trigger an
+	 * asynchronous removal of the device. Otherwise, probe the device and
+	 * make sure it's still accessible.
+	 */
+	if (zio->io_error == EIO) {
+		ASSERT(0); /* XXX: Not yet supported */
+#if 0
+		vdev_t *vd = zio->io_vd;
+		vdev_disk_t *dvd = vd->vdev_tsd;
+		int state;
+
+		state = DKIO_NONE;
+		if (dvd && ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
+		    FKIOCTL, kcred, NULL) == 0 &&
+		    state != DKIO_INSERTED) {
+			vd->vdev_remove_wanted = B_TRUE;
+			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+		} else if (vdev_probe(vd) != 0) {
+			ASSERT(vd->vdev_ops->vdev_op_leaf);
+			vd->vdev_is_failing = B_TRUE;
+		}
+#endif
+	}
+
+	return ZIO_PIPELINE_CONTINUE;
+}
+
+nvlist_t *
+vdev_disk_read_rootlabel(char *devpath)
+{
+	return NULL;
+}
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_disk_open,
+	vdev_disk_close,
+	vdev_disk_probe,
+	vdev_default_asize,
+	vdev_disk_io_start,
+	vdev_disk_io_done,
+	NULL,
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+#endif /* defined(_KERNEL) && defined(HAVE_SPL) */
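
Note (not part of the patch): for readers unfamiliar with the write-cache handling in
vdev_disk_open(), below is a minimal user-space sketch of the same HDIO_SET_WCACHE
request that the kernel code issues via ioctl_by_bdev(). The device path /dev/hda is a
hypothetical example; run it only as root against a scratch ATA/IDE disk. The SG_IO
based SCSI write-cache enable mentioned in the XXX comment above is not shown.

/* gcc -o wce wce.c */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hdreg.h>

int
main(void)
{
	/* Hypothetical scratch device; adjust before use. */
	int fd = open("/dev/hda", O_RDWR);

	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* HDIO_SET_* ioctls take the new value directly as the argument. */
	if (ioctl(fd, HDIO_SET_WCACHE, 1UL) < 0)
		perror("HDIO_SET_WCACHE");

	close(fd);
	return (0);
}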