diff --git a/.topdeps b/.topdeps
index 607c231780..7f16cbcdd5 100644
--- a/.topdeps
+++ b/.topdeps
@@ -1,3 +1 @@
-gcc-branch
-fix-branch
-feature-branch
+zfs-branch
diff --git a/.topmsg b/.topmsg
index 03967cdff5..7e907446b4 100644
--- a/.topmsg
+++ b/.topmsg
@@ -1,19 +1,6 @@
 From: Brian Behlendorf
-Subject: [PATCH] zfs branch
+Subject: [PATCH] linux kernel disk
 
-Merged result of all changes which are relevant to both Solaris
-and Linux builds of the ZFS code. These are changes where there
-is a reasonable chance they will be accepted upstream.
-
-Additionally, since this is effectively the root of the linux
-ZFS tree the core linux build system is added here. This
-includes autogen.sh, configure.ac, m4 macros, some scripts/*,
-and makefiles for all the core ZFS components. Linux-only
-features which require tweaks to the build system should appear
-on the relevant topic branches. All autotools products which
-result from autogen.sh are commited to the linux-configure-branch.
-
-This branch also contains the META, ChangeLog, AUTHORS, TODO,
-and README, files.
+Native Linux vdev disk interfaces
 
 Signed-off-by: Brian Behlendorf
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index 2bde8a89a0..f392800979 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -47,6 +47,7 @@ ${MODULE}-objs += uberblock.o
 ${MODULE}-objs += unique.o
 ${MODULE}-objs += vdev.o
 ${MODULE}-objs += vdev_cache.o
+${MODULE}-objs += vdev_disk.o
 ${MODULE}-objs += vdev_file.o
 ${MODULE}-objs += vdev_label.o
 ${MODULE}-objs += vdev_mirror.o
diff --git a/module/zfs/include/sys/vdev_disk.h b/module/zfs/include/sys/vdev_disk.h
new file mode 100644
index 0000000000..520db0103d
--- /dev/null
+++ b/module/zfs/include/sys/vdev_disk.h
@@ -0,0 +1,28 @@
+#ifndef _SYS_VDEV_DISK_H
+#define _SYS_VDEV_DISK_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+
+typedef struct vdev_disk {
+	ddi_devid_t vd_devid;
+	char *vd_minor;
+	ldi_handle_t vd_lh;
+} vdev_disk_t;
+
+extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_DISK_H */
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
new file mode 100644
index 0000000000..bae4cdcdc6
--- /dev/null
+++ b/module/zfs/vdev_disk.c
@@ -0,0 +1,638 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+	struct completion	dr_comp;	/* Completion for sync IO */
+	spinlock_t		dr_lock;	/* Completion lock */
+	zio_t			*dr_zio;	/* Parent ZIO */
+	int			dr_ref;		/* Outstanding bio count */
+	int			dr_rw;		/* Read/Write */
+	int			dr_error;	/* Bio error */
+	int			dr_bio_count;	/* Count of bio's */
+	struct bio		*dr_bio[0];	/* Attached bio's */
+} dio_request_t;
+
+/*
+ * Open the block device named by vd->vdev_path exclusively and report
+ * its size in bytes through psize and its minimum block shift through
+ * ashift.  Returns 0 on success or a positive errno on failure.
+ */
+static int
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+{
+	struct block_device *vd_lh;
+	vdev_disk_t *dvd;
+
+	/* Must have a pathname and it must be absolute. */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return EINVAL;
+	}
+
+	dvd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
+	if (dvd == NULL)
+		return ENOMEM;
+
+	/* XXX: Since we do not have devid support like Solaris we
+	 * currently can't be as clever about opening the right device.
+	 * For now we will simply open the device name provided and
+	 * fail when it doesn't exist. If your devices get reordered
+	 * you're going to be screwed, use udev for now to prevent this.
+	 *
+	 * XXX: mode here could be the global spa_mode with a little
+	 * munging of the flags to make them more agreeable to linux.
+	 * However, simply passing a 0 for now gets us W/R behavior.
+	 */
+	vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd);
+	if (IS_ERR(vd_lh)) {
+		kmem_free(dvd, sizeof(vdev_disk_t));
+		return -PTR_ERR(vd_lh);
+	}
+
+	/* XXX: Long term validate stored dvd->vd_devid with a unique
+	 * identifier read from the disk, likely EFI support.
+	 */
+
+	vd->vdev_tsd = dvd;
+	dvd->vd_lh = vd_lh;
+
+	/* Check if this is a whole device. When vd_lh->bd_contains ==
+	 * vd_lh we have a whole device and not simply a partition. */
+	vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh);
+
+	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+	vd->vdev_nowritecache = B_FALSE;
+
+	/* Determine the actual size of the device (in bytes)
+	 *
+	 * XXX: SECTOR_SIZE is defined to 512b which may not be true for
+	 * your device, we must use the actual hardware sector size.
+	 */
+	*psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
+
+	/* Based on the minimum sector size set the block size */
+	*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;
+
+	return 0;
+}
+
+/*
+ * Release the block device claimed by vdev_disk_open() and free the
+ * per-vdev disk state.  A vdev with no disk state is left untouched.
+ */
+static void
+vdev_disk_close(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	if (dvd == NULL)
+		return;
+
+	if (dvd->vd_lh != NULL)
+		close_bdev_excl(dvd->vd_lh);
+
+	kmem_free(dvd, sizeof(vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Completion callback attached to every bio of a dio request.  Records
+ * the first error seen and, once the last outstanding bio finishes,
+ * either wakes the synchronous waiter or releases the request, then
+ * hands the final status to the parent zio when there is one.
+ */
+#ifdef HAVE_2ARGS_BIO_END_IO_T
+static void
+vdev_disk_physio_completion(struct bio *bio, int rc)
+#else
+static int
+vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc)
+#endif /* HAVE_2ARGS_BIO_END_IO_T */
+{
+	dio_request_t *dr = bio->bi_private;
+	zio_t *zio;
+	int i, error;
+
+	/* Fatal error but print some useful debugging before asserting */
+	if (dr == NULL) {
+		printk("FATAL: bio->bi_private == NULL\n"
+		       "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
+		       "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n",
+		       bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
+		       bio->bi_idx, bio->bi_size, bio->bi_end_io,
+		       atomic_read(&bio->bi_cnt));
+		SBUG();
+	}
+
+	/* Incomplete */
+	if (bio->bi_size) {
+		rc = 1;
+		goto out;
+	}
+
+	/*
+	 * The block layer reports failures as negative errnos while the
+	 * rest of this file uses positive ones, so normalize the sign.
+	 */
+	error = (rc < 0) ? -rc : rc;
+	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = EIO;
+
+	spin_lock(&dr->dr_lock);
+
+	dr->dr_ref--;
+	if (dr->dr_error == 0)
+		dr->dr_error = error;
+
+	/*
+	 * All bio's attached to this dio request have completed. This
+	 * means it is safe to access the dio outside the spin lock, we
+	 * are assured there will be no racing accesses.
+	 */
+	if (dr->dr_ref == 0) {
+		zio = dr->dr_zio;
+		spin_unlock(&dr->dr_lock);
+
+		/*
+		 * Latch the final status now.  Once the waiter is woken or
+		 * the dio is freed below, dr must not be touched again.
+		 */
+		error = dr->dr_error;
+
+		/* Synchronous dio cleanup handled by waiter */
+		if (dr->dr_rw & (1 << BIO_RW_SYNC)) {
+			complete(&dr->dr_comp);
+		} else {
+			for (i = 0; i < dr->dr_bio_count; i++)
+				bio_put(dr->dr_bio[i]);
+
+			kmem_free(dr, sizeof(dio_request_t) +
+			    sizeof(struct bio *) * dr->dr_bio_count);
+		}
+
+		if (zio) {
+			zio->io_error = error;
+			zio_interrupt(zio);
+		}
+	} else {
+		spin_unlock(&dr->dr_lock);
+	}
+
+	rc = 0;
+out:
+#ifdef HAVE_2ARGS_BIO_END_IO_T
+	return;
+#else
+	return rc;
+#endif /* HAVE_2ARGS_BIO_END_IO_T */
+}
+
+/*
+ * Build a bio for a buffer whose pages must be looked up one at a time
+ * with vmalloc_to_page().  Returns the bio or ERR_PTR(-ENOMEM).
+ */
+static struct bio *
+bio_map_virt(struct request_queue *q, void *data,
+    unsigned int len, gfp_t gfp_mask)
+{
+	unsigned long kaddr = (unsigned long)data;
+	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned long start = kaddr >> PAGE_SHIFT;
+	unsigned int offset, i, data_len = len;
+	const int nr_pages = end - start;
+	struct page *page;
+	struct bio *bio;
+
+	bio = bio_alloc(gfp_mask, nr_pages);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);
+
+	offset = offset_in_page(kaddr);
+	for (i = 0; i < nr_pages; i++) {
+		unsigned int bytes = PAGE_SIZE - offset;
+
+		if (len == 0)
+			break;
+
+		if (bytes > len)
+			bytes = len;
+
+		VERIFY3P(page = vmalloc_to_page(data), !=, NULL);
+		VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes);
+
+		data += bytes;
+		len -= bytes;
+		offset = 0;
+	}
+
+	VERIFY3U(bio->bi_size, ==, data_len);
+	return bio;
+}
+
+static struct bio *
+bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	/* Cleanly map the buffer we are passed to a bio regardless
+	 * of whether it is a virtual or physical address. */
+	if (kmem_virt(data))
+		bio = bio_map_virt(q, data, len, gfp_mask);
+	else
+		bio = bio_map_kern(q, data, len, gfp_mask);
+
+	return bio;
+}
+
+/*
+ * Issue a read or write of kbuf_size bytes at byte offset kbuf_offset
+ * on vd_lh, split into as many bios as the request queue limit needs.
+ * Requests flagged BIO_RW_SYNC block until every bio has completed and
+ * return the first error recorded.  All others return right after
+ * submission and finish in vdev_disk_physio_completion().  Returns 0
+ * or a positive errno.
+ */
+static int
+__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr,
+    size_t kbuf_size, uint64_t kbuf_offset, int flags)
+{
+	struct request_queue *q = vd_lh->bd_disk->queue;
+	size_t bio_max = (size_t)q->max_hw_sectors << 9;
+	dio_request_t *dr;
+	caddr_t bio_ptr;
+	uint64_t bio_offset;
+	int i, j, error = 0, bio_count, bio_size, dio_size, rw;
+
+	ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0);
+	ASSERT3S(flags &
+	    ~((1 << BIO_RW) |
+	    (1 << BIO_RW_SYNC) |
+	    (1 << BIO_RW_FAILFAST)), ==, 0);
+
+	/*
+	 * Round up so a size that is an exact multiple of the queue limit
+	 * does not gain a trailing zero length bio.  A zero length request
+	 * still gets one bio so the completion callback always runs.
+	 */
+	bio_count = (kbuf_size + bio_max - 1) / bio_max;
+	if (bio_count == 0)
+		bio_count = 1;
+	dio_size = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count;
+	dr = kmem_zalloc(dio_size, KM_SLEEP);
+	if (dr == NULL)
+		return ENOMEM;
+
+	init_completion(&dr->dr_comp);
+	spin_lock_init(&dr->dr_lock);
+	dr->dr_ref = 0;
+	dr->dr_zio = zio;
+	dr->dr_rw = READ;
+	dr->dr_error = 0;
+	dr->dr_bio_count = bio_count;
+
+	if (flags & (1 << BIO_RW))
+		dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE;
+	else if (flags & (1 << BIO_RW_SYNC))
+		dr->dr_rw = READ_SYNC;
+
+	if (flags & (1 << BIO_RW_FAILFAST))
+		dr->dr_rw |= 1 << BIO_RW_FAILFAST;
+
+	/*
+	 * Once the last bio is submitted an asynchronous dio may be freed
+	 * by its completion at any moment, so work from local copies of
+	 * the fields that are still needed afterwards.
+	 */
+	rw = dr->dr_rw;
+
+	/*
+	 * When the IO size exceeds the maximum bio size for the request
+	 * queue we are forced to break the IO in multiple bio's and wait
+	 * for them all to complete. Ideally, all pool users will set
+	 * their volume block size to match the maximum request size and
+	 * the common case will be one bio per vdev IO request.
+	 */
+	bio_ptr = kbuf_ptr;
+	bio_offset = kbuf_offset;
+	for (i = 0; i < bio_count; i++) {
+		bio_size = MIN(kbuf_size, bio_max);
+
+		dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO);
+		if (IS_ERR(dr->dr_bio[i])) {
+			for (j = 0; j < i; j++)
+				bio_put(dr->dr_bio[j]);
+
+			error = -PTR_ERR(dr->dr_bio[i]);
+			kmem_free(dr, dio_size);
+			return error;
+		}
+
+		dr->dr_bio[i]->bi_bdev = vd_lh;
+		dr->dr_bio[i]->bi_sector = bio_offset >> 9;
+		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+		dr->dr_bio[i]->bi_private = dr;
+		dr->dr_ref++;
+
+		bio_ptr += bio_size;
+		bio_offset += bio_size;
+		kbuf_size -= bio_size;
+	}
+
+	for (i = 0; i < bio_count; i++)
+		submit_bio(rw, dr->dr_bio[i]);
+
+	/*
+	 * On synchronous blocking requests we wait for all the bio completion
+	 * callbacks to run. We will be woken when the last callback runs
+	 * for this dio. We are responsible for freeing the dio_request_t as
+	 * well as the final reference on all attached bios.
+	 */
+	if (rw & (1 << BIO_RW_SYNC)) {
+		wait_for_completion(&dr->dr_comp);
+		ASSERT(dr->dr_ref == 0);
+		error = dr->dr_error;
+
+		for (i = 0; i < bio_count; i++)
+			bio_put(dr->dr_bio[i]);
+
+		kmem_free(dr, dio_size);
+	}
+
+	return error;
+}
+
+/*
+ * Blocking I/O on a raw block device with no parent zio.  Nobody can be
+ * notified of an asynchronous completion here, so the request is always
+ * issued with BIO_RW_SYNC and kbuf is valid once this returns 0.
+ */
+int
+vdev_disk_physio(ldi_handle_t vd_lh, caddr_t kbuf,
+    size_t size, uint64_t offset, int flags)
+{
+	return __vdev_disk_physio(vd_lh, NULL, kbuf, size, offset,
+	    flags | (1 << BIO_RW_SYNC));
+}
+
+#if 0
+/* XXX: Not yet supported */
+static void
+vdev_disk_ioctl_done(void *zio_arg, int error)
+{
+	zio_t *zio = zio_arg;
+
+	zio->io_error = error;
+
+	zio_interrupt(zio);
+}
+#endif
+
+/*
+ * Start a zio on a disk vdev.  Ioctls are answered inline and continue
+ * down the pipeline; reads and writes are handed to __vdev_disk_physio()
+ * and the pipeline stops until the bio completion interrupts the zio.
+ */
+static int
+vdev_disk_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	int flags, error;
+
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
+
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = ENXIO;
+			return ZIO_PIPELINE_CONTINUE;
+		}
+
+		switch (zio->io_cmd) {
+
+		case DKIOCFLUSHWRITECACHE:
+
+			if (zfs_nocacheflush)
+				break;
+
+			if (vd->vdev_nowritecache) {
+				zio->io_error = ENOTSUP;
+				break;
+			}
+
+#if 0
+			/* XXX: Not yet supported */
+			vdev_disk_t *dvd = vd->vdev_tsd;
+
+			zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done;
+			zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE;
+			zio->io_dk_callback.dkc_cookie = zio;
+
+			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
+			    (uintptr_t)&zio->io_dk_callback,
+			    FKIOCTL, kcred, NULL);
+
+			if (error == 0) {
+				/*
+				 * The ioctl will be done asynchronously,
+				 * and will call vdev_disk_ioctl_done()
+				 * upon completion.
+				 */
+				return ZIO_PIPELINE_STOP;
+			}
+#else
+			error = ENOTSUP;
+#endif
+
+			if (error == ENOTSUP || error == ENOTTY) {
+				/*
+				 * If we get ENOTSUP or ENOTTY, we know that
+				 * no future attempts will ever succeed.
+				 * In this case we set a persistent bit so
+				 * that we don't bother with the ioctl in the
+				 * future.
+				 */
+				vd->vdev_nowritecache = B_TRUE;
+			}
+			zio->io_error = error;
+
+			break;
+
+		default:
+			zio->io_error = ENOTSUP;
+		}
+
+		return ZIO_PIPELINE_CONTINUE;
+	}
+
+	/*
+	 * B_BUSY	XXX: Not supported
+	 * B_NOCACHE	XXX: Not supported
+	 */
+	flags = ((zio->io_type == ZIO_TYPE_READ) ? READ : WRITE);
+
+	if (zio->io_flags & ZIO_FLAG_IO_RETRY)
+		flags |= (1 << BIO_RW_FAILFAST);
+
+	error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data,
+	    zio->io_size, zio->io_offset, flags);
+	if (error) {
+		zio->io_error = error;
+		return ZIO_PIPELINE_CONTINUE;
+	}
+
+	return ZIO_PIPELINE_STOP;
+}
+
+/*
+ * Post-I/O hook.  The removal detection described below is compiled
+ * out, so for now any I/O error simply trips the VERIFY.
+ */
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+	/*
+	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
+	 * the device has been removed. If this is the case, then we trigger an
+	 * asynchronous removal of the device. Otherwise, probe the device and
+	 * make sure it's still accessible.
+	 */
+	VERIFY3S(zio->io_error, ==, 0);
+#if 0
+	vdev_disk_t *dvd = vd->vdev_tsd;
+	int state = DKIO_NONE;
+
+	if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
+	    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
+		vd->vdev_remove_wanted = B_TRUE;
+		spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+	}
+#endif
+}
+
+vdev_ops_t vdev_disk_ops = {
+	vdev_disk_open,
+	vdev_disk_close,
+	vdev_default_asize,
+	vdev_disk_io_start,
+	vdev_disk_io_done,
+	NULL,
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
+
+/*
+ * Given the root disk device devid or pathname, read the label from
+ * the device, and construct a configuration nvlist.  Only devpath is
+ * used for now.  Returns 0 with *config set on success, EIDRM with
+ * *config NULL when no label holds a usable configuration, or another
+ * positive errno when the device cannot be opened or reports no size.
+ */
+int
+vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
+{
+	struct block_device *vd_lh;
+	vdev_label_t *label;
+	uint64_t s, size;
+	int i;
+
+	/*
+	 * Read the device label and build the nvlist.
+	 * XXX: Not yet supported
+	 */
+#if 0
+	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
+	    &minor_name) == 0) {
+		error = ldi_open_by_devid(tmpdevid, minor_name, spa_mode,
+		    kcred, &vd_lh, zfs_li);
+		ddi_devid_free(tmpdevid);
+		ddi_devid_str_free(minor_name);
+	}
+#endif
+
+	vd_lh = open_bdev_excl(devpath, MS_RDONLY, NULL);
+	if (IS_ERR(vd_lh))
+		return -PTR_ERR(vd_lh);
+
+	s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE;
+	if (s == 0) {
+		close_bdev_excl(vd_lh);
+		return EIO;
+	}
+
+	size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
+	label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);
+
+	/* Never hand back an uninitialized pointer if every read fails */
+	*config = NULL;
+	for (i = 0; i < VDEV_LABELS; i++) {
+		uint64_t offset, state, txg = 0;
+
+		/* read vdev label */
+		offset = vdev_label_offset(size, i, 0);
+		if (vdev_disk_physio(vd_lh, (caddr_t)label,
+		    VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE +
+		    VDEV_PHYS_SIZE, offset, READ) != 0)
+			continue;
+
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
+			*config = NULL;
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
+			nvlist_free(*config);
+			*config = NULL;
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0) {
+			nvlist_free(*config);
+			*config = NULL;
+			continue;
+		}
+
+		break;
+	}
+
+	vmem_free(label, sizeof(vdev_label_t));
+	close_bdev_excl(vd_lh);
+
+	return (*config == NULL) ? EIDRM : 0;
+}