From 4014bd401fd19111d41b68508c52f22e37cd00b4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 29 Jul 2009 17:24:08 -0700 Subject: [PATCH] Major vdev_disk feature and API (thru 2.6.30) update. Tested under CHAOS4.2, RHEL5, SLES11, and FC11 (all x86_64) Features: Honor spa_mode() when opening the block device. Previously this was ignored and devices were always opened read/write. Integrated DKIOCFLUSHWRITECACHE zio operation with linux WRITE_BARRIER for kernels post 2.6.24 where empty bio requests are supported. For earlier kernels ENOTSUP is returned and no barriers are performed. If RHEL5 based kernels are intended to be supported long term we may need make use of the old akward API. With the addition of WRITE_BARRIER support all writes which were WRITE_SYNC can now be safely made WRITE bios. They will now take advantage of aggregation in the elevator and improved write performance is likely. Notice the ZIO_FLAG_SPECULATIVE flag and pass along the hint to the elevator by using READA instead of READ. This provides the elevator the ability to prioritize the real READs ahead of the speculative IO if needed. Implement an initial version of vdev_disk_io_done() which in the case of an EIO error triggers a media change check. If it determines a media change has occured we fail the device and remove it from the config. This logic I'm sure can be improved further but for now it is an improvement over the VERIFY() that no error will ever happen. APIs: 2.6.22 API change Unused destroy_dirty_buffers arg removed from prototype. 2.6.24 API change Empty write barriers are now supported and we should use them. 2.6.24 API change Size argument dropped from bio_endio and bi_end_io, because the bi_end_io is only called once now when the request is complete. There is no longer any need for a size argument. This also means that partial IO's are no longer possibe and the end_io callback should not check bi->bi_size. Finally, the return type was updated to void. 2.6.28 API change open/close_bdev_excl() renamed to open/close_bdev_exclusive(). 2.6.29 API change BIO_RW_SYNC renamed to BIO_RW_SYNCIO. --- module/zfs/include/sys/vdev_disk.h | 44 +++- module/zfs/vdev_disk.c | 321 +++++++++++++++-------------- 2 files changed, 210 insertions(+), 155 deletions(-) diff --git a/module/zfs/include/sys/vdev_disk.h b/module/zfs/include/sys/vdev_disk.h index 520db0103d..06063d2c3d 100644 --- a/module/zfs/include/sys/vdev_disk.h +++ b/module/zfs/include/sys/vdev_disk.h @@ -10,15 +10,51 @@ extern "C" { #include #include #include +#include typedef struct vdev_disk { - ddi_devid_t vd_devid; - char *vd_minor; - ldi_handle_t vd_lh; + ddi_devid_t vd_devid; + char *vd_minor; + struct block_device *vd_bdev; } vdev_disk_t; -extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int); +extern int vdev_disk_physio(struct block_device *, caddr_t, + size_t, uint64_t, int); extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +/* 2.6.24 API change */ +#ifdef HAVE_2ARGS_BIO_END_IO_T +# define BIO_END_IO_PROTO(fn, x, y, z) static void fn(struct bio *x, int z) +# define BIO_END_IO_RETURN(rc) return +#else +# define BIO_END_IO_PROTO(fn, x, y, z) static int fn(struct bio *x, \ + unsigned int y, int z) +# define BIO_END_IO_RETURN(rc) return rc +#endif /* HAVE_2ARGS_BIO_END_IO_T */ + +/* 2.6.29 API change */ +#ifdef HAVE_BIO_RW_SYNCIO +# define DIO_RW_SYNCIO BIO_RW_SYNCIO +#else +# define DIO_RW_SYNCIO BIO_RW_SYNC +#endif /* HAVE_BIO_RW_SYNCIO */ + +/* 2.6.28 API change */ +#ifdef HAVE_OPEN_BDEV_EXCLUSIVE +# define vdev_bdev_open(path, md, hld) open_bdev_exclusive(path, md, hld) +# define vdev_bdev_close(bdev, md) close_bdev_exclusive(bdev, md) +#else +# define vdev_bdev_open(path, md, hld) open_bdev_excl(path, md, hld) +# define vdev_bdev_close(bdev, md) close_bdev_excl(bdev) +#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ + +/* 2.6.22 API change */ +#ifdef HAVE_1ARG_INVALIDATE_BDEV +# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev) +#else +# define vdev_bdev_invalidate(bdev) invalidate_bdev(bdev, 1) +#endif /* HAVE_1ARG_INVALIDATE_BDEV */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 56357addb4..442ab79dc8 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,7 +30,6 @@ #include #include #include -#include /* * Virtual device vector for disks. @@ -46,58 +45,90 @@ typedef struct dio_request { struct bio *dr_bio[0]; /* Attached bio's */ } dio_request_t; -static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) + +#ifdef HAVE_OPEN_BDEV_EXCLUSIVE +static fmode_t +vdev_bdev_mode(int smode) { - struct block_device *vd_lh; - vdev_disk_t *dvd; + fmode_t mode = 0; + + ASSERT3S(smode & (FREAD | FWRITE), !=, 0); + + if (smode & FREAD) + mode |= FMODE_READ; + + if (smode & FWRITE) + mode |= FMODE_WRITE; + + return mode; +} +#else +static int +vdev_bdev_mode(int smode) +{ + int mode = 0; + + ASSERT3S(smode & (FREAD | FWRITE), !=, 0); + + if ((smode & FREAD) && !(smode & FWRITE)) + mode = MS_RDONLY; + + return mode; +} +#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ + +static int +vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift) +{ + struct block_device *bdev; + vdev_disk_t *vd; + int mode; /* Must have a pathname and it must be absolute. */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + if (v->vdev_path == NULL || v->vdev_path[0] != '/') { + v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return EINVAL; } - dvd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP); - if (dvd == NULL) + vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP); + if (vd == NULL) return ENOMEM; - /* XXX: Since we do not have devid support like Solaris we + /* + * XXX: Since we do not have devid support like Solaris we * currently can't be as clever about opening the right device. * For now we will simply open the device name provided and * fail when it doesn't exist. If your devices get reordered * your going to be screwed, use udev for now to prevent this. - * - * XXX: mode here could be the global spa_mode with a little - * munging of the flags to make then more agreeable to linux. - * However, simply passing a 0 for now gets us W/R behavior. */ - vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd); - if (IS_ERR(vd_lh)) { - kmem_free(dvd, sizeof(vdev_disk_t)); - return -PTR_ERR(vd_lh); + mode = spa_mode(v->vdev_spa); + bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd); + if (IS_ERR(bdev)) { + kmem_free(vd, sizeof(vdev_disk_t)); + return -PTR_ERR(bdev); } - /* XXX: Long term validate stored dvd->vd_devid with a unique + /* + * XXX: Long term validate stored vd->vd_devid with a unique * identifier read from the disk, likely EFI support. */ - vd->vdev_tsd = dvd; - dvd->vd_lh = vd_lh; + v->vdev_tsd = vd; + vd->vd_bdev = bdev; - /* Check if this is a whole device. When vd_lh->bd_contains == - * vd_lh we have a whole device and not simply a partition. */ - vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh); + /* Check if this is a whole device. When bdev->bd_contains == + * bdev we have a whole device and not simply a partition. */ + v->vdev_wholedisk = !!(bdev->bd_contains == bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - vd->vdev_nowritecache = B_FALSE; + v->vdev_nowritecache = B_FALSE; /* Determine the actual size of the device (in bytes) * * XXX: SECTOR_SIZE is defined to 512b which may not be true for * your device, we must use the actual hardware sector size. */ - *psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE; + *psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; /* Based on the minimum sector size set the block size */ *ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1; @@ -106,27 +137,22 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) } static void -vdev_disk_close(vdev_t *vd) +vdev_disk_close(vdev_t *v) { - vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_t *vd = v->vdev_tsd; - if (dvd == NULL) + if (vd == NULL) return; - if (dvd->vd_lh != NULL) - close_bdev_excl(dvd->vd_lh); + if (vd->vd_bdev != NULL) + vdev_bdev_close(vd->vd_bdev, + vdev_bdev_mode(spa_mode(v->vdev_spa))); - kmem_free(dvd, sizeof(vdev_disk_t)); - vd->vdev_tsd = NULL; + kmem_free(vd, sizeof(vdev_disk_t)); + v->vdev_tsd = NULL; } -#ifdef HAVE_2ARGS_BIO_END_IO_T -static void -vdev_disk_physio_completion(struct bio *bio, int rc) -#else -static int -vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) -#endif /* HAVE_2ARGS_BIO_END_IO_T */ +BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, rc) { dio_request_t *dr = bio->bi_private; zio_t *zio; @@ -168,7 +194,7 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) spin_unlock(&dr->dr_lock); /* Synchronous dio cleanup handled by waiter */ - if (dr->dr_rw & (1 << BIO_RW_SYNC)) { + if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) { complete(&dr->dr_comp); } else { for (i = 0; i < dr->dr_bio_count; i++) @@ -186,13 +212,7 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) spin_unlock(&dr->dr_lock); } - rc = 0; - -#ifdef HAVE_2ARGS_BIO_END_IO_T - return; -#else - return rc; -#endif /* HAVE_2ARGS_BIO_END_IO_T */ + BIO_END_IO_RETURN(0); } static struct bio * @@ -222,7 +242,7 @@ bio_map_virt(struct request_queue *q, void *data, bytes = len; VERIFY3P(page = vmalloc_to_page(data), !=, NULL); - VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes); + VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset),==,bytes); data += bytes; len -= bytes; @@ -250,16 +270,19 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) } static int -__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, +__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, size_t kbuf_size, uint64_t kbuf_offset, int flags) { - struct request_queue *q = vd_lh->bd_disk->queue; + struct request_queue *q; dio_request_t *dr; caddr_t bio_ptr; uint64_t bio_offset; int i, j, error = 0, bio_count, bio_size, dio_size; ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0); + q = bdev_get_queue(bdev); + if (!q) + return ENXIO; bio_count = (kbuf_size / (q->max_hw_sectors << 9)) + 1; dio_size = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count; @@ -271,13 +294,10 @@ __vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, spin_lock_init(&dr->dr_lock); dr->dr_ref = 0; dr->dr_zio = zio; - dr->dr_rw = READ; + dr->dr_rw = flags; dr->dr_error = 0; dr->dr_bio_count = bio_count; - if (flags & (1 << BIO_RW)) - dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; - #ifdef BIO_RW_FAILFAST if (flags & (1 << BIO_RW_FAILFAST)) dr->dr_rw |= 1 << BIO_RW_FAILFAST; @@ -305,7 +325,7 @@ __vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, return error; } - dr->dr_bio[i]->bi_bdev = vd_lh; + dr->dr_bio[i]->bi_bdev = bdev; dr->dr_bio[i]->bi_sector = bio_offset >> 9; dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; dr->dr_bio[i]->bi_private = dr; @@ -320,12 +340,14 @@ __vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, submit_bio(dr->dr_rw, dr->dr_bio[i]); /* - * On syncronous blocking requests we wait for all bio the completion + * On synchronous blocking requests we wait for all bio the completion * callbacks to run. We will be woken when the last callback runs * for this dio. We are responsible for freeing the dio_request_t as - * well as the final reference on all attached bios. + * well as the final reference on all attached bios. Currently, the + * only synchronous consumer is vdev_disk_read_rootlabel() all other + * IO originating from vdev_disk_io_start() is asynchronous. */ - if (dr->dr_rw & (1 << BIO_RW_SYNC)) { + if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) { wait_for_completion(&dr->dr_comp); ASSERT(dr->dr_ref == 0); error = dr->dr_error; @@ -340,87 +362,90 @@ __vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, } int -vdev_disk_physio(ldi_handle_t vd_lh, caddr_t kbuf, +vdev_disk_physio(struct block_device *bdev, caddr_t kbuf, size_t size, uint64_t offset, int flags) { - return __vdev_disk_physio(vd_lh, NULL, kbuf, size, offset, flags); + return __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags); } -#if 0 -/* XXX: Not yet supported */ -static void -vdev_disk_ioctl_done(void *zio_arg, int error) +/* 2.6.24 API change */ +#ifdef HAVE_BIO_EMPTY_BARRIER +BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc) { - zio_t *zio = zio_arg; + zio_t *zio = bio->bi_private; - zio->io_error = error; + zio->io_error = -rc; + if (rc && (rc == -EOPNOTSUPP)) + zio->io_vd->vdev_nowritecache = B_TRUE; + bio_put(bio); zio_interrupt(zio); + + BIO_END_IO_RETURN(0); } -#endif + +static int +vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) +{ + struct request_queue *q; + struct bio *bio; + + q = bdev_get_queue(bdev); + if (!q) + return ENXIO; + + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return ENOMEM; + + bio->bi_end_io = vdev_disk_io_flush_completion; + bio->bi_private = zio; + bio->bi_bdev = bdev; + submit_bio(WRITE_BARRIER, bio); + + return 0; +} +#else +static int +vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) +{ + return ENOTSUP; +} +#endif /* HAVE_BIO_EMPTY_BARRIER */ static int vdev_disk_io_start(zio_t *zio) { - vdev_t *vd = zio->io_vd; - vdev_disk_t *dvd = vd->vdev_tsd; + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; int flags, error; - if (zio->io_type == ZIO_TYPE_IOCTL) { + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: - /* XXPOLICY */ - if (!vdev_readable(vd)) { + if (!vdev_readable(v)) { zio->io_error = ENXIO; return ZIO_PIPELINE_CONTINUE; } switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; - if (vd->vdev_nowritecache) { + if (v->vdev_nowritecache) { zio->io_error = ENOTSUP; break; } -#if 0 - /* XXX: Not yet supported */ - vdev_disk_t *dvd = vd->vdev_tsd; - - zio->io_dk_callback.dkc_callback = vdev_disk_ioctl_done; - zio->io_dk_callback.dkc_flag = FLUSH_VOLATILE; - zio->io_dk_callback.dkc_cookie = zio; - - error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, - (uintptr_t)&zio->io_dk_callback, - FKIOCTL, kcred, NULL); - - if (error == 0) { - /* - * The ioctl will be done asychronously, - * and will call vdev_disk_ioctl_done() - * upon completion. - */ + error = vdev_disk_io_flush(vd->vd_bdev, zio); + if (error == 0) return ZIO_PIPELINE_STOP; - } -#else - error = ENOTSUP; -#endif - if (error == ENOTSUP || error == ENOTTY) { - /* - * If we get ENOTSUP or ENOTTY, we know that - * no future attempts will ever succeed. - * In this case we set a persistent bit so - * that we don't bother with the ioctl in the - * future. - */ - vd->vdev_nowritecache = B_TRUE; - } zio->io_error = error; + if (error == ENOTSUP) + v->vdev_nowritecache = B_TRUE; break; @@ -429,20 +454,30 @@ vdev_disk_io_start(zio_t *zio) } return ZIO_PIPELINE_CONTINUE; + + case ZIO_TYPE_WRITE: + flags = WRITE; + break; + + case ZIO_TYPE_READ: + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + flags = READA; + else + flags = READ; + + break; + + default: + zio->io_error = ENOTSUP; + return ZIO_PIPELINE_CONTINUE; } - /* - * B_BUSY XXX: Not supported - * B_NOCACHE XXX: Not supported - */ - flags = ((zio->io_type == ZIO_TYPE_READ) ? READ : WRITE); - #ifdef BIO_RW_FAILFAST - if (zio->io_flags & ZIO_FLAG_IO_RETRY) + if (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) flags |= (1 << BIO_RW_FAILFAST); #endif /* BIO_RW_FAILFAST */ - error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, zio->io_size, zio->io_offset, flags); if (error) { zio->io_error = error; @@ -456,22 +491,20 @@ static void vdev_disk_io_done(zio_t *zio) { /* - * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if - * the device has been removed. If this is the case, then we trigger an - * asynchronous removal of the device. Otherwise, probe the device and - * make sure it's still accessible. + * If the device returned EIO, we revalidate the media. If it is + * determined the media has changed this triggers the asynchronous + * removal of the device from the configuration. */ - VERIFY3S(zio->io_error, ==, 0); -#if 0 - vdev_disk_t *dvd = vd->vdev_tsd; - int state = DKIO_NONE; + if (zio->io_error == EIO) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; - if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, - FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { - vd->vdev_remove_wanted = B_TRUE; + if (check_disk_change(vd->vd_bdev)) { + vdev_bdev_invalidate(vd->vd_bdev); + v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } -#endif + } } vdev_ops_t vdev_disk_ops = { @@ -492,32 +525,18 @@ vdev_ops_t vdev_disk_ops = { int vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) { - struct block_device *vd_lh; + struct block_device *bdev; vdev_label_t *label; uint64_t s, size; int i; - /* - * Read the device label and build the nvlist. - * XXX: Not yet supported - */ -#if 0 - if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, - &minor_name) == 0) { - error = ldi_open_by_devid(tmpdevid, minor_name, spa_mode, - kcred, &vd_lh, zfs_li); - ddi_devid_free(tmpdevid); - ddi_devid_str_free(minor_name); - } -#endif + bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL); + if (IS_ERR(bdev)) + return -PTR_ERR(bdev); - vd_lh = open_bdev_excl(devpath, MS_RDONLY, NULL); - if (IS_ERR(vd_lh)) - return -PTR_ERR(vd_lh); - - s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE; + s = get_capacity(bdev->bd_disk) * SECTOR_SIZE; if (s == 0) { - close_bdev_excl(vd_lh); + vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); return EIO; } @@ -529,8 +548,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* read vdev label */ offset = vdev_label_offset(size, i, 0); - if (vdev_disk_physio(vd_lh, (caddr_t)label, - VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ) != 0) + if (vdev_disk_physio(bdev, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0) continue; if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, @@ -557,7 +576,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) } vmem_free(label, sizeof(vdev_label_t)); - close_bdev_excl(vd_lh); + vdev_bdev_close(bdev, vdev_bdev_mode(FREAD)); return 0; }