zfs/module/zfs/vdev_disk.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>

/*
 * Virtual device vector for disks.
 */
/*
 * Tracking state for one vdev disk I/O that has been split into one or
 * more bio's.  The structure is allocated with room for dr_bio_count
 * trailing bio pointers.
 */
typedef struct dio_request {
	struct completion	dr_comp;	/* Signalled for sync IO */
	spinlock_t		dr_lock;	/* Guards dr_ref/dr_error */
	zio_t			*dr_zio;	/* Owning ZIO, may be NULL */
	int			dr_ref;		/* Bio's not yet completed */
	int			dr_rw;		/* Flags given to submit_bio */
	int			dr_error;	/* First bio error recorded */
	int			dr_bio_count;	/* Entries in dr_bio[] */
	struct bio		*dr_bio[0];	/* Bio's making up the IO */
} dio_request_t;


#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
/*
 * Translate the FREAD/FWRITE bits of an spa open mode into the
 * equivalent FMODE_READ/FMODE_WRITE bits.  At least one of the two
 * bits must be set in smode.
 */
static fmode_t
vdev_bdev_mode(int smode)
{
	const int want_read = (smode & FREAD) != 0;
	const int want_write = (smode & FWRITE) != 0;
	fmode_t fmode = 0;

	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	if (want_write)
		fmode |= FMODE_WRITE;

	if (want_read)
		fmode |= FMODE_READ;

	return fmode;
}
#else
/*
 * Translate the FREAD/FWRITE bits of an spa open mode into mount-style
 * flags: MS_RDONLY when only FREAD is requested, otherwise 0.  At least
 * one of the two bits must be set in smode.
 */
static int
vdev_bdev_mode(int smode)
{
	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	/* Anything that is not a pure read request maps to no flags. */
	if (!(smode & FREAD) || (smode & FWRITE))
		return 0;

	return MS_RDONLY;
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */

/*
 * Open the block device named by v->vdev_path and attach it to the vdev.
 *
 * On success the newly allocated vdev_disk_t is stored in v->vdev_tsd,
 * *psize is set to the device capacity in bytes, *ashift to the log2 of
 * the block size in use, and 0 is returned.  On failure a positive errno
 * is returned and no private state is left attached to the vdev.
 */
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
{
	struct block_device *bdev;
	vdev_disk_t *vd;
	int mode;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return EINVAL;
	}

	vd = kmem_zalloc(sizeof(vdev_disk_t), KM_SLEEP);
	if (vd == NULL)
		return ENOMEM;

	/*
	 * XXX: Since we do not have devid support like Solaris we
	 * currently can't be as clever about opening the right device.
	 * For now we will simply open the device name provided and
	 * fail when it doesn't exist.  If your devices get reordered
	 * you're going to be screwed, use udev for now to prevent this.
	 */
	mode = spa_mode(v->vdev_spa);
	bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
	if (IS_ERR(bdev)) {
		kmem_free(vd, sizeof(vdev_disk_t));
		return -PTR_ERR(bdev);
	}

	/*
	 * XXX: Long term validate stored vd->vd_devid with a unique
	 * identifier read from the disk, likely EFI support.
	 */

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;

	/* Check if this is a whole device.  When bdev->bd_contains ==
	 * bdev we have a whole device and not simply a partition. */
	v->vdev_wholedisk = !!(bdev->bd_contains == bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Determine the actual size of the device (in bytes)
	 *
	 * XXX: SECTOR_SIZE is defined to 512b which may not be true for
	 * your device, we must use the actual hardware sector size.
	 *
	 * The sector count is widened to 64 bits before scaling so the
	 * multiplication cannot wrap when sector_t is only 32 bits wide.
	 */
	*psize = (uint64_t)get_capacity(bdev->bd_disk) * SECTOR_SIZE;

	/* Based on the minimum sector size set the block size */
	*ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1;

	return 0;
}

/*
 * Release the block device (if one is open) and free the private
 * vdev_disk_t hanging off v->vdev_tsd.  A vdev with no private state
 * is left untouched.
 */
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *disk = v->vdev_tsd;

	if (disk != NULL) {
		if (disk->vd_bdev != NULL) {
			vdev_bdev_close(disk->vd_bdev,
			                vdev_bdev_mode(spa_mode(v->vdev_spa)));
		}

		kmem_free(disk, sizeof(vdev_disk_t));
		v->vdev_tsd = NULL;
	}
}

BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, size, rc)
{
	dio_request_t *dr = bio->bi_private;
	zio_t *zio;
	int i, error;

	/* Fatal error but print some useful debugging before asserting */
	if (dr == NULL) {
		printk("FATAL: bio->bi_private == NULL\n"
		       "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n"
		       "bi_idx: %d, bi_size: %d, bi_end_io: %p, bi_cnt: %d\n",
		       bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt,
		       bio->bi_idx, bio->bi_size, bio->bi_end_io,
		       atomic_read(&bio->bi_cnt));
		SBUG();
	}

#ifndef HAVE_2ARGS_BIO_END_IO_T
	if (bio->bi_size)
		return 1;
#endif /* HAVE_2ARGS_BIO_END_IO_T */

	error = rc;
	if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = EIO;

	spin_lock(&dr->dr_lock);

	dr->dr_ref--;
	if (dr->dr_error == 0)
		dr->dr_error = error;

	/*
	 * All bio's attached to this dio request have completed.  This
	 * means it is safe to access the dio outside the spin lock, we
	 * are assured there will be no racing accesses.
	 */
	if (dr->dr_ref == 0) {
		zio = dr->dr_zio;
		spin_unlock(&dr->dr_lock);

		/* Synchronous dio cleanup handled by waiter */
		if (dr->dr_rw & (1 << DIO_RW_SYNCIO)) {
			complete(&dr->dr_comp);
		} else {
			for (i = 0; i < dr->dr_bio_count; i++)
				bio_put(dr->dr_bio[i]);

			kmem_free(dr, sizeof(dio_request_t) +
			          sizeof(struct bio *) * dr->dr_bio_count);
		}

		if (zio) {
			zio->io_error = dr->dr_error;
			zio_interrupt(zio);
		}
	} else {
		spin_unlock(&dr->dr_lock);
	}

	BIO_END_IO_RETURN(0);
}

/*
 * Build a bio describing the buffer [data, data + len), translating the
 * address one page at a time with vmalloc_to_page().
 *
 * Returns the new bio, or ERR_PTR(-ENOMEM) when the bio cannot be
 * allocated.  A page that cannot be translated or added to the bio
 * trips a VERIFY.
 */
static struct bio *
bio_map_virt(struct request_queue *q, void *data,
               unsigned int len, gfp_t gfp_mask)
{
	const unsigned long vaddr = (unsigned long)data;
	const unsigned long first_pg = vaddr >> PAGE_SHIFT;
	const unsigned long last_pg =
	    (vaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	const int page_cnt = last_pg - first_pg;
	const unsigned int total = len;
	unsigned int pg_off = offset_in_page(vaddr);
	unsigned int n;
	struct page *pg;
	struct bio *bio;

	bio = bio_alloc(gfp_mask, page_cnt);
	if (bio == NULL)
		return ERR_PTR(-ENOMEM);

	for (n = 0; n < page_cnt && len != 0; n++) {
		/* Only the first page may start at a non-zero offset. */
		unsigned int chunk = PAGE_SIZE - pg_off;

		if (chunk > len)
			chunk = len;

		pg = vmalloc_to_page(data);
		VERIFY3P(pg, !=, NULL);
		VERIFY3U(bio_add_pc_page(q, bio, pg, chunk, pg_off),==,chunk);

		data += chunk;
		len -= chunk;
		pg_off = 0;
	}

	/* The whole buffer must have been mapped. */
	VERIFY3U(bio->bi_size, ==, total);
	return bio;
}

/*
 * Map the buffer we are passed in to a bio, choosing the mapping helper
 * according to what kmem_virt() reports for the address: bio_map_virt()
 * when it is true, bio_map_kern() otherwise.  The helper's result
 * (a bio or an ERR_PTR) is returned unchanged.
 */
static struct bio *
bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask)
{
	if (kmem_virt(data))
		return bio_map_virt(q, data, len, gfp_mask);

	return bio_map_kern(q, data, len, gfp_mask);
}

/*
 * Issue a read or write of kbuf_size bytes at byte offset kbuf_offset on
 * bdev, splitting the request into as many bio's as the request queue's
 * max_hw_sectors limit requires.
 *
 * bdev        - target block device
 * zio         - zio to complete when all bio's finish, or NULL
 * kbuf_ptr    - start of the data buffer
 * kbuf_size   - length of the buffer in bytes
 * kbuf_offset - device offset in bytes, must be SECTOR_SIZE aligned
 * flags       - passed to submit_bio(); when the DIO_RW_SYNCIO bit is
 *               set the call blocks until every bio has completed
 *
 * Returns a positive errno if the request could not be set up.  For
 * synchronous requests the recorded bio error (0 on success) is
 * returned.  For asynchronous requests 0 is returned once the bio's are
 * submitted and the outcome is delivered by the completion callback.
 */
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
                   size_t kbuf_size, uint64_t kbuf_offset, int flags)
{
	struct request_queue *q;
	dio_request_t *dr;
	caddr_t bio_ptr;
	uint64_t bio_offset;
	size_t bio_max;
	int i, j, error = 0, bio_count, bio_size, dio_size, sync;

	ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0);
	q = bdev_get_queue(bdev);
	if (!q)
		return ENXIO;

	/* Largest bio the queue accepts, in bytes. */
	bio_max = (size_t)q->max_hw_sectors << 9;
	if (bio_max == 0)
		return ENXIO;

	/*
	 * Round up so a buffer that is an exact multiple of bio_max does
	 * not get a trailing zero-length bio.  A zero-length request still
	 * issues a single bio so that the completion callback always runs.
	 */
	bio_count = (kbuf_size + bio_max - 1) / bio_max;
	if (bio_count == 0)
		bio_count = 1;

	dio_size  = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count;
	dr = kmem_zalloc(dio_size, KM_SLEEP);
	if (dr == NULL)
		return ENOMEM;

	init_completion(&dr->dr_comp);
	spin_lock_init(&dr->dr_lock);
	dr->dr_ref = 0;
	dr->dr_zio = zio;
	dr->dr_rw = flags;
	dr->dr_error = 0;
	dr->dr_bio_count = bio_count;

	/* Cached locally; dr may be gone by the time this is tested. */
	sync = flags & (1 << DIO_RW_SYNCIO);

	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO in multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
	bio_ptr = kbuf_ptr;
	bio_offset = kbuf_offset;
	for (i = 0; i < bio_count; i++) {
		bio_size = MIN(kbuf_size, bio_max);

		dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO);
		if (IS_ERR(dr->dr_bio[i])) {
			/* Nothing has been submitted yet, undo and bail. */
			for (j = 0; j < i; j++)
				bio_put(dr->dr_bio[j]);

			error = -PTR_ERR(dr->dr_bio[i]);
			kmem_free(dr, dio_size);
			return error;
		}

		dr->dr_bio[i]->bi_bdev = bdev;
		dr->dr_bio[i]->bi_sector = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		dr->dr_ref++;

		bio_ptr    += bio_size;
		bio_offset += bio_size;
		kbuf_size  -= bio_size;
	}

	/*
	 * Every reference was taken above, so dr stays valid until the
	 * final bio completes.  Once the last bio has been submitted an
	 * asynchronous dr may be freed by the completion callback at any
	 * moment, which is why the loop bound and the flags come from
	 * locals rather than from dr.
	 */
	for (i = 0; i < bio_count; i++)
		submit_bio(flags, dr->dr_bio[i]);

	/*
	 * On synchronous blocking requests we wait for all the bio
	 * completion callbacks to run.  We will be woken when the last
	 * callback runs for this dio.  We are responsible for freeing the
	 * dio_request_t as well as the final reference on all attached
	 * bios.  Currently, the only synchronous consumer is
	 * vdev_disk_read_rootlabel() all other IO originating from
	 * vdev_disk_io_start() is asynchronous.
	 */
	if (sync) {
		wait_for_completion(&dr->dr_comp);
		ASSERT(dr->dr_ref == 0);
		error = dr->dr_error;

		for (i = 0; i < bio_count; i++)
			bio_put(dr->dr_bio[i]);

		kmem_free(dr, dio_size);
	}

	return error;
}

/*
 * Perform disk I/O that is not associated with any zio.  This simply
 * forwards to __vdev_disk_physio() with a NULL zio and returns its
 * result.
 */
int
vdev_disk_physio(struct block_device *bdev, caddr_t kbuf,
		 size_t size, uint64_t offset, int flags)
{
	int rv;

	rv = __vdev_disk_physio(bdev, NULL, kbuf, size, offset, flags);

	return rv;
}

/* 2.6.24 API change */
#ifdef HAVE_BIO_EMPTY_BARRIER
/*
 * Completion callback for the empty barrier bio issued by
 * vdev_disk_io_flush().  The negated rc is stored in the zio, a result
 * of -EOPNOTSUPP marks the vdev as having no write cache, the bio is
 * released and the zio is handed to zio_interrupt().
 */
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, size, rc)
{
	zio_t *flush_zio = bio->bi_private;

	/* Remember that this device rejects cache flushes. */
	if (rc == -EOPNOTSUPP)
		flush_zio->io_vd->vdev_nowritecache = B_TRUE;

	flush_zio->io_error = -rc;

	bio_put(bio);
	zio_interrupt(flush_zio);

	BIO_END_IO_RETURN(0);
}

/*
 * Submit an empty barrier bio to bdev to flush its write cache.  The
 * zio is completed from vdev_disk_io_flush_completion().
 *
 * Returns 0 once the bio has been submitted, ENXIO if the device has no
 * request queue, or ENOMEM if the bio cannot be allocated.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return ENXIO;

	/*
	 * Allocate with GFP_NOIO, as __vdev_disk_physio() does for its
	 * bio's: this runs in the I/O path, so the allocation must not
	 * trigger further I/O in order to reclaim memory.
	 */
	bio = bio_alloc(GFP_NOIO, 0);
	if (!bio)
		return ENOMEM;

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio->bi_bdev = bdev;
	submit_bio(WRITE_BARRIER, bio);

	return 0;
}
#else
/*
 * Fallback used when HAVE_BIO_EMPTY_BARRIER is not defined: no flush is
 * issued and ENOTSUP is always returned.
 */
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	return (ENOTSUP);
}
#endif /* HAVE_BIO_EMPTY_BARRIER */

/*
 * Start the I/O described by zio on its disk vdev.
 *
 * ZIO_TYPE_IOCTL: only DKIOCFLUSHWRITECACHE is supported.  It is skipped
 * when zfs_nocacheflush is set, fails with ENOTSUP when the vdev is
 * marked vdev_nowritecache, and otherwise issues a flush.
 * ZIO_TYPE_READ / ZIO_TYPE_WRITE: the data is handed to
 * __vdev_disk_physio().
 * Any other type fails with ENOTSUP.
 *
 * Returns ZIO_PIPELINE_STOP when the I/O was submitted and will be
 * completed from a bio completion callback, or ZIO_PIPELINE_CONTINUE
 * when the zio was finished here (zio->io_error holds any error).
 */
static int
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int flags, error;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			zio->io_error = ENXIO;
			return ZIO_PIPELINE_CONTINUE;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0)
				return ZIO_PIPELINE_STOP;

			/* Remember unsupported flushes so we stop trying. */
			zio->io_error = error;
			if (error == ENOTSUP)
				v->vdev_nowritecache = B_TRUE;

			break;

		default:
			zio->io_error = ENOTSUP;
		}

		return ZIO_PIPELINE_CONTINUE;

	case ZIO_TYPE_WRITE:
		flags = WRITE;
		break;

	case ZIO_TYPE_READ:
		if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
			flags = READA;
		else
			flags = READ;

		break;

	default:
		zio->io_error = ENOTSUP;
		return ZIO_PIPELINE_CONTINUE;
	}

#ifdef BIO_RW_FAILFAST
	/*
	 * Ask the lower layers to fail fast on a first attempt only.  A
	 * zio that is a retry, or that was asked to try hard, must be
	 * given every chance to succeed and so is issued without the
	 * fail-fast hint.
	 */
	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		flags |= (1 << BIO_RW_FAILFAST);
#endif /* BIO_RW_FAILFAST */

	error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data,
		                   zio->io_size, zio->io_offset, flags);
	if (error) {
		zio->io_error = error;
		return ZIO_PIPELINE_CONTINUE;
	}

	return ZIO_PIPELINE_STOP;
}

/*
 * Post-I/O hook.  Only an EIO result is of interest: the media is then
 * revalidated and, if it is found to have changed, the device is
 * invalidated and its asynchronous removal from the configuration is
 * requested.
 */
static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_t *vdev;
	vdev_disk_t *disk;

	if (zio->io_error != EIO)
		return;

	vdev = zio->io_vd;
	disk = vdev->vdev_tsd;

	/* Media unchanged, nothing further to do. */
	if (!check_disk_change(disk->vd_bdev))
		return;

	vdev_bdev_invalidate(disk->vd_bdev);
	vdev->vdev_remove_wanted = B_TRUE;
	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
}

/*
 * Operations vector for disk vdevs.  The initializer is positional, so
 * the entries must stay in the order in which vdev_ops_t declares its
 * members (the declaration lives outside this file).
 */
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,		/* open the backing block device */
	vdev_disk_close,	/* close it and free the private state */
	vdev_default_asize,	/* shared default, not defined in this file */
	vdev_disk_io_start,	/* start an ioctl/read/write zio */
	vdev_disk_io_done,	/* media change check after EIO */
	NULL,			/* NOTE(review): unused hook; confirm which member this is */
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};

/*
 * Given the root disk device pathname, read the label from the device
 * and construct a configuration nvlist.
 *
 * devpath - path of the block device to open (read-only)
 * devid   - not used by this implementation
 * config  - receives the unpacked nvlist of the first label that
 *           unpacks, has a pool state below POOL_STATE_DESTROYED and a
 *           non-zero pool txg; set to NULL when no label qualifies
 *
 * Returns a positive errno if the device cannot be opened, EIO if it
 * reports zero capacity, and 0 otherwise.  A return of 0 does not by
 * itself mean a label was found: callers must also check *config.
 */
int
vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
{
	struct block_device *bdev;
	vdev_label_t *label;
	uint64_t s, size;
	int i;

	/*
	 * Give the caller a defined result on every path, including the
	 * one where each label read fails and nothing is ever unpacked.
	 */
	*config = NULL;

	bdev = vdev_bdev_open(devpath, vdev_bdev_mode(FREAD), NULL);
	if (IS_ERR(bdev))
		return -PTR_ERR(bdev);

	/* Widen before scaling so a 32-bit sector_t cannot wrap. */
	s = (uint64_t)get_capacity(bdev->bd_disk) * SECTOR_SIZE;
	if (s == 0) {
		vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));
		return EIO;
	}

	size = P2ALIGN_TYPED(s, sizeof(vdev_label_t), uint64_t);
	label = vmem_alloc(sizeof(vdev_label_t), KM_SLEEP);

	for (i = 0; i < VDEV_LABELS; i++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, i, 0);
		if (vdev_disk_physio(bdev, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, READ_SYNC) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		/* Skip labels with a missing or destroyed pool state. */
		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		/* Skip labels with a missing or zero pool txg. */
		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	vmem_free(label, sizeof(vdev_label_t));
	vdev_bdev_close(bdev, vdev_bdev_mode(FREAD));

	return 0;
}