diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 94686d2cfd..1c4c37a62a 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -35,18 +35,21 @@ * Virtual device vector for disks. */ typedef struct dio_request { - struct completion dr_comp; - atomic_t dr_ref; - zio_t *dr_zio; - int dr_error; + struct completion dr_comp; /* Completion for sync IO */ + spinlock_t dr_lock; /* Completion lock */ + zio_t *dr_zio; /* Parent ZIO */ + int dr_ref; /* Outstanding bio count */ + int dr_rw; /* Read/Write */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[0]; /* Attached bio's */ } dio_request_t; static int -vdev_disk_open_common(vdev_t *vd) +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { + struct block_device *vd_lh; vdev_disk_t *dvd; - struct block_device *bdev; - int mode = 0; /* Must have a pathname and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { @@ -68,77 +71,37 @@ vdev_disk_open_common(vdev_t *vd) * munging of the flags to make then more agreeable to linux. * However, simply passing a 0 for now gets us W/R behavior. */ - bdev = open_bdev_excl(vd->vdev_path, mode, dvd); - if (IS_ERR(bdev)) { + vd_lh = open_bdev_excl(vd->vdev_path, 0, dvd); + if (IS_ERR(vd_lh)) { kmem_free(dvd, sizeof(vdev_disk_t)); - return -PTR_ERR(bdev); + return -PTR_ERR(vd_lh); } /* XXX: Long term validate stored dvd->vd_devid with a unique - * identifier read from the disk. + * identifier read from the disk, likely EFI support. */ - dvd->vd_lh = bdev; vd->vdev_tsd = dvd; + dvd->vd_lh = vd_lh; - return 0; -} + /* Check if this is a whole device. When vd_lh->bd_contains == + * vd_lh we have a whole device and not simply a partition. */ + vd->vdev_wholedisk = !!(vd_lh->bd_contains == vd_lh); -static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) -{ - vdev_disk_t *dvd; - struct block_device *bdev; - int error; - - error = vdev_disk_open_common(vd); - if (error) - return error; - - dvd = vd->vdev_tsd; - bdev = dvd->vd_lh; + /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ + vd->vdev_nowritecache = B_FALSE; /* Determine the actual size of the device (in bytes) * * XXX: SECTOR_SIZE is defined to 512b which may not be true for * your device, we must use the actual hardware sector size. */ - *psize = get_capacity(bdev->bd_disk) * SECTOR_SIZE; - - /* Check if this is a whole device and if it is try and - * enable the write cache, it is OK if this fails. */ - if (bdev->bd_contains == bdev) { - int wce = 1; - - vd->vdev_wholedisk = 1ULL; - - /* XXX: Different methods are needed for an IDE vs SCSI disk. - * Since we're not sure what type of disk this is try IDE, - * if that fails try SCSI. - */ - error = ioctl_by_bdev(bdev, HDIO_SET_WCACHE, (unsigned long)&wce); - if (error) - dprintf("Unable to enable IDE WCE and SCSI WCE " - "not yet supported: %d\n", error); - - /* XXX: To implement the scsi WCE enable we are going to need - * to use the SG_IO ioctl. But that means fully forming the - * SCSI command as the ioctl arg. To get this right I need - * to look at the sdparm source which does this. - */ - error = 0; - } else { - /* Must be a partition, that's fine. */ - vd->vdev_wholedisk = 0; - } + *psize = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE; /* Based on the minimum sector size set the block size */ *ashift = highbit(MAX(SECTOR_SIZE, SPA_MINBLOCKSIZE)) - 1; - /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ - vd->vdev_nowritecache = B_FALSE; - - return error; + return 0; } static void @@ -164,16 +127,16 @@ static int vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) #endif /* HAVE_2ARGS_BIO_END_IO_T */ { - dio_request_t *dr = bio->bi_private; + dio_request_t *dr = bio->bi_private; zio_t *zio; - int error; + int i, error; /* Fatal error but print some useful debugging before asserting */ if (dr == NULL) { printk("FATAL: bio->bi_private == NULL\n" "bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d\n" - "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n", - bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, + "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d\n", + bio->bi_next, bio->bi_flags, bio->bi_rw, bio->bi_vcnt, bio->bi_idx, bio->bi_size, bio->bi_end_io, atomic_read(&bio->bi_cnt)); SBUG(); @@ -189,20 +152,38 @@ vdev_disk_physio_completion(struct bio *bio, unsigned int size, int rc) if (error == 0 && !test_bit(BIO_UPTODATE, &bio->bi_flags)) error = EIO; - zio = dr->dr_zio; - if (zio) { - zio->io_error = error; - zio_interrupt(zio); - } + spin_lock(&dr->dr_lock); - dr->dr_error = error; - atomic_dec(&dr->dr_ref); + dr->dr_ref--; + if (dr->dr_error == 0) + dr->dr_error = error; - if (bio_sync(bio)) { - complete(&dr->dr_comp); + /* + * All bio's attached to this dio request have completed. This + * means it is safe to access the dio outside the spin lock, we + * are assured there will be no racing accesses. + */ + if (dr->dr_ref == 0) { + zio = dr->dr_zio; + spin_unlock(&dr->dr_lock); + + /* Syncronous dio cleanup handled by waiter */ + if (dr->dr_rw & (1 << BIO_RW_SYNC)) { + complete(&dr->dr_comp); + } else { + for (i = 0; i < dr->dr_bio_count; i++) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof(dio_request_t) + + sizeof(struct bio *) * dr->dr_bio_count); + } + + if (zio) { + zio->io_error = dr->dr_error; + zio_interrupt(zio); + } } else { - kmem_free(dr, sizeof(dio_request_t)); - bio_put(bio); + spin_unlock(&dr->dr_lock); } rc = 0; @@ -215,62 +196,43 @@ out: } static struct bio * -__bio_map_vmem(struct request_queue *q, void *data, +bio_map_virt(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; - int offset, i; + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + unsigned int offset, i, data_len = len; + const int nr_pages = end - start; struct page *page; - struct bio *bio; - - bio = bio_alloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - offset = offset_in_page(kaddr); - for (i = 0; i < nr_pages; i++) { - unsigned int bytes = PAGE_SIZE - offset; - - if (len <= 0) - break; - - if (bytes > len) - bytes = len; - - page = vmalloc_to_page(data); - ASSERT(page); /* Expecting virtual linear address */ - - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) - break; - - data += bytes; - len -= bytes; - offset = 0; - bytes = PAGE_SIZE; - } - - return bio; -} - -static struct bio * -bio_map_vmem(struct request_queue *q, void *data, - unsigned int len, gfp_t gfp_mask) -{ struct bio *bio; + int rc; - bio = __bio_map_vmem(q, data, len, gfp_mask); - if (IS_ERR(bio)) - return bio; + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); - if (bio->bi_size != len) { - bio_put(bio); - return ERR_PTR(-EINVAL); + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + VERIFY3P(page = vmalloc_to_page(data), !=, NULL); + VERIFY3U(bio_add_pc_page(q, bio, page, bytes, offset), ==, bytes); + + data += bytes; + len -= bytes; + offset = 0; + bytes = PAGE_SIZE; } - return bio; + VERIFY3U(bio->bi_size, ==, data_len); + return bio; } static struct bio * @@ -281,7 +243,7 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) /* Cleanly map buffer we are passed in to a bio regardless * of if the buffer is a virtual or physical address. */ if (kmem_virt(data)) - bio = bio_map_vmem(q, data, len, gfp_mask); + bio = bio_map_virt(q, data, len, gfp_mask); else bio = bio_map_kern(q, data, len, gfp_mask); @@ -289,63 +251,92 @@ bio_map(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) } static int -__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf, - size_t size, uint64_t offset, int flags) +__vdev_disk_physio(struct block_device *vd_lh, zio_t *zio, caddr_t kbuf_ptr, + size_t kbuf_size, uint64_t kbuf_offset, int flags) { - struct bio *bio; + struct request_queue *q = vd_lh->bd_disk->queue; dio_request_t *dr; - int rw, error = 0; - struct request_queue *q; + caddr_t bio_ptr; + uint64_t bio_offset; + int i, j, error = 0, bio_count, bio_size, dio_size; - ASSERT((offset % SECTOR_SIZE) == 0); /* Sector aligned */ + ASSERT3S(kbuf_offset % SECTOR_SIZE, ==, 0); + ASSERT3S(flags & + ~((1 << BIO_RW) | + (1 << BIO_RW_SYNC) | + (1 << BIO_RW_FAILFAST)), ==, 0); - dr = kmem_alloc(sizeof(dio_request_t), KM_SLEEP); + bio_count = (kbuf_size / (q->max_hw_sectors << 9)) + 1; + dio_size = sizeof(dio_request_t) + sizeof(struct bio *) * bio_count; + dr = kmem_zalloc(dio_size, KM_SLEEP); if (dr == NULL) return ENOMEM; - atomic_set(&dr->dr_ref, 0); - dr->dr_zio = zio; - dr->dr_error = 0; - q = vd_lh->bd_disk->queue; - - bio = bio_map(q, kbuf, size, GFP_NOIO); - if (IS_ERR(bio)) { - kmem_free(dr, sizeof(dio_request_t)); - return -PTR_ERR(bio); - } - - bio->bi_bdev = vd_lh; - bio->bi_sector = offset / SECTOR_SIZE; - bio->bi_end_io = vdev_disk_physio_completion; - bio->bi_private = dr; - init_completion(&dr->dr_comp); - atomic_inc(&dr->dr_ref); + spin_lock_init(&dr->dr_lock); + dr->dr_ref = 0; + dr->dr_zio = zio; + dr->dr_rw = READ; + dr->dr_error = 0; + dr->dr_bio_count = bio_count; if (flags & (1 << BIO_RW)) - rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; - else - rw = READ; + dr->dr_rw = (flags & (1 << BIO_RW_SYNC)) ? WRITE_SYNC : WRITE; if (flags & (1 << BIO_RW_FAILFAST)) - rw |= 1 << BIO_RW_FAILFAST; - - ASSERT3S(flags & ~((1 << BIO_RW) | (1 << BIO_RW_SYNC) | - (1 << BIO_RW_FAILFAST)), ==, 0); - - submit_bio(rw, bio); + dr->dr_rw |= 1 << BIO_RW_FAILFAST; /* - * On syncronous blocking requests we wait for the completion - * callback to wake us. Then we are responsible for freeing - * the dio_request_t as well as dropping the final bio reference. + * When the IO size exceeds the maximum bio size for the request + * queue we are forced to break the IO in multiple bio's and wait + * for them all to complete. Ideally, all pool users will set + * their volume block size to match the maximum request size and + * the common case will be one bio per vdev IO request. */ - if (bio_sync(bio)) { + bio_ptr = kbuf_ptr; + bio_offset = kbuf_offset; + for (i = 0; i < dr->dr_bio_count; i++) { + bio_size = MIN(kbuf_size, q->max_hw_sectors << 9); + + dr->dr_bio[i] = bio_map(q, bio_ptr, bio_size, GFP_NOIO); + if (IS_ERR(dr->dr_bio[i])) { + for (j = 0; j < i; j++) + bio_put(dr->dr_bio[j]); + + error = -PTR_ERR(dr->dr_bio[i]); + kmem_free(dr, dio_size); + return error; + } + + dr->dr_bio[i]->bi_bdev = vd_lh; + dr->dr_bio[i]->bi_sector = bio_offset >> 9; + dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_private = dr; + dr->dr_ref++; + + bio_ptr += bio_size; + bio_offset += bio_size; + kbuf_size -= bio_size; + } + + for (i = 0; i < dr->dr_bio_count; i++) + submit_bio(dr->dr_rw, dr->dr_bio[i]); + + /* + * On syncronous blocking requests we wait for all bio the completion + * callbacks to run. We will be woken when the last callback runs + * for this dio. We are responsible for freeing the dio_request_t as + * well as the final reference on all attached bios. + */ + if (dr->dr_rw & (1 << BIO_RW_SYNC)) { wait_for_completion(&dr->dr_comp); - ASSERT(atomic_read(&dr->dr_ref) == 0); + ASSERT(dr->dr_ref == 0); error = dr->dr_error; - kmem_free(dr, sizeof(dio_request_t)); - bio_put(bio); + + for (i = 0; i < dr->dr_bio_count; i++) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, dio_size); } return error; @@ -379,7 +370,6 @@ vdev_disk_io_start(zio_t *zio) int flags, error; if (zio->io_type == ZIO_TYPE_IOCTL) { - zio_vdev_io_bypass(zio); /* XXPOLICY */ if (!vdev_readable(vd)) { @@ -453,8 +443,12 @@ vdev_disk_io_start(zio_t *zio) if (zio->io_flags & ZIO_FLAG_IO_RETRY) flags |= (1 << BIO_RW_FAILFAST); - __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, - zio->io_size, zio->io_offset, flags); + error = __vdev_disk_physio(dvd->vd_lh, zio, zio->io_data, + zio->io_size, zio->io_offset, flags); + if (error) { + zio->io_error = error; + return ZIO_PIPELINE_CONTINUE; + } return ZIO_PIPELINE_STOP; } @@ -470,7 +464,6 @@ vdev_disk_io_done(zio_t *zio) */ VERIFY3S(zio->io_error, ==, 0); #if 0 - vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; @@ -523,7 +516,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) if (IS_ERR(vd_lh)) return -PTR_ERR(vd_lh); - if ((s = i_size_read(vd_lh->bd_inode)) == 0) { + s = get_capacity(vd_lh->bd_disk) * SECTOR_SIZE; + if (s == 0) { close_bdev_excl(vd_lh); return EIO; }