From 5475aada9474464f973788c1b2fc6216486fb303 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 30 Jul 2015 22:24:36 +0800
Subject: [PATCH] Linux 4.1 compat: loop device on ZFS

Starting from Linux 4.1, iov_iter with bio_vec can be passed into
iter_read/iter_write. Notably, the loop device will pass bio_vec to the
backend filesystem. However, the current ZFS code assumes iovec without
any check, so it will always crash when a loop device is used.

With the restructured uio_t, we can safely pass bio_vec in uio_t with
UIO_BVEC set. The uio* functions are modified to handle the bio_vec case
separately.

The const uio_iov causes some warnings in the xuio-related code, so it is
explicitly cast to non-const there.

Signed-off-by: Chunwei Chen
Signed-off-by: Richard Yao
Signed-off-by: Brian Behlendorf
Closes #3511
Closes #3640
---
 module/zcommon/zfs_uio.c | 205 +++++++++++++++++++++------------------
 module/zfs/dmu.c         |   5 +-
 module/zfs/zfs_vnops.c   |   6 +-
 module/zfs/zpl_file.c    |  81 +++++++++-------
 module/zfs/zpl_inode.c   |   2 +
 5 files changed, 162 insertions(+), 137 deletions(-)

diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
index 90376f2acf..a5634fca0c 100644
--- a/module/zcommon/zfs_uio.c
+++ b/module/zcommon/zfs_uio.c
@@ -35,6 +35,9 @@
  * software developed by the University of California, Berkeley, and its
  * contributors.
  */
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */

 /*
  * The uio support from OpenSolaris has been added as a short term
@@ -46,6 +49,7 @@

 #include
 #include
+#include

 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -53,20 +57,17 @@
  * update to reflect the data which was moved.  Returns 0 on success or
  * a non-zero errno on failure.
  */
-int
-uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+static int
+uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
 {
-    struct iovec *iov;
+    const struct iovec *iov = uio->uio_iov;
+    size_t skip = uio->uio_skip;
     ulong_t cnt;

+    ASSERT3U(skip, <, iov->iov_len);
+
     while (n && uio->uio_resid) {
-        iov = uio->uio_iov;
-        cnt = MIN(iov->iov_len, n);
-        if (cnt == 0l) {
-            uio->uio_iov++;
-            uio->uio_iovcnt--;
-            continue;
-        }
+        cnt = MIN(iov->iov_len - skip, n);
         switch (uio->uio_segflg) {
         case UIO_USERSPACE:
         case UIO_USERISPACE:
@@ -75,22 +76,29 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
              * iov->iov_base = user data pointer
              */
             if (rw == UIO_READ) {
-                if (copy_to_user(iov->iov_base, p, cnt))
+                if (copy_to_user(iov->iov_base+skip, p, cnt))
                     return (EFAULT);
             } else {
-                if (copy_from_user(p, iov->iov_base, cnt))
+                if (copy_from_user(p, iov->iov_base+skip, cnt))
                     return (EFAULT);
             }
             break;
         case UIO_SYSSPACE:
             if (rw == UIO_READ)
-                bcopy(p, iov->iov_base, cnt);
+                bcopy(p, iov->iov_base + skip, cnt);
             else
-                bcopy(iov->iov_base, p, cnt);
+                bcopy(iov->iov_base + skip, p, cnt);
             break;
+        default:
+            ASSERT(0);
         }
-        iov->iov_base += cnt;
-        iov->iov_len -= cnt;
+        skip += cnt;
+        if (skip == iov->iov_len) {
+            skip = 0;
+            uio->uio_iov = (++iov);
+            uio->uio_iovcnt--;
+        }
+        uio->uio_skip = skip;
         uio->uio_resid -= cnt;
         uio->uio_loffset += cnt;
         p = (caddr_t)p + cnt;
@@ -98,6 +106,50 @@
     }
     return (0);
 }
+
+static int
+uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+{
+    const struct bio_vec *bv = uio->uio_bvec;
+    size_t skip = uio->uio_skip;
+    ulong_t cnt;
+
+    ASSERT3U(skip, <, bv->bv_len);
+
+    while (n && uio->uio_resid) {
+        void *paddr;
+        cnt = MIN(bv->bv_len - skip, n);
+
+        paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
+        if (rw == UIO_READ)
+            bcopy(p, paddr + bv->bv_offset + skip, cnt);
+        else
+            bcopy(paddr + bv->bv_offset + skip, p, cnt);
+        zfs_kunmap_atomic(paddr, KM_USER1);
+
+        skip += cnt;
+        if (skip == bv->bv_len) {
+            skip = 0;
+            uio->uio_bvec = (++bv);
+            uio->uio_iovcnt--;
+        }
+        uio->uio_skip = skip;
+        uio->uio_resid -= cnt;
+        uio->uio_loffset += cnt;
+        p = (caddr_t)p + cnt;
+        n -= cnt;
+    }
+    return (0);
+}
+
+int
+uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
+{
+    if (uio->uio_segflg != UIO_BVEC)
+        return (uiomove_iov(p, n, rw, uio));
+    else
+        return (uiomove_bvec(p, n, rw, uio));
+}
 EXPORT_SYMBOL(uiomove);

 #define fuword8(uptr, vptr) get_user((*vptr), (uptr))
@@ -111,39 +163,39 @@ EXPORT_SYMBOL(uiomove);
 void
 uio_prefaultpages(ssize_t n, struct uio *uio)
 {
-    struct iovec *iov;
+    const struct iovec *iov;
     ulong_t cnt, incr;
     caddr_t p;
     uint8_t tmp;
     int iovcnt;
+    size_t skip = uio->uio_skip;
+
+    /* no need to fault in kernel pages */
+    switch (uio->uio_segflg) {
+    case UIO_SYSSPACE:
+    case UIO_BVEC:
+        return;
+    case UIO_USERSPACE:
+    case UIO_USERISPACE:
+        break;
+    default:
+        ASSERT(0);
+    }

     iov = uio->uio_iov;
     iovcnt = uio->uio_iovcnt;
+    ASSERT3U(skip, <, iov->iov_len);

     while ((n > 0) && (iovcnt > 0)) {
-        cnt = MIN(iov->iov_len, n);
-        if (cnt == 0) {
-            /* empty iov entry */
-            iov++;
-            iovcnt--;
-            continue;
-        }
+        cnt = MIN(iov->iov_len - skip, n);
         n -= cnt;
         /*
          * touch each page in this segment.
          */
-        p = iov->iov_base;
+        p = iov->iov_base + skip;
         while (cnt) {
-            switch (uio->uio_segflg) {
-            case UIO_USERSPACE:
-            case UIO_USERISPACE:
-                if (fuword8((uint8_t *) p, &tmp))
-                    return;
-                break;
-            case UIO_SYSSPACE:
-                bcopy(p, &tmp, 1);
-                break;
-            }
+            if (fuword8((uint8_t *) p, &tmp))
+                return;
             incr = MIN(cnt, PAGESIZE);
             p += incr;
             cnt -= incr;
@@ -152,18 +204,11 @@
          * touch the last byte in case it straddles a page.
          */
         p--;
-        switch (uio->uio_segflg) {
-        case UIO_USERSPACE:
-        case UIO_USERISPACE:
-            if (fuword8((uint8_t *) p, &tmp))
-                return;
-            break;
-        case UIO_SYSSPACE:
-            bcopy(p, &tmp, 1);
-            break;
-        }
+        if (fuword8((uint8_t *) p, &tmp))
+            return;
         iov++;
         iovcnt--;
+        skip = 0;
     }
 }
 EXPORT_SYMBOL(uio_prefaultpages);
@@ -175,49 +220,13 @@ EXPORT_SYMBOL(uio_prefaultpages);
 int
 uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
 {
-    struct iovec *iov;
-    ulong_t cnt;
-    int iovcnt;
+    struct uio uio_copy;
+    int ret;

-    iovcnt = uio->uio_iovcnt;
-    *cbytes = 0;
-
-    for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) {
-        cnt = MIN(iov->iov_len, n);
-        if (cnt == 0)
-            continue;
-
-        switch (uio->uio_segflg) {
-
-        case UIO_USERSPACE:
-        case UIO_USERISPACE:
-            /*
-             * p = kernel data pointer
-             * iov->iov_base = user data pointer
-             */
-            if (rw == UIO_READ) {
-                /* UIO_READ = copy data from kernel to user */
-                if (copy_to_user(iov->iov_base, p, cnt))
-                    return (EFAULT);
-            } else {
-                /* UIO_WRITE = copy data from user to kernel */
-                if (copy_from_user(p, iov->iov_base, cnt))
-                    return (EFAULT);
-            }
-            break;
-
-        case UIO_SYSSPACE:
-            if (rw == UIO_READ)
-                bcopy(p, iov->iov_base, cnt);
-            else
-                bcopy(iov->iov_base, p, cnt);
-            break;
-        }
-        p = (caddr_t)p + cnt;
-        n -= cnt;
-        *cbytes += cnt;
-    }
-    return (0);
+    bcopy(uio, &uio_copy, sizeof (struct uio));
+    ret = uiomove(p, n, rw, &uio_copy);
+    *cbytes = uio->uio_resid - uio_copy.uio_resid;
+    return (ret);
 }
 EXPORT_SYMBOL(uiocopy);

@@ -229,21 +238,23 @@ uioskip(uio_t *uiop, size_t n)
 {
     if (n > uiop->uio_resid)
         return;
-    while (n != 0) {
-        iovec_t *iovp = uiop->uio_iov;
-        size_t niovb = MIN(iovp->iov_len, n);

-        if (niovb == 0) {
+    uiop->uio_skip += n;
+    if (uiop->uio_segflg != UIO_BVEC) {
+        while (uiop->uio_skip >= uiop->uio_iov->iov_len) {
+            uiop->uio_skip -= uiop->uio_iov->iov_len;
             uiop->uio_iov++;
             uiop->uio_iovcnt--;
-            continue;
         }
-        iovp->iov_base += niovb;
-        uiop->uio_loffset += niovb;
-        iovp->iov_len -= niovb;
-        uiop->uio_resid -= niovb;
-        n -= niovb;
+    } else {
+        while (uiop->uio_skip >= uiop->uio_bvec->bv_len) {
+            uiop->uio_skip -= uiop->uio_bvec->bv_len;
+            uiop->uio_bvec++;
+            uiop->uio_iovcnt--;
+        }
     }
+    uiop->uio_loffset += n;
+    uiop->uio_resid -= n;
 }
 EXPORT_SYMBOL(uioskip);
 #endif /* _KERNEL */
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index eb3bc0ed28..ac7499d017 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */

 #include
@@ -933,7 +934,7 @@ dmu_xuio_init(xuio_t *xuio, int nblk)
     priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
     priv->cnt = nblk;
     priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
-    priv->iovp = uio->uio_iov;
+    priv->iovp = (iovec_t *)uio->uio_iov;
     XUIO_XUZC_PRIV(xuio) = priv;

     if (XUIO_XUZC_RW(xuio) == UIO_READ)
@@ -974,7 +975,7 @@ dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
     ASSERT(i < priv->cnt);
     ASSERT(off + n <= arc_buf_size(abuf));

-    iov = uio->uio_iov + i;
+    iov = (iovec_t *)uio->uio_iov + i;
     iov->iov_base = (char *)abuf->b_data + off;
     iov->iov_len = n;
     priv->bufs[i] = abuf;
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 1d23d6db3b..5e5f3c8db7 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */

 /* Portions Copyright 2007 Jeremy Teo */
@@ -591,10 +592,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
     int max_blksz = zsb->z_max_blksz;
     int error = 0;
     arc_buf_t *abuf;
-    iovec_t *aiov = NULL;
+    const iovec_t *aiov = NULL;
     xuio_t *xuio = NULL;
     int i_iov = 0;
-    iovec_t *iovp = uio->uio_iov;
+    const iovec_t *iovp = uio->uio_iov;
     int write_eof;
     int count = 0;
     sa_bulk_attr_t bulk[4];
@@ -714,6 +715,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)

         if (xuio && abuf == NULL) {
             ASSERT(i_iov < iovcnt);
+            ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
             aiov = &iovp[i_iov];
             abuf = dmu_xuio_arcbuf(xuio, i_iov);
             dmu_xuio_clear(xuio, i_iov);
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index 5471140122..a23bc7d8dd 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */


@@ -202,17 +203,18 @@ zpl_aio_fsync(struct kiocb *kiocb, int datasync)
 #error "Unsupported fops->fsync() implementation"
 #endif

-static inline ssize_t
+static ssize_t
 zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
-    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment,
-    int flags, cred_t *cr)
+    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+    cred_t *cr, size_t skip)
 {
     ssize_t read;
     uio_t uio;
     int error;
     fstrans_cookie_t cookie;

-    uio.uio_iov = (struct iovec *)iovp;
+    uio.uio_iov = iovp;
+    uio.uio_skip = skip;
     uio.uio_resid = count;
     uio.uio_iovcnt = nr_segs;
     uio.uio_loffset = *ppos;
@@ -242,7 +244,7 @@ zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
     iov.iov_len = len;

     return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
-        flags, cr));
+        flags, cr, 0));
 }

 static ssize_t
@@ -261,24 +263,17 @@ zpl_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)

 static ssize_t
 zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
-    unsigned long nr_segs, size_t count)
+    unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
 {
     cred_t *cr = CRED();
     struct file *filp = kiocb->ki_filp;
     ssize_t read;
-    size_t alloc_size = sizeof (struct iovec) * nr_segs;
-    struct iovec *iov_tmp = kmem_alloc(alloc_size, KM_SLEEP);
-    bcopy(iovp, iov_tmp, alloc_size);
-
-    ASSERT(iovp);

     crhold(cr);
-    read = zpl_read_common_iovec(filp->f_mapping->host, iov_tmp, count,
-        nr_segs, &kiocb->ki_pos, UIO_USERSPACE, filp->f_flags, cr);
+    read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
+        nr_segs, &kiocb->ki_pos, seg, filp->f_flags, cr, skip);
     crfree(cr);

-    kmem_free(iov_tmp, alloc_size);
-
     return (read);
 }

@@ -286,22 +281,32 @@ zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
 static ssize_t
 zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
 {
-    return (zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
-        iov_iter_count(to)));
+    ssize_t ret;
+    uio_seg_t seg = UIO_USERSPACE;
+    if (to->type & ITER_KVEC)
+        seg = UIO_SYSSPACE;
+    if (to->type & ITER_BVEC)
+        seg = UIO_BVEC;
+    ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
+        iov_iter_count(to), seg, to->iov_offset);
+    if (ret > 0)
+        iov_iter_advance(to, ret);
+    return (ret);
 }
 #else
 static ssize_t
 zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
     unsigned long nr_segs, loff_t pos)
 {
-    return (zpl_iter_read_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes));
+    return (zpl_iter_read_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes,
+        UIO_USERSPACE, 0));
 }
 #endif /* HAVE_VFS_RW_ITERATE */

-static inline ssize_t
+static ssize_t
 zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
-    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment,
-    int flags, cred_t *cr)
+    unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+    cred_t *cr, size_t skip)
 {
     ssize_t wrote;
     uio_t uio;
@@ -311,7 +316,8 @@ zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
     if (flags & O_APPEND)
         *ppos = i_size_read(ip);

-    uio.uio_iov = (struct iovec *)iovp;
+    uio.uio_iov = iovp;
+    uio.uio_skip = skip;
     uio.uio_resid = count;
     uio.uio_iovcnt = nr_segs;
     uio.uio_loffset = *ppos;
@@ -340,7 +346,7 @@ zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
     iov.iov_len = len;

     return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
-        flags, cr));
+        flags, cr, 0));
 }

 static ssize_t
@@ -359,24 +365,17 @@ zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)

 static ssize_t
 zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
-    unsigned long nr_segs, size_t count)
+    unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
 {
     cred_t *cr = CRED();
     struct file *filp = kiocb->ki_filp;
     ssize_t wrote;
-    size_t alloc_size = sizeof (struct iovec) * nr_segs;
-    struct iovec *iov_tmp = kmem_alloc(alloc_size, KM_SLEEP);
-    bcopy(iovp, iov_tmp, alloc_size);
-
-    ASSERT(iovp);

     crhold(cr);
-    wrote = zpl_write_common_iovec(filp->f_mapping->host, iov_tmp, count,
-        nr_segs, &kiocb->ki_pos, UIO_USERSPACE, filp->f_flags, cr);
+    wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
+        nr_segs, &kiocb->ki_pos, seg, filp->f_flags, cr, skip);
     crfree(cr);

-    kmem_free(iov_tmp, alloc_size);
-
     return (wrote);
 }

@@ -384,15 +383,25 @@ zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
 static ssize_t
 zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
 {
-    return (zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
-        iov_iter_count(from)));
+    ssize_t ret;
+    uio_seg_t seg = UIO_USERSPACE;
+    if (from->type & ITER_KVEC)
+        seg = UIO_SYSSPACE;
+    if (from->type & ITER_BVEC)
+        seg = UIO_BVEC;
+    ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
+        iov_iter_count(from), seg, from->iov_offset);
+    if (ret > 0)
+        iov_iter_advance(from, ret);
+    return (ret);
 }
 #else
 static ssize_t
 zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
     unsigned long nr_segs, loff_t pos)
 {
-    return (zpl_iter_write_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes));
+    return (zpl_iter_write_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes,
+        UIO_USERSPACE, 0));
 }
 #endif /* HAVE_VFS_RW_ITERATE */

diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
index 70b5e12399..e81a3cd047 100644
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */


@@ -371,6 +372,7 @@ zpl_follow_link(struct dentry *dentry, void **symlink_cookie)

     uio.uio_iov = &iov;
     uio.uio_iovcnt = 1;
+    uio.uio_skip = 0;
     uio.uio_resid = (MAXPATHLEN - 1);
     uio.uio_segflg = UIO_SYSSPACE;
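
Editor's note: the bookkeeping the patch introduces -- one cursor structure that can carry either an iovec list or a bio_vec list, a uio_skip offset into the first segment, and a uiomove() that dispatches on the segment type -- can be modelled outside the kernel. The sketch below is illustrative only: demo_uio, demo_bvec, demo_uiomove and the other demo_* names are hypothetical stand-ins for uio_t, bio_vec and uiomove_iov()/uiomove_bvec(), and the kernel's copy_to_user()/kmap_atomic() paths are reduced to plain memcpy(), so it shows the cursor arithmetic rather than the ZFS implementation.

/* Userspace model of the uio_skip/UIO_BVEC cursor; all demo_* names are hypothetical. */
#include <stdio.h>
#include <string.h>

enum demo_seg { DEMO_IOVEC, DEMO_BVEC };
enum demo_rw { DEMO_READ, DEMO_WRITE };	/* DEMO_READ: buffer -> segments */

struct demo_iovec { void *iov_base; size_t iov_len; };
struct demo_bvec { char *bv_page; size_t bv_len; size_t bv_offset; };

struct demo_uio {
    enum demo_seg uio_segflg;
    const struct demo_iovec *uio_iov;	/* valid when DEMO_IOVEC */
    const struct demo_bvec *uio_bvec;	/* valid when DEMO_BVEC */
    int uio_iovcnt;
    size_t uio_skip;			/* offset into the first segment */
    size_t uio_resid;
};

/* Mirrors uiomove_iov(): walk the iovecs without touching iov_base/iov_len. */
static int
demo_uiomove_iov(void *p, size_t n, enum demo_rw rw, struct demo_uio *uio)
{
    const struct demo_iovec *iov = uio->uio_iov;
    size_t skip = uio->uio_skip;

    while (n && uio->uio_resid) {
        size_t cnt = iov->iov_len - skip;
        if (cnt > n)
            cnt = n;
        if (rw == DEMO_READ)
            memcpy((char *)iov->iov_base + skip, p, cnt);
        else
            memcpy(p, (char *)iov->iov_base + skip, cnt);
        skip += cnt;
        if (skip == iov->iov_len) {	/* segment fully consumed */
            skip = 0;
            uio->uio_iov = ++iov;
            uio->uio_iovcnt--;
        }
        uio->uio_skip = skip;
        uio->uio_resid -= cnt;
        p = (char *)p + cnt;
        n -= cnt;
    }
    return (0);
}

/* Mirrors uiomove_bvec(): the same cursor logic over (page, offset, len) triples. */
static int
demo_uiomove_bvec(void *p, size_t n, enum demo_rw rw, struct demo_uio *uio)
{
    const struct demo_bvec *bv = uio->uio_bvec;
    size_t skip = uio->uio_skip;

    while (n && uio->uio_resid) {
        char *paddr = bv->bv_page + bv->bv_offset + skip;
        size_t cnt = bv->bv_len - skip;
        if (cnt > n)
            cnt = n;
        if (rw == DEMO_READ)
            memcpy(paddr, p, cnt);
        else
            memcpy(p, paddr, cnt);
        skip += cnt;
        if (skip == bv->bv_len) {
            skip = 0;
            uio->uio_bvec = ++bv;
            uio->uio_iovcnt--;
        }
        uio->uio_skip = skip;
        uio->uio_resid -= cnt;
        p = (char *)p + cnt;
        n -= cnt;
    }
    return (0);
}

/* Mirrors the new uiomove(): callers never look at the segment type. */
static int
demo_uiomove(void *p, size_t n, enum demo_rw rw, struct demo_uio *uio)
{
    if (uio->uio_segflg != DEMO_BVEC)
        return (demo_uiomove_iov(p, n, rw, uio));
    return (demo_uiomove_bvec(p, n, rw, uio));
}

int
main(void)
{
    char page[9] = "........";		/* stands in for a mapped page */
    struct demo_bvec bv[2] = {
        { page, 3, 1 },			/* 3 bytes at offset 1 */
        { page, 4, 4 }			/* 4 bytes at offset 4 */
    };
    struct demo_uio uio = {
        .uio_segflg = DEMO_BVEC, .uio_bvec = bv,
        .uio_iovcnt = 2, .uio_skip = 0, .uio_resid = 7
    };
    char src[] = "ABCDEFG";

    demo_uiomove(src, 7, DEMO_READ, &uio);
    printf("%s\n", page);		/* prints ".ABCDEFG" */
    return (0);
}

The design point this mirrors is why uiocopy() in the patch can shrink to a bcopy of the uio plus a uiomove(): because the cursor advances through segments via uio_skip instead of mutating iov_base/iov_len, the segment array can stay const and a copied cursor can be consumed without disturbing the caller's uio.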