zfs_vnops: make zfs_get_data OS-independent
Move zfs_get_data() in to platform-independent code. The only platform-specific aspect of it is the way we release an inode (Linux) / vnode_t (FreeBSD). I am not aware of a platform that could be supported by ZFS that couldn't implement zfs_rele_async itself. It's sibling zvol_get_data already is platform-independent. Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Christian Schwarz <me@cschwarz.com> Closes #10979
This commit is contained in:
parent
a909190683
commit
ab2110e3e2
|
@ -173,7 +173,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
|
||||||
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
|
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
|
||||||
extern void zfs_znode_free(struct znode *);
|
extern void zfs_znode_free(struct znode *);
|
||||||
|
|
||||||
extern zil_get_data_t zfs_get_data;
|
|
||||||
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
|
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
|
||||||
extern int zfsfstype;
|
extern int zfsfstype;
|
||||||
|
|
||||||
|
|
|
@ -173,7 +173,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw);
|
||||||
extern void zfs_unmap_page(page_t *, caddr_t);
|
extern void zfs_unmap_page(page_t *, caddr_t);
|
||||||
#endif /* HAVE_UIO_RW */
|
#endif /* HAVE_UIO_RW */
|
||||||
|
|
||||||
extern zil_get_data_t zfs_get_data;
|
|
||||||
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
|
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
|
||||||
extern int zfsfstype;
|
extern int zfsfstype;
|
||||||
|
|
||||||
|
|
|
@ -39,4 +39,17 @@ extern int mappedread(znode_t *, int, uio_t *);
|
||||||
extern int mappedread_sf(znode_t *, int, uio_t *);
|
extern int mappedread_sf(znode_t *, int, uio_t *);
|
||||||
extern void update_pages(znode_t *, int64_t, int, objset_t *);
|
extern void update_pages(znode_t *, int64_t, int, objset_t *);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Platform code that asynchronously drops zp's inode / vnode_t.
|
||||||
|
*
|
||||||
|
* Asynchronous dropping ensures that the caller will never drop the
|
||||||
|
* last reference on an inode / vnode_t in the current context.
|
||||||
|
* Doing so while holding open a tx could result in a deadlock if
|
||||||
|
* the platform calls into filesystem again in the implementation
|
||||||
|
* of inode / vnode_t dropping (e.g. call from iput_final()).
|
||||||
|
*/
|
||||||
|
extern void zfs_zrele_async(znode_t *zp);
|
||||||
|
|
||||||
|
extern zil_get_data_t zfs_get_data;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -42,6 +42,7 @@
|
||||||
#include <sys/mount.h>
|
#include <sys/mount.h>
|
||||||
#include <sys/cmn_err.h>
|
#include <sys/cmn_err.h>
|
||||||
#include <sys/zfs_znode.h>
|
#include <sys/zfs_znode.h>
|
||||||
|
#include <sys/zfs_vnops.h>
|
||||||
#include <sys/zfs_dir.h>
|
#include <sys/zfs_dir.h>
|
||||||
#include <sys/zil.h>
|
#include <sys/zil.h>
|
||||||
#include <sys/fs/zfs.h>
|
#include <sys/fs/zfs.h>
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
/* Portions Copyright 2007 Jeremy Teo */
|
/* Portions Copyright 2007 Jeremy Teo */
|
||||||
/* Portions Copyright 2010 Robert Milkowski */
|
/* Portions Copyright 2010 Robert Milkowski */
|
||||||
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/param.h>
|
#include <sys/param.h>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
@ -669,163 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
void
|
||||||
zfs_get_done(zgd_t *zgd, int error)
|
zfs_zrele_async(znode_t *zp)
|
||||||
{
|
{
|
||||||
znode_t *zp = zgd->zgd_private;
|
vnode_t *vp = ZTOV(zp);
|
||||||
objset_t *os = zp->z_zfsvfs->z_os;
|
objset_t *os = ITOZSB(vp)->z_os;
|
||||||
|
|
||||||
if (zgd->zgd_db)
|
VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
|
||||||
dmu_buf_rele(zgd->zgd_db, zgd);
|
|
||||||
|
|
||||||
zfs_rangelock_exit(zgd->zgd_lr);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Release the vnode asynchronously as we currently have the
|
|
||||||
* txg stopped from syncing.
|
|
||||||
*/
|
|
||||||
VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
|
|
||||||
|
|
||||||
kmem_free(zgd, sizeof (zgd_t));
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef ZFS_DEBUG
|
|
||||||
static int zil_fault_io = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Get data to generate a TX_WRITE intent log record.
|
|
||||||
*/
|
|
||||||
int
|
|
||||||
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
|
||||||
{
|
|
||||||
zfsvfs_t *zfsvfs = arg;
|
|
||||||
objset_t *os = zfsvfs->z_os;
|
|
||||||
znode_t *zp;
|
|
||||||
uint64_t object = lr->lr_foid;
|
|
||||||
uint64_t offset = lr->lr_offset;
|
|
||||||
uint64_t size = lr->lr_length;
|
|
||||||
dmu_buf_t *db;
|
|
||||||
zgd_t *zgd;
|
|
||||||
int error = 0;
|
|
||||||
|
|
||||||
ASSERT3P(lwb, !=, NULL);
|
|
||||||
ASSERT3P(zio, !=, NULL);
|
|
||||||
ASSERT3U(size, !=, 0);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Nothing to do if the file has been removed
|
|
||||||
*/
|
|
||||||
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
|
||||||
return (SET_ERROR(ENOENT));
|
|
||||||
if (zp->z_unlinked) {
|
|
||||||
/*
|
|
||||||
* Release the vnode asynchronously as we currently have the
|
|
||||||
* txg stopped from syncing.
|
|
||||||
*/
|
|
||||||
VN_RELE_ASYNC(ZTOV(zp),
|
|
||||||
dsl_pool_zrele_taskq(dmu_objset_pool(os)));
|
|
||||||
return (SET_ERROR(ENOENT));
|
|
||||||
}
|
|
||||||
|
|
||||||
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
|
||||||
zgd->zgd_lwb = lwb;
|
|
||||||
zgd->zgd_private = zp;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Write records come in two flavors: immediate and indirect.
|
|
||||||
* For small writes it's cheaper to store the data with the
|
|
||||||
* log record (immediate); for large writes it's cheaper to
|
|
||||||
* sync the data and get a pointer to it (indirect) so that
|
|
||||||
* we don't have to write the data twice.
|
|
||||||
*/
|
|
||||||
if (buf != NULL) { /* immediate write */
|
|
||||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
|
|
||||||
size, RL_READER);
|
|
||||||
/* test for truncation needs to be done while range locked */
|
|
||||||
if (offset >= zp->z_size) {
|
|
||||||
error = SET_ERROR(ENOENT);
|
|
||||||
} else {
|
|
||||||
error = dmu_read(os, object, offset, size, buf,
|
|
||||||
DMU_READ_NO_PREFETCH);
|
|
||||||
}
|
|
||||||
ASSERT(error == 0 || error == ENOENT);
|
|
||||||
} else { /* indirect write */
|
|
||||||
/*
|
|
||||||
* Have to lock the whole block to ensure when it's
|
|
||||||
* written out and its checksum is being calculated
|
|
||||||
* that no one can change the data. We need to re-check
|
|
||||||
* blocksize after we get the lock in case it's changed!
|
|
||||||
*/
|
|
||||||
for (;;) {
|
|
||||||
uint64_t blkoff;
|
|
||||||
size = zp->z_blksz;
|
|
||||||
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
|
||||||
offset -= blkoff;
|
|
||||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
|
||||||
offset, size, RL_READER);
|
|
||||||
if (zp->z_blksz == size)
|
|
||||||
break;
|
|
||||||
offset += blkoff;
|
|
||||||
zfs_rangelock_exit(zgd->zgd_lr);
|
|
||||||
}
|
|
||||||
/* test for truncation needs to be done while range locked */
|
|
||||||
if (lr->lr_offset >= zp->z_size)
|
|
||||||
error = SET_ERROR(ENOENT);
|
|
||||||
#ifdef ZFS_DEBUG
|
|
||||||
if (zil_fault_io) {
|
|
||||||
error = SET_ERROR(EIO);
|
|
||||||
zil_fault_io = 0;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (error == 0)
|
|
||||||
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
|
||||||
DMU_READ_NO_PREFETCH);
|
|
||||||
|
|
||||||
if (error == 0) {
|
|
||||||
blkptr_t *bp = &lr->lr_blkptr;
|
|
||||||
|
|
||||||
zgd->zgd_db = db;
|
|
||||||
zgd->zgd_bp = bp;
|
|
||||||
|
|
||||||
ASSERT(db->db_offset == offset);
|
|
||||||
ASSERT(db->db_size == size);
|
|
||||||
|
|
||||||
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
|
||||||
zfs_get_done, zgd);
|
|
||||||
ASSERT(error || lr->lr_length <= size);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* On success, we need to wait for the write I/O
|
|
||||||
* initiated by dmu_sync() to complete before we can
|
|
||||||
* release this dbuf. We will finish everything up
|
|
||||||
* in the zfs_get_done() callback.
|
|
||||||
*/
|
|
||||||
if (error == 0)
|
|
||||||
return (0);
|
|
||||||
|
|
||||||
if (error == EALREADY) {
|
|
||||||
lr->lr_common.lrc_txtype = TX_WRITE2;
|
|
||||||
/*
|
|
||||||
* TX_WRITE2 relies on the data previously
|
|
||||||
* written by the TX_WRITE that caused
|
|
||||||
* EALREADY. We zero out the BP because
|
|
||||||
* it is the old, currently-on-disk BP,
|
|
||||||
* so there's no need to zio_flush() its
|
|
||||||
* vdevs (flushing would needlesly hurt
|
|
||||||
* performance, and doesn't work on
|
|
||||||
* indirect vdevs).
|
|
||||||
*/
|
|
||||||
zgd->zgd_bp = NULL;
|
|
||||||
BP_ZERO(bp);
|
|
||||||
error = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
zfs_get_done(zgd, error);
|
|
||||||
|
|
||||||
return (error);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
|
|
@ -392,12 +392,6 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Drop a reference on the passed inode asynchronously. This ensures
|
|
||||||
* that the caller will never drop the last reference on an inode in
|
|
||||||
* the current context. Doing so while holding open a tx could result
|
|
||||||
* in a deadlock if iput_final() re-enters the filesystem code.
|
|
||||||
*/
|
|
||||||
void
|
void
|
||||||
zfs_zrele_async(znode_t *zp)
|
zfs_zrele_async(znode_t *zp)
|
||||||
{
|
{
|
||||||
|
@ -421,172 +415,6 @@ zfs_zrele_async(znode_t *zp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ARGSUSED */
|
|
||||||
static void
|
|
||||||
zfs_get_done(zgd_t *zgd, int error)
|
|
||||||
{
|
|
||||||
znode_t *zp = zgd->zgd_private;
|
|
||||||
|
|
||||||
if (zgd->zgd_db)
|
|
||||||
dmu_buf_rele(zgd->zgd_db, zgd);
|
|
||||||
|
|
||||||
zfs_rangelock_exit(zgd->zgd_lr);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Release the vnode asynchronously as we currently have the
|
|
||||||
* txg stopped from syncing.
|
|
||||||
*/
|
|
||||||
zfs_zrele_async(zp);
|
|
||||||
|
|
||||||
kmem_free(zgd, sizeof (zgd_t));
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef ZFS_DEBUG
|
|
||||||
static int zil_fault_io = 0;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Get data to generate a TX_WRITE intent log record.
|
|
||||||
*/
|
|
||||||
int
|
|
||||||
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
|
|
||||||
struct lwb *lwb, zio_t *zio)
|
|
||||||
{
|
|
||||||
zfsvfs_t *zfsvfs = arg;
|
|
||||||
objset_t *os = zfsvfs->z_os;
|
|
||||||
znode_t *zp;
|
|
||||||
uint64_t object = lr->lr_foid;
|
|
||||||
uint64_t offset = lr->lr_offset;
|
|
||||||
uint64_t size = lr->lr_length;
|
|
||||||
dmu_buf_t *db;
|
|
||||||
zgd_t *zgd;
|
|
||||||
int error = 0;
|
|
||||||
uint64_t zp_gen;
|
|
||||||
|
|
||||||
ASSERT3P(lwb, !=, NULL);
|
|
||||||
ASSERT3P(zio, !=, NULL);
|
|
||||||
ASSERT3U(size, !=, 0);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Nothing to do if the file has been removed
|
|
||||||
*/
|
|
||||||
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
|
||||||
return (SET_ERROR(ENOENT));
|
|
||||||
if (zp->z_unlinked) {
|
|
||||||
/*
|
|
||||||
* Release the vnode asynchronously as we currently have the
|
|
||||||
* txg stopped from syncing.
|
|
||||||
*/
|
|
||||||
zfs_zrele_async(zp);
|
|
||||||
return (SET_ERROR(ENOENT));
|
|
||||||
}
|
|
||||||
/* check if generation number matches */
|
|
||||||
if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
|
|
||||||
sizeof (zp_gen)) != 0) {
|
|
||||||
zfs_zrele_async(zp);
|
|
||||||
return (SET_ERROR(EIO));
|
|
||||||
}
|
|
||||||
if (zp_gen != gen) {
|
|
||||||
zfs_zrele_async(zp);
|
|
||||||
return (SET_ERROR(ENOENT));
|
|
||||||
}
|
|
||||||
|
|
||||||
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
|
||||||
zgd->zgd_lwb = lwb;
|
|
||||||
zgd->zgd_private = zp;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Write records come in two flavors: immediate and indirect.
|
|
||||||
* For small writes it's cheaper to store the data with the
|
|
||||||
* log record (immediate); for large writes it's cheaper to
|
|
||||||
* sync the data and get a pointer to it (indirect) so that
|
|
||||||
* we don't have to write the data twice.
|
|
||||||
*/
|
|
||||||
if (buf != NULL) { /* immediate write */
|
|
||||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
|
||||||
offset, size, RL_READER);
|
|
||||||
/* test for truncation needs to be done while range locked */
|
|
||||||
if (offset >= zp->z_size) {
|
|
||||||
error = SET_ERROR(ENOENT);
|
|
||||||
} else {
|
|
||||||
error = dmu_read(os, object, offset, size, buf,
|
|
||||||
DMU_READ_NO_PREFETCH);
|
|
||||||
}
|
|
||||||
ASSERT(error == 0 || error == ENOENT);
|
|
||||||
} else { /* indirect write */
|
|
||||||
/*
|
|
||||||
* Have to lock the whole block to ensure when it's
|
|
||||||
* written out and its checksum is being calculated
|
|
||||||
* that no one can change the data. We need to re-check
|
|
||||||
* blocksize after we get the lock in case it's changed!
|
|
||||||
*/
|
|
||||||
for (;;) {
|
|
||||||
uint64_t blkoff;
|
|
||||||
size = zp->z_blksz;
|
|
||||||
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
|
||||||
offset -= blkoff;
|
|
||||||
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
|
||||||
offset, size, RL_READER);
|
|
||||||
if (zp->z_blksz == size)
|
|
||||||
break;
|
|
||||||
offset += blkoff;
|
|
||||||
zfs_rangelock_exit(zgd->zgd_lr);
|
|
||||||
}
|
|
||||||
/* test for truncation needs to be done while range locked */
|
|
||||||
if (lr->lr_offset >= zp->z_size)
|
|
||||||
error = SET_ERROR(ENOENT);
|
|
||||||
#ifdef ZFS_DEBUG
|
|
||||||
if (zil_fault_io) {
|
|
||||||
error = SET_ERROR(EIO);
|
|
||||||
zil_fault_io = 0;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (error == 0)
|
|
||||||
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
|
||||||
DMU_READ_NO_PREFETCH);
|
|
||||||
|
|
||||||
if (error == 0) {
|
|
||||||
blkptr_t *bp = &lr->lr_blkptr;
|
|
||||||
|
|
||||||
zgd->zgd_db = db;
|
|
||||||
zgd->zgd_bp = bp;
|
|
||||||
|
|
||||||
ASSERT(db->db_offset == offset);
|
|
||||||
ASSERT(db->db_size == size);
|
|
||||||
|
|
||||||
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
|
||||||
zfs_get_done, zgd);
|
|
||||||
ASSERT(error || lr->lr_length <= size);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* On success, we need to wait for the write I/O
|
|
||||||
* initiated by dmu_sync() to complete before we can
|
|
||||||
* release this dbuf. We will finish everything up
|
|
||||||
* in the zfs_get_done() callback.
|
|
||||||
*/
|
|
||||||
if (error == 0)
|
|
||||||
return (0);
|
|
||||||
|
|
||||||
if (error == EALREADY) {
|
|
||||||
lr->lr_common.lrc_txtype = TX_WRITE2;
|
|
||||||
/*
|
|
||||||
* TX_WRITE2 relies on the data previously
|
|
||||||
* written by the TX_WRITE that caused
|
|
||||||
* EALREADY. We zero out the BP because
|
|
||||||
* it is the old, currently-on-disk BP.
|
|
||||||
*/
|
|
||||||
zgd->zgd_bp = NULL;
|
|
||||||
BP_ZERO(bp);
|
|
||||||
error = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
zfs_get_done(zgd, error);
|
|
||||||
|
|
||||||
return (error);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Lookup an entry in a directory, or an extended attribute directory.
|
* Lookup an entry in a directory, or an extended attribute directory.
|
||||||
* If it exists, return a held inode reference for it.
|
* If it exists, return a held inode reference for it.
|
||||||
|
|
|
@ -52,6 +52,8 @@
|
||||||
#include <sys/policy.h>
|
#include <sys/policy.h>
|
||||||
#include <sys/zfs_vnops.h>
|
#include <sys/zfs_vnops.h>
|
||||||
#include <sys/zfs_quota.h>
|
#include <sys/zfs_quota.h>
|
||||||
|
#include <sys/zfs_vfsops.h>
|
||||||
|
#include <sys/zfs_znode.h>
|
||||||
|
|
||||||
|
|
||||||
static ulong_t zfs_fsync_sync_cnt = 4;
|
static ulong_t zfs_fsync_sync_cnt = 4;
|
||||||
|
@ -727,6 +729,163 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
|
||||||
return (error);
|
return (error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ZFS_DEBUG
|
||||||
|
static int zil_fault_io = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void zfs_get_done(zgd_t *zgd, int error);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get data to generate a TX_WRITE intent log record.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
||||||
|
{
|
||||||
|
zfsvfs_t *zfsvfs = arg;
|
||||||
|
objset_t *os = zfsvfs->z_os;
|
||||||
|
znode_t *zp;
|
||||||
|
uint64_t object = lr->lr_foid;
|
||||||
|
uint64_t offset = lr->lr_offset;
|
||||||
|
uint64_t size = lr->lr_length;
|
||||||
|
dmu_buf_t *db;
|
||||||
|
zgd_t *zgd;
|
||||||
|
int error = 0;
|
||||||
|
|
||||||
|
ASSERT3P(lwb, !=, NULL);
|
||||||
|
ASSERT3P(zio, !=, NULL);
|
||||||
|
ASSERT3U(size, !=, 0);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Nothing to do if the file has been removed
|
||||||
|
*/
|
||||||
|
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
||||||
|
return (SET_ERROR(ENOENT));
|
||||||
|
if (zp->z_unlinked) {
|
||||||
|
/*
|
||||||
|
* Release the vnode asynchronously as we currently have the
|
||||||
|
* txg stopped from syncing.
|
||||||
|
*/
|
||||||
|
zfs_zrele_async(zp);
|
||||||
|
return (SET_ERROR(ENOENT));
|
||||||
|
}
|
||||||
|
|
||||||
|
zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
||||||
|
zgd->zgd_lwb = lwb;
|
||||||
|
zgd->zgd_private = zp;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write records come in two flavors: immediate and indirect.
|
||||||
|
* For small writes it's cheaper to store the data with the
|
||||||
|
* log record (immediate); for large writes it's cheaper to
|
||||||
|
* sync the data and get a pointer to it (indirect) so that
|
||||||
|
* we don't have to write the data twice.
|
||||||
|
*/
|
||||||
|
if (buf != NULL) { /* immediate write */
|
||||||
|
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||||
|
offset, size, RL_READER);
|
||||||
|
/* test for truncation needs to be done while range locked */
|
||||||
|
if (offset >= zp->z_size) {
|
||||||
|
error = SET_ERROR(ENOENT);
|
||||||
|
} else {
|
||||||
|
error = dmu_read(os, object, offset, size, buf,
|
||||||
|
DMU_READ_NO_PREFETCH);
|
||||||
|
}
|
||||||
|
ASSERT(error == 0 || error == ENOENT);
|
||||||
|
} else { /* indirect write */
|
||||||
|
/*
|
||||||
|
* Have to lock the whole block to ensure when it's
|
||||||
|
* written out and its checksum is being calculated
|
||||||
|
* that no one can change the data. We need to re-check
|
||||||
|
* blocksize after we get the lock in case it's changed!
|
||||||
|
*/
|
||||||
|
for (;;) {
|
||||||
|
uint64_t blkoff;
|
||||||
|
size = zp->z_blksz;
|
||||||
|
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
||||||
|
offset -= blkoff;
|
||||||
|
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
||||||
|
offset, size, RL_READER);
|
||||||
|
if (zp->z_blksz == size)
|
||||||
|
break;
|
||||||
|
offset += blkoff;
|
||||||
|
zfs_rangelock_exit(zgd->zgd_lr);
|
||||||
|
}
|
||||||
|
/* test for truncation needs to be done while range locked */
|
||||||
|
if (lr->lr_offset >= zp->z_size)
|
||||||
|
error = SET_ERROR(ENOENT);
|
||||||
|
#ifdef ZFS_DEBUG
|
||||||
|
if (zil_fault_io) {
|
||||||
|
error = SET_ERROR(EIO);
|
||||||
|
zil_fault_io = 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (error == 0)
|
||||||
|
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
||||||
|
DMU_READ_NO_PREFETCH);
|
||||||
|
|
||||||
|
if (error == 0) {
|
||||||
|
blkptr_t *bp = &lr->lr_blkptr;
|
||||||
|
|
||||||
|
zgd->zgd_db = db;
|
||||||
|
zgd->zgd_bp = bp;
|
||||||
|
|
||||||
|
ASSERT(db->db_offset == offset);
|
||||||
|
ASSERT(db->db_size == size);
|
||||||
|
|
||||||
|
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
||||||
|
zfs_get_done, zgd);
|
||||||
|
ASSERT(error || lr->lr_length <= size);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* On success, we need to wait for the write I/O
|
||||||
|
* initiated by dmu_sync() to complete before we can
|
||||||
|
* release this dbuf. We will finish everything up
|
||||||
|
* in the zfs_get_done() callback.
|
||||||
|
*/
|
||||||
|
if (error == 0)
|
||||||
|
return (0);
|
||||||
|
|
||||||
|
if (error == EALREADY) {
|
||||||
|
lr->lr_common.lrc_txtype = TX_WRITE2;
|
||||||
|
/*
|
||||||
|
* TX_WRITE2 relies on the data previously
|
||||||
|
* written by the TX_WRITE that caused
|
||||||
|
* EALREADY. We zero out the BP because
|
||||||
|
* it is the old, currently-on-disk BP.
|
||||||
|
*/
|
||||||
|
zgd->zgd_bp = NULL;
|
||||||
|
BP_ZERO(bp);
|
||||||
|
error = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
zfs_get_done(zgd, error);
|
||||||
|
|
||||||
|
return (error);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* ARGSUSED */
|
||||||
|
static void
|
||||||
|
zfs_get_done(zgd_t *zgd, int error)
|
||||||
|
{
|
||||||
|
znode_t *zp = zgd->zgd_private;
|
||||||
|
|
||||||
|
if (zgd->zgd_db)
|
||||||
|
dmu_buf_rele(zgd->zgd_db, zgd);
|
||||||
|
|
||||||
|
zfs_rangelock_exit(zgd->zgd_lr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Release the vnode asynchronously as we currently have the
|
||||||
|
* txg stopped from syncing.
|
||||||
|
*/
|
||||||
|
zfs_zrele_async(zp);
|
||||||
|
|
||||||
|
kmem_free(zgd, sizeof (zgd_t));
|
||||||
|
}
|
||||||
|
|
||||||
EXPORT_SYMBOL(zfs_access);
|
EXPORT_SYMBOL(zfs_access);
|
||||||
EXPORT_SYMBOL(zfs_fsync);
|
EXPORT_SYMBOL(zfs_fsync);
|
||||||
EXPORT_SYMBOL(zfs_holey);
|
EXPORT_SYMBOL(zfs_holey);
|
||||||
|
|
Loading…
Reference in New Issue