Illumos #3834
3834 incremental replication of 'holey' file systems is slow

Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

References:
  https://www.illumos.org/issues/3834
  illumos/illumos-gate@ca48f36f20

Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775
This commit is contained in:
parent
2883cad5b7
commit
ea97f8ce35
|
@ -21,7 +21,10 @@
|
||||||
/*
|
/*
|
||||||
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
|
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
|
||||||
* Use is subject to license terms.
|
* Use is subject to license terms.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _SYS_DMU_IMPL_H
|
#ifndef _SYS_DMU_IMPL_H
|
||||||
|
@ -265,6 +268,9 @@ typedef struct dmu_sendarg {
|
||||||
uint64_t dsa_toguid;
|
uint64_t dsa_toguid;
|
||||||
int dsa_err;
|
int dsa_err;
|
||||||
dmu_pendop_t dsa_pending_op;
|
dmu_pendop_t dsa_pending_op;
|
||||||
|
boolean_t dsa_incremental;
|
||||||
|
uint64_t dsa_last_data_object;
|
||||||
|
uint64_t dsa_last_data_offset;
|
||||||
} dmu_sendarg_t;
|
} dmu_sendarg_t;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||||
* Copyright (c) 2012 by Delphix. All rights reserved.
|
* Copyright (c) 2013 by Delphix. All rights reserved.
|
||||||
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
|
||||||
*/
|
*/
|
||||||
|
@ -63,5 +63,6 @@ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
|
||||||
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
|
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
|
||||||
int cleanup_fd, uint64_t *action_handlep);
|
int cleanup_fd, uint64_t *action_handlep);
|
||||||
int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
|
int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
|
||||||
|
boolean_t dmu_objset_is_receiving(objset_t *os);
|
||||||
|
|
||||||
#endif /* _DMU_SEND_H */
|
#endif /* _DMU_SEND_H */
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
#include <sys/zfs_context.h>
|
#include <sys/zfs_context.h>
|
||||||
#include <sys/arc.h>
|
#include <sys/arc.h>
|
||||||
#include <sys/dmu.h>
|
#include <sys/dmu.h>
|
||||||
|
#include <sys/dmu_send.h>
|
||||||
#include <sys/dmu_impl.h>
|
#include <sys/dmu_impl.h>
|
||||||
#include <sys/dbuf.h>
|
#include <sys/dbuf.h>
|
||||||
#include <sys/dmu_objset.h>
|
#include <sys/dmu_objset.h>
|
||||||
|
@ -846,9 +847,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
|
||||||
/*
|
/*
|
||||||
* Evict (if its unreferenced) or clear (if its referenced) any level-0
|
* Evict (if its unreferenced) or clear (if its referenced) any level-0
|
||||||
* data blocks in the free range, so that any future readers will find
|
* data blocks in the free range, so that any future readers will find
|
||||||
* empty blocks. Also, if we happen accross any level-1 dbufs in the
|
* empty blocks. Also, if we happen across any level-1 dbufs in the
|
||||||
* range that have not already been marked dirty, mark them dirty so
|
* range that have not already been marked dirty, mark them dirty so
|
||||||
* they stay in memory.
|
* they stay in memory.
|
||||||
|
*
|
||||||
|
* This is a no-op if the dataset is in the middle of an incremental
|
||||||
|
* receive; see comment below for details.
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
|
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
|
||||||
|
@ -864,6 +868,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
|
||||||
last_l1 = end >> epbs;
|
last_l1 = end >> epbs;
|
||||||
}
|
}
|
||||||
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
|
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
|
||||||
|
|
||||||
|
if (dmu_objset_is_receiving(dn->dn_objset)) {
|
||||||
|
/*
|
||||||
|
* When processing a free record from a zfs receive,
|
||||||
|
* there should have been no previous modifications to the
|
||||||
|
* data in this range. Therefore there should be no dbufs
|
||||||
|
* in the range. Searching dn_dbufs for these non-existent
|
||||||
|
* dbufs can be very expensive, so simply ignore this.
|
||||||
|
*/
|
||||||
|
VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
|
||||||
|
VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
mutex_enter(&dn->dn_dbufs_mtx);
|
mutex_enter(&dn->dn_dbufs_mtx);
|
||||||
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
|
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
|
||||||
db_next = list_next(&dn->dn_dbufs, db);
|
db_next = list_next(&dn->dn_dbufs, db);
|
||||||
|
|
|
@ -109,6 +109,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
|
||||||
{
|
{
|
||||||
struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
|
struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When we receive a free record, dbuf_free_range() assumes
|
||||||
|
* that the receiving system doesn't have any dbufs in the range
|
||||||
|
* being freed. This is always true because there is a one-record
|
||||||
|
* constraint: we only send one WRITE record for any given
|
||||||
|
* object+offset. We know that the one-record constraint is
|
||||||
|
* true because we always send data in increasing order by
|
||||||
|
* object,offset.
|
||||||
|
*
|
||||||
|
* If the increasing-order constraint ever changes, we should find
|
||||||
|
* another way to assert that the one-record constraint is still
|
||||||
|
* satisfied.
|
||||||
|
*/
|
||||||
|
ASSERT(object > dsp->dsa_last_data_object ||
|
||||||
|
(object == dsp->dsa_last_data_object &&
|
||||||
|
offset > dsp->dsa_last_data_offset));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are doing a non-incremental send, then there can't
|
||||||
|
* be any data in the dataset we're receiving into. Therefore
|
||||||
|
* a free record would simply be a no-op. Save space by not
|
||||||
|
* sending it to begin with.
|
||||||
|
*/
|
||||||
|
if (!dsp->dsa_incremental)
|
||||||
|
return (0);
|
||||||
|
|
||||||
if (length != -1ULL && offset + length < offset)
|
if (length != -1ULL && offset + length < offset)
|
||||||
length = -1ULL;
|
length = -1ULL;
|
||||||
|
|
||||||
|
@ -175,6 +201,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
|
||||||
{
|
{
|
||||||
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
|
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We send data in increasing object, offset order.
|
||||||
|
* See comment in dump_free() for details.
|
||||||
|
*/
|
||||||
|
ASSERT(object > dsp->dsa_last_data_object ||
|
||||||
|
(object == dsp->dsa_last_data_object &&
|
||||||
|
offset > dsp->dsa_last_data_offset));
|
||||||
|
dsp->dsa_last_data_object = object;
|
||||||
|
dsp->dsa_last_data_offset = offset + blksz - 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If there is any kind of pending aggregation (currently either
|
* If there is any kind of pending aggregation (currently either
|
||||||
|
@ -242,6 +277,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
|
||||||
{
|
{
|
||||||
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
|
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
|
||||||
|
|
||||||
|
/* See comment in dump_free(). */
|
||||||
|
if (!dsp->dsa_incremental)
|
||||||
|
return (0);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
|
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
|
||||||
* push it out, since free block aggregation can only be done for
|
* push it out, since free block aggregation can only be done for
|
||||||
|
@ -318,9 +357,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
|
||||||
if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
|
if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
|
||||||
return (SET_ERROR(EINTR));
|
return (SET_ERROR(EINTR));
|
||||||
|
|
||||||
/* free anything past the end of the file */
|
/* Free anything past the end of the file. */
|
||||||
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
|
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
|
||||||
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
|
(dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
|
||||||
return (SET_ERROR(EINTR));
|
return (SET_ERROR(EINTR));
|
||||||
if (dsp->dsa_err != 0)
|
if (dsp->dsa_err != 0)
|
||||||
return (SET_ERROR(EINTR));
|
return (SET_ERROR(EINTR));
|
||||||
|
@ -503,6 +542,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
|
||||||
dsp->dsa_toguid = ds->ds_phys->ds_guid;
|
dsp->dsa_toguid = ds->ds_phys->ds_guid;
|
||||||
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
|
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
|
||||||
dsp->dsa_pending_op = PENDING_NONE;
|
dsp->dsa_pending_op = PENDING_NONE;
|
||||||
|
dsp->dsa_incremental = (fromtxg != 0);
|
||||||
|
|
||||||
mutex_enter(&ds->ds_sendstream_lock);
|
mutex_enter(&ds->ds_sendstream_lock);
|
||||||
list_insert_head(&ds->ds_sendstreams, dsp);
|
list_insert_head(&ds->ds_sendstreams, dsp);
|
||||||
|
@ -1799,3 +1839,13 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
|
||||||
else
|
else
|
||||||
return (dmu_recv_existing_end(drc));
|
return (dmu_recv_existing_end(drc));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return TRUE if this objset is currently being received into, i.e. it has a dataset (os_dsl_dataset != NULL) whose ds_owner is dmu_recv_tag; an objset with no dataset yields FALSE.
|
||||||
|
*/
|
||||||
|
boolean_t
|
||||||
|
dmu_objset_is_receiving(objset_t *os)
|
||||||
|
{
|
||||||
|
return (os->os_dsl_dataset != NULL &&
|
||||||
|
os->os_dsl_dataset->ds_owner == dmu_recv_tag);
|
||||||
|
}
|
||||||
|
|
|
@ -604,8 +604,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
|
||||||
{
|
{
|
||||||
dmu_tx_hold_t *txh;
|
dmu_tx_hold_t *txh;
|
||||||
dnode_t *dn;
|
dnode_t *dn;
|
||||||
uint64_t start, end, i;
|
int err;
|
||||||
int err, shift;
|
|
||||||
zio_t *zio;
|
zio_t *zio;
|
||||||
|
|
||||||
ASSERT(tx->tx_txg == 0);
|
ASSERT(tx->tx_txg == 0);
|
||||||
|
@ -616,30 +615,45 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
|
||||||
return;
|
return;
|
||||||
dn = txh->txh_dnode;
|
dn = txh->txh_dnode;
|
||||||
|
|
||||||
/* first block */
|
|
||||||
if (off != 0)
|
|
||||||
dmu_tx_count_write(txh, off, 1);
|
|
||||||
/* last block */
|
|
||||||
if (len != DMU_OBJECT_END)
|
|
||||||
dmu_tx_count_write(txh, off+len, 1);
|
|
||||||
|
|
||||||
dmu_tx_count_dnode(txh);
|
|
||||||
|
|
||||||
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
|
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
|
||||||
return;
|
return;
|
||||||
if (len == DMU_OBJECT_END)
|
if (len == DMU_OBJECT_END)
|
||||||
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
|
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
|
||||||
|
|
||||||
|
dmu_tx_count_dnode(txh);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For i/o error checking, read the first and last level-0
|
* For i/o error checking, we read the first and last level-0
|
||||||
* blocks, and all the level-1 blocks. The above count_write's
|
* blocks if they are not aligned, and all the level-1 blocks.
|
||||||
* have already taken care of the level-0 blocks.
|
*
|
||||||
|
* Note: dbuf_free_range() assumes that we have not instantiated
|
||||||
|
* any level-0 dbufs that will be completely freed. Therefore we must
|
||||||
|
* exercise care to not read or count the first and last blocks
|
||||||
|
* if they are blocksize-aligned.
|
||||||
|
*/
|
||||||
|
if (dn->dn_datablkshift == 0) {
|
||||||
|
dmu_tx_count_write(txh, off, len);
|
||||||
|
} else {
|
||||||
|
/* first block will be modified if it is not aligned */
|
||||||
|
if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
|
||||||
|
dmu_tx_count_write(txh, off, 1);
|
||||||
|
/* last block will be modified if it is not aligned */
|
||||||
|
if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
|
||||||
|
dmu_tx_count_write(txh, off+len, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check level-1 blocks.
|
||||||
*/
|
*/
|
||||||
if (dn->dn_nlevels > 1) {
|
if (dn->dn_nlevels > 1) {
|
||||||
shift = dn->dn_datablkshift + dn->dn_indblkshift -
|
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
|
||||||
SPA_BLKPTRSHIFT;
|
SPA_BLKPTRSHIFT;
|
||||||
start = off >> shift;
|
uint64_t start = off >> shift;
|
||||||
end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
|
uint64_t end = (off + len) >> shift;
|
||||||
|
uint64_t i;
|
||||||
|
|
||||||
|
ASSERT(dn->dn_datablkshift != 0);
|
||||||
|
ASSERT(dn->dn_indblkshift != 0);
|
||||||
|
|
||||||
zio = zio_root(tx->tx_pool->dp_spa,
|
zio = zio_root(tx->tx_pool->dp_spa,
|
||||||
NULL, NULL, ZIO_FLAG_CANFAIL);
|
NULL, NULL, ZIO_FLAG_CANFAIL);
|
||||||
|
|
Loading…
Reference in New Issue