OpenZFS 6393 - zfs receive a full send as a clone

Authored by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Dan McDonald <danmcd@omniti.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/6393
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/68ecb2e
This commit is contained in:
Paul Dagnelie 2016-06-09 11:18:16 -07:00 committed by Brian Behlendorf
parent fd41e93563
commit e6d3a843d6
5 changed files with 131 additions and 64 deletions

View File

@ -24,7 +24,7 @@
*/ */
/* /*
* Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved. * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_DMU_IMPL_H #ifndef _SYS_DMU_IMPL_H
@ -268,7 +268,6 @@ typedef struct dmu_sendarg {
uint64_t dsa_toguid; uint64_t dsa_toguid;
int dsa_err; int dsa_err;
dmu_pendop_t dsa_pending_op; dmu_pendop_t dsa_pending_op;
boolean_t dsa_incremental;
uint64_t dsa_featureflags; uint64_t dsa_featureflags;
uint64_t dsa_last_data_object; uint64_t dsa_last_data_object;
uint64_t dsa_last_data_offset; uint64_t dsa_last_data_offset;

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_ZFS_IOCTL_H #ifndef _SYS_ZFS_IOCTL_H
@ -138,6 +138,16 @@ typedef enum dmu_send_resume_token_version {
#define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1) #define DRR_FLAG_CI_DATA (1<<1)
/*
* This send stream, if it is a full send, includes the FREE and FREEOBJECT
* records that are created by the sending process. This means that the send
* stream can be received as a clone, even though it is not an incremental.
* This is not implemented as a feature flag, because the receiving side does
* not need to have implemented it to receive this stream; it is fully backwards
* compatible. We need a flag, though, because full send streams without it
* cannot necessarily be received as a clone correctly.
*/
#define DRR_FLAG_FREERECORDS (1<<2)
/* /*
* flags in the drr_checksumflags field in the DRR_WRITE and * flags in the drr_checksumflags field in the DRR_WRITE and

View File

@ -22,7 +22,7 @@
.\" .\"
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org> .\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved. .\" Copyright (c) 2011, 2015 by Delphix. All rights reserved.
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
.\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright 2012 Nexenta Systems, Inc. All Rights Reserved.
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
@ -2991,7 +2991,12 @@ Discard all but the last element of the sent snapshot's file system name, using
.ad .ad
.sp .6 .sp .6
.RS 4n .RS 4n
Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin. Forces the stream to be received as a clone of the given snapshot.
If the stream is a full send stream, this will create the filesystem
described by the stream as a clone of the specified snapshot. Which
snapshot was specified will not affect the success or failure of the
receive, as long as the snapshot does exist. If the stream is an
incremental send stream, all the normal verification will be performed.
.RE .RE
.RE .RE

View File

@ -20,11 +20,10 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/ */
@ -173,6 +172,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
return (0); return (0);
} }
/*
* Fill in the drr_free struct, or perform aggregation if the previous record is
* also a free record, and the two are adjacent.
*
* Note that we send free records even for a full send, because we want to be
* able to receive a full send as a clone, which requires a list of all the free
* and freeobject records that were generated on the source.
*/
static int static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
uint64_t length) uint64_t length)
@ -196,15 +203,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
(object == dsp->dsa_last_data_object && (object == dsp->dsa_last_data_object &&
offset > dsp->dsa_last_data_offset)); offset > dsp->dsa_last_data_offset));
/*
* If we are doing a non-incremental send, then there can't
* be any data in the dataset we're receiving into. Therefore
* a free record would simply be a no-op. Save space by not
* sending it to begin with.
*/
if (!dsp->dsa_incremental)
return (0);
if (length != -1ULL && offset + length < offset) if (length != -1ULL && offset + length < offset)
length = -1ULL; length = -1ULL;
@ -382,10 +380,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{ {
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
/* See comment in dump_free(). */
if (!dsp->dsa_incremental)
return (0);
/* /*
* If there is a pending op, but it's not PENDING_FREEOBJECTS, * If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for * push it out, since free block aggregation can only be done for
@ -796,6 +790,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
if (ancestor_zb != NULL) { if (ancestor_zb != NULL) {
drr->drr_u.drr_begin.drr_fromguid = drr->drr_u.drr_begin.drr_fromguid =
@ -818,7 +813,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
dsp->dsa_off = off; dsp->dsa_off = off;
dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_pending_op = PENDING_NONE;
dsp->dsa_incremental = (ancestor_zb != NULL);
dsp->dsa_featureflags = featureflags; dsp->dsa_featureflags = featureflags;
dsp->dsa_resume_object = resumeobj; dsp->dsa_resume_object = resumeobj;
dsp->dsa_resume_offset = resumeoff; dsp->dsa_resume_offset = resumeoff;
@ -1336,7 +1330,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* target fs already exists; recv into temp clone */ /* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */ /* Can't recv a clone into an existing fs */
if (flags & DRR_FLAG_CLONE) { if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
} }
@ -1355,6 +1349,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
drba->drba_origin)) drba->drba_origin))
return (SET_ERROR(ENOENT)); return (SET_ERROR(ENOENT));
/*
* If we're receiving a full send as a clone, and it doesn't
* contain all the necessary free records and freeobject
* records, reject it.
*/
if (fromguid == 0 && drba->drba_origin &&
!(flags & DRR_FLAG_FREERECORDS))
return (SET_ERROR(EINVAL));
/* Open the parent of tofs */ /* Open the parent of tofs */
ASSERT3U(strlen(tofs), <, MAXNAMELEN); ASSERT3U(strlen(tofs), <, MAXNAMELEN);
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
@ -1394,7 +1397,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
} }
if (dsl_dataset_phys(origin)->ds_guid != fromguid) { if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
fromguid != 0) {
dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENODEV)); return (SET_ERROR(ENODEV));
@ -1724,6 +1728,20 @@ struct receive_writer_arg {
uint64_t bytes_read; /* bytes read when current record created */ uint64_t bytes_read; /* bytes read when current record created */
}; };
/*
 * A list of object numbers. Entries are appended at the tail by
 * objlist_insert() and pruned from the head by objlist_exists(), so both
 * insertions and lookups are expected to happen in ascending order.
 */
struct objlist {
list_t list; /* List of struct receive_objnode. */
/*
 * Last object looked up. Used to assert that objects are being looked
 * up in ascending order.
 */
uint64_t last_lookup;
};
/* One entry of a struct objlist: a single object number. */
struct receive_objnode {
list_node_t node; /* Linkage into objlist's list. */
uint64_t object; /* Object number recorded by this entry. */
};
struct receive_arg { struct receive_arg {
objset_t *os; objset_t *os;
vnode_t *vp; /* The vnode to read the stream from */ vnode_t *vp; /* The vnode to read the stream from */
@ -1741,12 +1759,7 @@ struct receive_arg {
int err; int err;
boolean_t byteswap; boolean_t byteswap;
/* Sorted list of objects not to issue prefetches for. */ /* Sorted list of objects not to issue prefetches for. */
list_t ignore_obj_list; struct objlist ignore_objlist;
};
struct receive_ign_obj_node {
list_node_t node;
uint64_t object;
}; };
typedef struct guid_map_entry { typedef struct guid_map_entry {
@ -2063,13 +2076,14 @@ receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo) struct drr_freeobjects *drrfo)
{ {
uint64_t obj; uint64_t obj;
int next_err = 0;
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
(void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
dmu_object_info_t doi; dmu_object_info_t doi;
int err; int err;
@ -2085,7 +2099,8 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (err != 0) if (err != 0)
return (err); return (err);
} }
if (next_err != ESRCH)
return (next_err);
return (0); return (0);
} }
@ -2415,6 +2430,70 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
return (0); return (0);
} }
/*
 * Prepare an objlist for use: no lookups have happened yet, and the list of
 * receive_objnode entries starts out empty.
 */
static void
objlist_create(struct objlist *list)
{
	list->last_lookup = 0;
	list_create(&list->list, sizeof (struct receive_objnode),
	    offsetof(struct receive_objnode, node));
}
/*
 * Tear down an objlist: free every entry still on the list, then destroy
 * the list itself.
 */
static void
objlist_destroy(struct objlist *list)
{
	struct receive_objnode *entry;

	while ((entry = list_remove_head(&list->list)) != NULL)
		kmem_free(entry, sizeof (*entry));

	list_destroy(&list->list);
}
/*
 * Report whether the given object number is present in the objlist.
 *
 * As a side effect, every entry at the head of the list whose object number
 * is smaller than the one requested is removed and freed. Consequently a
 * lookup for an object number smaller than one already looked up will always
 * report false, so callers must perform their lookups in ascending order.
 */
static boolean_t
objlist_exists(struct objlist *list, uint64_t object)
{
	struct receive_objnode *cur;

	ASSERT3U(object, >=, list->last_lookup);
	list->last_lookup = object;

	/* Discard leading entries that precede the requested object. */
	for (cur = list_head(&list->list);
	    cur != NULL && cur->object < object;
	    cur = list_head(&list->list)) {
		VERIFY3P(cur, ==, list_remove_head(&list->list));
		kmem_free(cur, sizeof (*cur));
	}

	return (cur != NULL && cur->object == object);
}
/*
 * Add an object number to the objlist.
 *
 * The objlist holds object numbers in ascending order, but insertion does not
 * search for the right position; for simplicity the new entry is simply
 * appended to the tail. Callers are therefore responsible for inserting
 * object numbers in ascending order only.
 */
static void
objlist_insert(struct objlist *list, uint64_t object)
{
	struct receive_objnode *entry;

	entry = kmem_zalloc(sizeof (*entry), KM_SLEEP);
	entry->object = object;
#ifdef ZFS_DEBUG
	{
		/* An empty list is treated as if its tail were object 0. */
		struct receive_objnode *tail = list_tail(&list->list);
		uint64_t tail_objnum = 0;

		if (tail != NULL)
			tail_objnum = tail->object;
		ASSERT3U(entry->object, >, tail_objnum);
	}
#endif
	list_insert_tail(&list->list, entry);
}
/* /*
* Issue the prefetch reads for any necessary indirect blocks. * Issue the prefetch reads for any necessary indirect blocks.
* *
@ -2437,13 +2516,7 @@ static void
receive_read_prefetch(struct receive_arg *ra, receive_read_prefetch(struct receive_arg *ra,
uint64_t object, uint64_t offset, uint64_t length) uint64_t object, uint64_t offset, uint64_t length)
{ {
struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); if (!objlist_exists(&ra->ignore_objlist, object)) {
while (node != NULL && node->object < object) {
VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
kmem_free(node, sizeof (*node));
node = list_head(&ra->ignore_obj_list);
}
if (node == NULL || node->object > object) {
dmu_prefetch(ra->os, object, 1, offset, length, dmu_prefetch(ra->os, object, 1, offset, length,
ZIO_PRIORITY_SYNC_READ); ZIO_PRIORITY_SYNC_READ);
} }
@ -2476,20 +2549,7 @@ receive_read_record(struct receive_arg *ra)
*/ */
if (err == ENOENT || if (err == ENOENT ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
struct receive_ign_obj_node *node = objlist_insert(&ra->ignore_objlist, drro->drr_object);
kmem_zalloc(sizeof (*node),
KM_SLEEP);
node->object = drro->drr_object;
#ifdef ZFS_DEBUG
{
struct receive_ign_obj_node *last_object =
list_tail(&ra->ignore_obj_list);
uint64_t last_objnum = (last_object != NULL ?
last_object->object : 0);
ASSERT3U(node->object, >, last_objnum);
}
#endif
list_insert_tail(&ra->ignore_obj_list, node);
err = 0; err = 0;
} }
return (err); return (err);
@ -2706,7 +2766,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
return (0); return (0);
} }
/* /*
* Read in the stream's records, one by one, and apply them to the pool. There * Read in the stream's records, one by one, and apply them to the pool. There
* are two threads involved; the thread that calls this function will spin up a * are two threads involved; the thread that calls this function will spin up a
@ -2727,7 +2786,6 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
struct receive_arg *ra; struct receive_arg *ra;
struct receive_writer_arg *rwa; struct receive_writer_arg *rwa;
int featureflags; int featureflags;
struct receive_ign_obj_node *n;
uint32_t payloadlen; uint32_t payloadlen;
void *payload; void *payload;
nvlist_t *begin_nvl = NULL; nvlist_t *begin_nvl = NULL;
@ -2746,8 +2804,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
sizeof (ra->bytes_read), 1, &ra->bytes_read); sizeof (ra->bytes_read), 1, &ra->bytes_read);
} }
list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node), objlist_create(&ra->ignore_objlist);
offsetof(struct receive_ign_obj_node, node));
/* these were verified in dmu_recv_begin */ /* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@ -2901,12 +2958,7 @@ out:
} }
*voffp = ra->voff; *voffp = ra->voff;
objlist_destroy(&ra->ignore_objlist);
for (n = list_remove_head(&ra->ignore_obj_list); n != NULL;
n = list_remove_head(&ra->ignore_obj_list)) {
kmem_free(n, sizeof (*n));
}
list_destroy(&ra->ignore_obj_list);
kmem_free(ra, sizeof (*ra)); kmem_free(ra, sizeof (*ra));
kmem_free(rwa, sizeof (*rwa)); kmem_free(rwa, sizeof (*rwa));
return (err); return (err);

View File

@ -152,7 +152,8 @@ tests = []
[tests/functional/cli_root/zfs_receive] [tests/functional/cli_root/zfs_receive]
tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_005_neg', 'zfs_receive_006_pos',
'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg'] 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
'zfs_receive_010_pos']
# DISABLED: # DISABLED:
# zfs_rename_002_pos - needs investigation # zfs_rename_002_pos - needs investigation