Add `zstream redup` command to convert deduplicated send streams
Deduplicated send and receive is deprecated. To ease migration to the
new dedup-send-less world, this commit adds a `zstream redup` utility to
convert deduplicated send streams to normal streams, so that they can
continue to be received indefinitely.
The new `zstream` command also replaces the functionality of
`zstreamdump`, by way of the `zstream dump` subcommand. The
`zstreamdump` command is replaced by a shell script which invokes
`zstream dump`.
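For example, a script that previously ran `zstreamdump -v < stream`
keeps working unchanged, or can be migrated to the equivalent
(illustrative invocation; see the man page for the full option list):

    zstream dump -v < stream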
Under the hood, `zstream redup` works as follows: as we read the
send stream, we build up a hash table which maps from `<GUID, object,
offset> -> <file_offset>`.
Whenever we see a WRITE record, we add a new entry to the hash table,
which indicates where in the stream file to find the WRITE record for
this block. (The key is `drr_toguid, drr_object, drr_offset`.)
For entries other than WRITE_BYREF, we pass them through unchanged
(except for the running checksum, which is recalculated).
For WRITE_BYREF records, we change them to WRITE records. We find the
referenced WRITE record by looking in the hash table (for the record
with key `drr_refguid, drr_refobject, drr_refoffset`), and then reading
the record header and payload from the specified offset in the stream
file. This is why the stream cannot be a pipe. The found WRITE record
replaces the WRITE_BYREF record, with its `drr_toguid`, `drr_object`,
and `drr_offset` fields changed to be the same as the WRITE_BYREF's
(i.e. we are writing the same logical block, but with the data supplied
by the previous WRITE record).
This algorithm requires memory proportional to the number of WRITE
records (same as `zfs send -D`), but the size per WRITE record is
relatively low (40 bytes, vs. 72 for `zfs send -D`). A 1TB send stream
with 8KB blocks (`recordsize=8k`) would use around 5GB of RAM to
"redup".
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10124
Closes #10156
/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2020 by Delphix. All rights reserved.
 */

#include <assert.h>
#include <cityhash.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <libzfs.h>
#include <libzutil.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <umem.h>
#include <unistd.h>
#include <sys/debug.h>
#include <sys/stat.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include "zfs_fletcher.h"
#include "zstream.h"

#define MAX_RDT_PHYSMEM_PERCENT 20
#define SMALLEST_POSSIBLE_MAX_RDT_MB 128
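
/*
 * The redup table is a chained hash table: for each <guid, object,
 * offset> key it records the byte offset in the input file at which
 * the corresponding WRITE record begins.
 */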
typedef struct redup_entry {
	struct redup_entry *rde_next;
	uint64_t rde_guid;
	uint64_t rde_object;
	uint64_t rde_offset;
	uint64_t rde_stream_offset;
} redup_entry_t;

typedef struct redup_table {
	redup_entry_t **redup_hash_array;
	umem_cache_t *ddecache;
	uint64_t ddt_count;
	int numhashbits;
} redup_table_t;
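
/*
 * Return the 1-based index of the highest set bit of i, or 0 if i is 0.
 */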
int
highbit64(uint64_t i)
{
	if (i == 0)
		return (0);

	return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
}
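
/*
 * Safe version of calloc(), exits on allocation failure.
 */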
void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);
	if (rv == NULL) {
		fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}

/*
 * Safe version of fread(), exits on error.
 */
int
sfread(void *buf, size_t size, FILE *fp)
{
	int rv = fread(buf, size, 1, fp);
	if (rv == 0 && ferror(fp)) {
		(void) fprintf(stderr, "Error while reading file: %s\n",
		    strerror(errno));
		exit(1);
	}
	return (rv);
}

/*
 * Safe version of pread(), exits on error.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	ssize_t err = pread(fd, buf, count, offset);
	if (err == -1) {
		(void) fprintf(stderr,
		    "Error while reading file: %s\n",
		    strerror(errno));
		exit(1);
	} else if (err != count) {
		(void) fprintf(stderr,
		    "Error while reading file: short read\n");
		exit(1);
	}
}
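
/*
 * Write one record (header plus optional payload) to outfd, folding
 * both into the running fletcher-4 checksum and storing the checksum
 * accumulated so far in each non-BEGIN record header.
 */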
static int
dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
    zio_cksum_t *zc, int outfd)
{
	assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
	    == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	fletcher_4_incremental_native(drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
	if (drr->drr_type != DRR_BEGIN) {
		assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
		    drr_checksum.drr_checksum));
		drr->drr_u.drr_checksum.drr_checksum = *zc;
	}
	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), zc);
	if (write(outfd, drr, sizeof (*drr)) == -1)
		return (errno);
	if (payload_len != 0) {
		fletcher_4_incremental_native(payload, payload_len, zc);
		if (write(outfd, payload, payload_len) == -1)
			return (errno);
	}
	return (0);
}
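
/*
 * Remember that the WRITE record for <guid, object, offset> begins at
 * stream_offset bytes into the input file.
 */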
static void
rdt_insert(redup_table_t *rdt,
    uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
{
	uint64_t ch = cityhash4(guid, object, offset, 0);
	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
	redup_entry_t **rdepp;

	rdepp = &(rdt->redup_hash_array[hashcode]);
	redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
	rde->rde_next = *rdepp;
	rde->rde_guid = guid;
	rde->rde_object = object;
	rde->rde_offset = offset;
	rde->rde_stream_offset = stream_offset;
	*rdepp = rde;
	rdt->ddt_count++;
}
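
/*
 * Retrieve the stream offset recorded for <guid, object, offset>;
 * asserts that a matching entry exists.
 */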
static void
rdt_lookup(redup_table_t *rdt,
    uint64_t guid, uint64_t object, uint64_t offset,
    uint64_t *stream_offsetp)
{
	uint64_t ch = cityhash4(guid, object, offset, 0);
	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);

	for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
	    rde != NULL; rde = rde->rde_next) {
		if (rde->rde_guid == guid &&
		    rde->rde_object == object &&
		    rde->rde_offset == offset) {
			*stream_offsetp = rde->rde_stream_offset;
			return;
		}
	}
	assert(!"could not find expected redup table entry");
}

/*
 * Convert a dedup stream (generated by "zfs send -D") to a
 * non-deduplicated stream. The entire infd will be converted, including
 * any substreams in a stream package (generated by "zfs send -RD"). The
 * infd must be seekable.
 */
static void
zfs_redup_stream(int infd, int outfd, boolean_t verbose)
{
	int bufsz = SPA_MAXBLOCKSIZE;
	dmu_replay_record_t thedrr;
	dmu_replay_record_t *drr = &thedrr;
	redup_table_t rdt;
	zio_cksum_t stream_cksum;
	uint64_t numbuckets;
	uint64_t num_records = 0;
	uint64_t num_write_byref_records = 0;

	/*
	 * thedrr contains a union, so zero it with memset() rather than
	 * an initializer; per C99 an initializer is only guaranteed to
	 * zero the first member of the union.
	 */
	memset(&thedrr, 0, sizeof (dmu_replay_record_t));

#ifdef _ILP32
	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
#else
	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
	uint64_t max_rde_size =
	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
#endif

	numbuckets = max_rde_size / (sizeof (redup_entry_t));

	/*
	 * numbuckets must be a power of 2. Increase number to
	 * a power of 2 if necessary.
	 */
	if (!ISP2(numbuckets))
		numbuckets = 1ULL << highbit64(numbuckets);

	rdt.redup_hash_array =
	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);
	rdt.numhashbits = highbit64(numbuckets) - 1;
	rdt.ddt_count = 0;

	char *buf = safe_calloc(bufsz);
	FILE *ofp = fdopen(infd, "r");
	long offset = ftell(ofp);
	int begin = 0;
	boolean_t seen = B_FALSE;
	while (sfread(drr, sizeof (*drr), ofp) != 0) {
		num_records++;

		/*
		 * We need to regenerate the checksum.
		 */
		if (drr->drr_type != DRR_BEGIN) {
			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
		}

		uint64_t payload_size = 0;
		switch (drr->drr_type) {
		case DRR_BEGIN:
		{
			struct drr_begin *drrb = &drr->drr_u.drr_begin;
			int fflags;
			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
			VERIFY0(begin++);
			seen = B_TRUE;

			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);

			/* clear the DEDUP feature flag for this stream */
			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
			    DMU_BACKUP_FEATURE_DEDUPPROPS);
			/* cppcheck-suppress syntaxError */
			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);

			/* reject unreasonably large payloads (cap: 256MB) */
			uint32_t sz = drr->drr_payloadlen;

			VERIFY3U(sz, <=, 1U << 28);

			if (sz != 0) {
				if (sz > bufsz) {
					free(buf);
					buf = safe_calloc(sz);
					bufsz = sz;
				}
				(void) sfread(buf, sz, ofp);
			}
			payload_size = sz;
			break;
		}

		case DRR_END:
		{
			struct drr_end *drre = &drr->drr_u.drr_end;
			/*
			 * We would prefer to just check begin == 0, but
			 * replication streams have an end of stream END
			 * record, so we must avoid tripping it.
			 */
			VERIFY3B(seen, ==, B_TRUE);
			begin--;
			/*
			 * Use the recalculated checksum, unless this is
			 * the END record of a stream package, which has
			 * no checksum.
			 */
			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
				drre->drr_checksum = stream_cksum;
			break;
		}

		case DRR_OBJECT:
		{
			struct drr_object *drro = &drr->drr_u.drr_object;
			VERIFY3S(begin, ==, 1);

			if (drro->drr_bonuslen > 0) {
				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
				(void) sfread(buf, payload_size, ofp);
			}
			break;
		}

		case DRR_SPILL:
		{
			struct drr_spill *drrs = &drr->drr_u.drr_spill;
			VERIFY3S(begin, ==, 1);
			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
			(void) sfread(buf, payload_size, ofp);
			break;
		}

		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwb =
			    drr->drr_u.drr_write_byref;
			VERIFY3S(begin, ==, 1);

			num_write_byref_records++;

			/*
			 * Look up in hash table by drrwb->drr_refguid,
			 * drr_refobject, drr_refoffset. Replace this
			 * record with the found WRITE record, but with
			 * drr_object,drr_offset,drr_toguid replaced with ours.
			 */
			uint64_t stream_offset = 0;
			rdt_lookup(&rdt, drrwb.drr_refguid,
			    drrwb.drr_refobject, drrwb.drr_refoffset,
			    &stream_offset);

			spread(infd, drr, sizeof (*drr), stream_offset);

			assert(drr->drr_type == DRR_WRITE);
			struct drr_write *drrw = &drr->drr_u.drr_write;
			assert(drrw->drr_toguid == drrwb.drr_refguid);
			assert(drrw->drr_object == drrwb.drr_refobject);
			assert(drrw->drr_offset == drrwb.drr_refoffset);

			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
			spread(infd, buf, payload_size,
			    stream_offset + sizeof (*drr));

			drrw->drr_toguid = drrwb.drr_toguid;
			drrw->drr_object = drrwb.drr_object;
			drrw->drr_offset = drrwb.drr_offset;
			break;
		}

		case DRR_WRITE:
		{
			struct drr_write *drrw = &drr->drr_u.drr_write;
			VERIFY3S(begin, ==, 1);
			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
			(void) sfread(buf, payload_size, ofp);

			rdt_insert(&rdt, drrw->drr_toguid,
			    drrw->drr_object, drrw->drr_offset, offset);
			break;
		}
|
|
|
|
|
|
|
|
case DRR_WRITE_EMBEDDED:
|
|
|
|
{
|
|
|
|
struct drr_write_embedded *drrwe =
|
|
|
|
&drr->drr_u.drr_write_embedded;
|
2022-12-12 18:40:05 +00:00
|
|
|
VERIFY3S(begin, ==, 1);
|
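			/*
			 * The embedded payload is stored rounded up to an
			 * 8-byte boundary in the stream.
			 */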
			payload_size =
			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
			(void) sfread(buf, payload_size, ofp);
			break;
		}

		case DRR_FREEOBJECTS:
		case DRR_FREE:
		case DRR_OBJECT_RANGE:
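			/*
			 * These record types carry no payload; just check
			 * that we are inside a stream.
			 */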
			VERIFY3S(begin, ==, 1);
			break;

		default:
			(void) fprintf(stderr, "INVALID record type 0x%x\n",
			    drr->drr_type);
			/* should never happen, so assert */
			assert(B_FALSE);
		}
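
		/* A truncated or unreadable input file is fatal. */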
		if (feof(ofp)) {
			fprintf(stderr, "Error: unexpected end-of-file\n");
			exit(1);
		}
		if (ferror(ofp)) {
			fprintf(stderr, "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}

		/*
		 * We need to recalculate the checksum, and it needs to be
		 * initially zero to do that.  BEGIN records don't have
		 * a checksum.
		 */
		if (drr->drr_type != DRR_BEGIN) {
			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
		}
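
		/*
		 * Emit the (possibly rewritten) record and its payload,
		 * folding both into the running stream checksum.
		 */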
		if (dump_record(drr, buf, payload_size,
		    &stream_cksum, outfd) != 0)
			break;
		if (drr->drr_type == DRR_END) {
			/*
			 * Typically the END record is either the last
			 * thing in the stream, or it is followed
			 * by a BEGIN record (which also zeros the checksum).
			 * However, a stream package ends with two END
			 * records.  The last END record's checksum starts
			 * from zero.
			 */
			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
		}
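
		/*
		 * Note where the next record begins in the input file;
		 * for WRITE records this is the offset that rdt_insert()
		 * stores above.
		 */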
		offset = ftell(ofp);
	}
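
	/*
	 * rdt.ddt_count is the number of WRITE records hashed, so the
	 * table consumed about rdt.ddt_count * sizeof (redup_entry_t)
	 * bytes.
	 */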
	if (verbose) {
		char mem_str[16];
		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
		    mem_str, sizeof (mem_str));
		fprintf(stderr, "converted stream with %llu total records, "
		    "including %llu dedup records, using %sB memory.\n",
		    (long long)num_records,
		    (long long)num_write_byref_records,
		    mem_str);
	}

	umem_cache_destroy(rdt.ddecache);
	free(rdt.redup_hash_array);
	free(buf);
	(void) fclose(ofp);
}
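
/*
 * Entry point for `zstream redup`.  Typical usage (the input must be a
 * seekable file rather than a pipe, since referenced WRITE records are
 * re-read by offset):
 *
 *	zstream redup [-v] <file> > <converted-stream>
 */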
int
zstream_do_redup(int argc, char *argv[])
{
	boolean_t verbose = B_FALSE;
	int c;

	while ((c = getopt(argc, argv, "v")) != -1) {
		switch (c) {
		case 'v':
			verbose = B_TRUE;
			break;
		case '?':
			(void) fprintf(stderr, "invalid option '%c'\n",
			    optopt);
			zstream_usage();
			break;
		}
	}

	argc -= optind;
	argv += optind;

	if (argc != 1)
		zstream_usage();

	const char *filename = argv[0];

	if (isatty(STDOUT_FILENO)) {
		(void) fprintf(stderr,
		    "Error: Stream can not be written to a terminal.\n"
		    "You must redirect standard output.\n");
		return (1);
	}

	int fd = open(filename, O_RDONLY);
	if (fd == -1) {
		(void) fprintf(stderr,
		    "Error while opening file '%s': %s\n",
		    filename, strerror(errno));
		exit(1);
	}

	fletcher_4_init();
	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
	fletcher_4_fini();

	close(fd);

	return (0);
}