DLPX-44812 integrate EP-220 large memory scalability

This commit is contained in:
David Quigley 2016-07-22 11:52:49 -04:00 committed by Brian Behlendorf
parent 616fa7c02b
commit a6255b7fce
49 changed files with 2625 additions and 798 deletions

View File

@ -23,6 +23,8 @@
* Copyright (C) 2016 Gvozden Nešković. All rights reserved. * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/ */
#ifdef _ABD_READY_
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/wait.h> #include <sys/wait.h>
@ -225,3 +227,4 @@ run_raidz_benchmark(void)
bench_fini_raidz_maps(); bench_fini_raidz_maps();
} }
#endif

View File

@ -32,6 +32,16 @@
#include <sys/vdev_raidz_impl.h> #include <sys/vdev_raidz_impl.h>
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#ifndef _ABD_READY_
/*
 * Placeholder entry point built when _ABD_READY_ is not defined.  The real
 * test program lives in the #else branch below, so this stub simply reports
 * success without doing any work.
 */
int
main(int argc, char **argv)
{
	/* Arguments are intentionally ignored by the stub. */
	(void) argc;
	(void) argv;

	return (0);
}
#else
#include "raidz_test.h" #include "raidz_test.h"
static int *rand_data; static int *rand_data;
@ -782,3 +792,4 @@ main(int argc, char **argv)
return (err); return (err);
} }
#endif

View File

@ -59,6 +59,7 @@
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/ddt.h> #include <sys/ddt.h>
#include <sys/zfeature.h> #include <sys/zfeature.h>
#include <sys/abd.h>
#include <zfs_comutil.h> #include <zfs_comutil.h>
#include <libzfs.h> #include <libzfs.h>
@ -2464,7 +2465,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private; zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark; zbookmark_phys_t *zb = &zio->io_bookmark;
zio_data_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--; spa->spa_scrub_inflight--;
@ -2530,7 +2531,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) && if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp); size_t size = BP_GET_PSIZE(bp);
void *data = zio_data_buf_alloc(size); abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
/* If it's an intent log block, failure is expected. */ /* If it's an intent log block, failure is expected. */
@ -2543,7 +2544,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++; spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock); mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(NULL, spa, bp, data, size, zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
} }
@ -3321,6 +3322,13 @@ name:
return (NULL); return (NULL);
} }
/*
 * Adapter that lets random_get_pseudo_bytes() be used as an
 * abd_iterate_func() callback: fills the len bytes at buf and hands back
 * the status of random_get_pseudo_bytes() unchanged.  The private argument
 * is not used.
 */
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
	int err = random_get_pseudo_bytes(buf, len);

	return (err);
}
/* /*
* Read a block from a pool and print it out. The syntax of the * Read a block from a pool and print it out. The syntax of the
* block descriptor is: * block descriptor is:
@ -3352,7 +3360,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio; zio_t *zio;
vdev_t *vd; vdev_t *vd;
void *pbuf, *lbuf, *buf; abd_t *pabd;
void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr; char *s, *p, *dup, *vdev, *flagstr;
int i, error; int i, error;
@ -3425,8 +3434,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size; psize = size;
lsize = size; lsize = size;
/* Some 4K native devices require 4K buffer alignment */ pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, PAGESIZE, UMEM_NOFAIL);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
BP_ZERO(bp); BP_ZERO(bp);
@ -3454,15 +3462,15 @@ zdb_read_block(char *thing, spa_t *spa)
/* /*
* Treat this as a normal block read. * Treat this as a normal block read.
*/ */
zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else { } else {
/* /*
* Treat this as a vdev child I/O. * Treat this as a vdev child I/O.
*/ */
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@ -3485,13 +3493,13 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
bcopy(pbuf, pbuf2, psize); abd_copy_to_buf(pbuf2, pabd, psize);
VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
SPA_MAXBLOCKSIZE - psize) == 0); random_get_pseudo_bytes_cb, NULL));
VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
SPA_MAXBLOCKSIZE - psize) == 0); SPA_MAXBLOCKSIZE - psize));
/* /*
* XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB, * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB,
@ -3506,10 +3514,10 @@ zdb_read_block(char *thing, spa_t *spa)
"Trying %05llx -> %05llx (%s)\n", "Trying %05llx -> %05llx (%s)\n",
(u_longlong_t)psize, (u_longlong_t)lsize, (u_longlong_t)psize, (u_longlong_t)lsize,
zio_compress_table[c].ci_name); zio_compress_table[c].ci_name);
if (zio_decompress_data(c, pbuf, lbuf, if (zio_decompress_data(c, pabd,
psize, lsize) == 0 && lbuf, psize, lsize) == 0 &&
zio_decompress_data(c, pbuf2, lbuf2, zio_decompress_data_buf(c, pbuf2,
psize, lsize) == 0 && lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0) bcmp(lbuf, lbuf2, lsize) == 0)
break; break;
} }
@ -3527,7 +3535,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf; buf = lbuf;
size = lsize; size = lsize;
} else { } else {
buf = pbuf; buf = abd_to_buf(pabd);
size = psize; size = psize;
} }
@ -3545,7 +3553,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags); zdb_dump_block(thing, buf, size, flags);
out: out:
umem_free(pbuf, SPA_MAXBLOCKSIZE); abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE); umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup); free(dup);
} }

View File

@ -25,7 +25,7 @@
*/ */
/* /*
* Copyright (c) 2013, 2014 by Delphix. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/ */
/* /*
@ -42,6 +42,7 @@
#include <sys/resource.h> #include <sys/resource.h>
#include <sys/zil.h> #include <sys/zil.h>
#include <sys/zil_impl.h> #include <sys/zil_impl.h>
#include <sys/abd.h>
extern uint8_t dump_opt[256]; extern uint8_t dump_opt[256];
@ -119,14 +120,30 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
(void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm); (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm);
} }
/*
 * abd_iterate_func() callback used when dumping the payload of a write
 * record: prints the len bytes starting at data to stdout.  A printable
 * byte is emitted as the character followed by a space; any other byte is
 * emitted as an upper-case hex value padded to two columns.
 *
 * The bytes are deliberately read as unsigned char.  Passing a negative
 * plain char to isprint() is undefined behaviour, and promoting a negative
 * char for "%X" prints a sign-extended value (e.g. FFFFFFE9) instead of the
 * two hex digits the dump format expects.
 *
 * The private argument is unused.  Always returns 0.
 */
/* ARGSUSED */
static int
zil_prt_rec_write_cb(void *data, size_t len, void *unused)
{
	const unsigned char *cdata = data;
	size_t i;

	for (i = 0; i < len; i++) {
		if (isprint(cdata[i]))
			(void) printf("%c ", cdata[i]);
		else
			(void) printf("%2X", (unsigned int)cdata[i]);
	}
	return (0);
}
/* ARGSUSED */ /* ARGSUSED */
static void static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{ {
char *data, *dlimit; abd_t *data;
blkptr_t *bp = &lr->lr_blkptr; blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb; zbookmark_phys_t zb;
char *buf;
int verbose = MAX(dump_opt['d'], dump_opt['i']); int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error; int error;
@ -137,9 +154,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (txtype == TX_WRITE2 || verbose < 5) if (txtype == TX_WRITE2 || verbose < 5)
return; return;
if ((buf = malloc(SPA_MAXBLOCKSIZE)) == NULL)
return;
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", prefix, (void) printf("%shas blkptr, %s\n", prefix,
!BP_IS_HOLE(bp) && !BP_IS_HOLE(bp) &&
@ -150,43 +164,38 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) { if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n", (void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp)); (u_longlong_t)BP_GET_LSIZE(bp));
bzero(buf, SPA_MAXBLOCKSIZE);
(void) printf("%s<hole>\n", prefix); (void) printf("%s<hole>\n", prefix);
goto exit; return;
} }
if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
(void) printf("%s<block already committed>\n", prefix); (void) printf("%s<block already committed>\n", prefix);
goto exit; return;
} }
SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp)); lr->lr_offset / BP_GET_LSIZE(bp));
data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
error = zio_wait(zio_read(NULL, zilog->zl_spa, error = zio_wait(zio_read(NULL, zilog->zl_spa,
bp, buf, BP_GET_LSIZE(bp), NULL, NULL, bp, data, BP_GET_LSIZE(bp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error) if (error)
goto exit; goto out;
data = buf;
} else { } else {
data = (char *)(lr + 1); /* data is stored after the end of the lr_write record */
data = abd_alloc(lr->lr_length, B_FALSE);
abd_copy_from_buf(data, lr + 1, lr->lr_length);
} }
dlimit = data + MIN(lr->lr_length,
(verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
(void) printf("%s", prefix); (void) printf("%s", prefix);
while (data < dlimit) { (void) abd_iterate_func(data,
if (isprint(*data)) 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
(void) printf("%c ", *data); zil_prt_rec_write_cb, NULL);
else
(void) printf("%2hhX", *data);
data++;
}
(void) printf("\n"); (void) printf("\n");
exit:
free(buf); out:
abd_free(data);
} }
/* ARGSUSED */ /* ARGSUSED */

View File

@ -114,6 +114,7 @@
#include <sys/refcount.h> #include <sys/refcount.h>
#include <sys/zfeature.h> #include <sys/zfeature.h>
#include <sys/dsl_userhold.h> #include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h> #include <stdio.h>
#include <stdio_ext.h> #include <stdio_ext.h>
#include <stdlib.h> #include <stdlib.h>
@ -193,6 +194,7 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold; extern uint64_t metaslab_df_alloc_threshold;
extern int metaslab_preload_limit; extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled; extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts; static ztest_shared_opts_t ztest_opts;
@ -5444,7 +5446,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
enum zio_checksum checksum = spa_dedup_checksum(spa); enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db; dmu_buf_t *db;
dmu_tx_t *tx; dmu_tx_t *tx;
void *buf; abd_t *abd;
blkptr_t blk; blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN; int copies = 2 * ZIO_DEDUPDITTO_MIN;
int i; int i;
@ -5525,14 +5527,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
* Damage the block. Dedup-ditto will save us when we read it later. * Damage the block. Dedup-ditto will save us when we read it later.
*/ */
psize = BP_GET_PSIZE(&blk); psize = BP_GET_PSIZE(&blk);
buf = zio_buf_alloc(psize); abd = abd_alloc_linear(psize, B_TRUE);
ztest_pattern_set(buf, psize, ~pattern); ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
zio_buf_free(buf, psize); abd_free(abd);
(void) rw_unlock(&ztest_name_lock); (void) rw_unlock(&ztest_name_lock);
umem_free(od, sizeof (ztest_od_t)); umem_free(od, sizeof (ztest_od_t));
@ -5965,6 +5967,12 @@ ztest_resume_thread(void *arg)
*/ */
if (ztest_random(10) == 0) if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2); zfs_compressed_arc_enabled = ztest_random(2);
/*
* Periodically change the zfs_abd_scatter_enabled setting.
*/
if (ztest_random(10) == 0)
zfs_abd_scatter_enabled = ztest_random(2);
} }
thread_exit(); thread_exit();

View File

@ -1,6 +1,7 @@
SUBDIRS = fm fs crypto sysevent SUBDIRS = fm fs crypto sysevent
COMMON_H = \ COMMON_H = \
$(top_srcdir)/include/sys/abd.h \
$(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc.h \
$(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/arc_impl.h \
$(top_srcdir)/include/sys/avl.h \ $(top_srcdir)/include/sys/avl.h \

160
include/sys/abd.h Normal file
View File

@ -0,0 +1,160 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _ABD_H
#define _ABD_H
#include <sys/isa_defs.h>
#include <sys/int_types.h>
#include <sys/debug.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <linux/mm.h>
#include <sys/uio.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Bit flags stored in abd_t.abd_flags.  Each value occupies its own bit so
 * the flags can be OR-ed together.
 */
typedef enum abd_flags {
	ABD_FLAG_LINEAR	= 0x1,	/* is buffer linear (or scattered)? */
	ABD_FLAG_OWNER	= 0x2,	/* does it own its data buffers? */
	ABD_FLAG_META	= 0x4	/* does this represent FS metadata? */
} abd_flags_t;
/*
 * Descriptor for a data buffer that is either linear or scattered.
 * ABD_FLAG_LINEAR in abd_flags (see abd_is_linear()) distinguishes the two;
 * presumably it also selects which member of abd_u is valid -- confirm
 * against abd.c.
 */
typedef struct abd {
/* bitmask of ABD_FLAG_* values */
abd_flags_t abd_flags;
uint_t abd_size; /* excludes scattered abd_offset */
/*
 * NOTE(review): abd_parent/abd_children presumably tie an ABD obtained
 * from another ABD (e.g. via abd_get_offset()) to its source and count
 * such derived ABDs -- confirm against abd.c.
 */
struct abd *abd_parent;
refcount_t abd_children;
union {
/* scattered layout: data described by an array of page pointers */
struct abd_scatter {
uint_t abd_offset;
uint_t abd_chunk_size;
/* flexible array member: must remain the last field */
struct page *abd_chunks[];
} abd_scatter;
/* linear layout: data reachable through a single buffer pointer */
struct abd_linear {
void *abd_buf;
} abd_linear;
} abd_u;
} abd_t;
/*
 * Callback signatures accepted by abd_iterate_func() (one buffer) and
 * abd_iterate_func2() (two buffers).  Each invocation receives a pointer,
 * or pair of pointers, to len bytes plus the caller-supplied private
 * argument.
 *
 * NOTE(review): callers in this change wrap abd_iterate_func() in VERIFY0(),
 * so a non-zero return presumably reports failure -- confirm against abd.c.
 */
typedef int abd_iter_func_t(void *buf, size_t len, void *private);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private);
extern int zfs_abd_scatter_enabled;
/*
 * Report whether ABD_FLAG_LINEAR is set in abd->abd_flags.  The result is
 * normalised to 0 or 1 before being returned.
 */
static inline boolean_t
abd_is_linear(abd_t *abd)
{
	return (!!(abd->abd_flags & ABD_FLAG_LINEAR));
}
/*
* Allocations and deallocations
*/
abd_t *abd_alloc(size_t, boolean_t);
abd_t *abd_alloc_linear(size_t, boolean_t);
abd_t *abd_alloc_for_io(size_t, boolean_t);
abd_t *abd_alloc_sametype(abd_t *, size_t);
void abd_free(abd_t *);
abd_t *abd_get_offset(abd_t *, size_t);
abd_t *abd_get_from_buf(void *, size_t);
void abd_put(abd_t *);
/*
* Conversion to and from a normal buffer
*/
void *abd_to_buf(abd_t *);
void *abd_borrow_buf(abd_t *, size_t);
void *abd_borrow_buf_copy(abd_t *, size_t);
void abd_return_buf(abd_t *, void *, size_t);
void abd_return_buf_copy(abd_t *, void *, size_t);
void abd_take_ownership_of_buf(abd_t *, boolean_t);
void abd_release_ownership_of_buf(abd_t *);
/*
* ABD operations
*/
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
int abd_cmp(abd_t *, abd_t *);
int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
void abd_zero_off(abd_t *, size_t, size_t);
/*
* Wrappers for calls with offsets of 0
*/
/*
 * Convenience form of abd_copy_off() with both offsets fixed at 0:
 * forwards dabd, sabd and size unchanged.
 */
static inline void
abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
{
	const size_t start = 0;

	abd_copy_off(dabd, sabd, start, start, size);
}
/*
 * Convenience form of abd_copy_from_buf_off() with the offset fixed at 0:
 * forwards abd, buf and size unchanged.
 *
 * buf is const-qualified because it is only passed on to
 * abd_copy_from_buf_off(), which itself takes a const void *; this lets
 * callers hand in read-only data without a cast and remains compatible with
 * existing callers that pass a plain void *.
 */
static inline void
abd_copy_from_buf(abd_t *abd, const void *buf, size_t size)
{
	abd_copy_from_buf_off(abd, buf, 0, size);
}
/*
 * Convenience form of abd_copy_to_buf_off() with the offset fixed at 0:
 * forwards buf, abd and size unchanged.
 */
static inline void
abd_copy_to_buf(void *buf, abd_t *abd, size_t size)
{
	const size_t start = 0;

	abd_copy_to_buf_off(buf, abd, start, size);
}
/*
 * Convenience form of abd_cmp_buf_off() with the offset fixed at 0.
 * Returns whatever abd_cmp_buf_off() returns for (abd, buf, 0, size).
 *
 * buf is const-qualified because it is only passed on to abd_cmp_buf_off(),
 * which itself takes a const void *; existing callers passing a plain
 * void * are unaffected.
 */
static inline int
abd_cmp_buf(abd_t *abd, const void *buf, size_t size)
{
	return (abd_cmp_buf_off(abd, buf, 0, size));
}
/*
 * Convenience form of abd_zero_off() with the offset fixed at 0:
 * forwards abd and size unchanged.
 */
static inline void
abd_zero(abd_t *abd, size_t size)
{
	const size_t start = 0;

	abd_zero_off(abd, start, size);
}
/*
* Module lifecycle
*/
void abd_init(void);
void abd_fini(void);
#ifdef __cplusplus
}
#endif
#endif /* _ABD_H */

View File

@ -166,7 +166,7 @@ typedef struct l1arc_buf_hdr {
refcount_t b_refcnt; refcount_t b_refcnt;
arc_callback_t *b_acb; arc_callback_t *b_acb;
void *b_pdata; abd_t *b_pabd;
} l1arc_buf_hdr_t; } l1arc_buf_hdr_t;
typedef struct l2arc_dev { typedef struct l2arc_dev {

View File

@ -20,6 +20,7 @@
*/ */
/* /*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_DDT_H #ifndef _SYS_DDT_H
@ -35,6 +36,8 @@
extern "C" { extern "C" {
#endif #endif
struct abd;
/* /*
* On-disk DDT formats, in the desired search order (newest version first). * On-disk DDT formats, in the desired search order (newest version first).
*/ */
@ -108,7 +111,7 @@ struct ddt_entry {
ddt_key_t dde_key; ddt_key_t dde_key;
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; ddt_phys_t dde_phys[DDT_PHYS_TYPES];
zio_t *dde_lead_zio[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES];
void *dde_repair_data; struct abd *dde_repair_abd;
enum ddt_type dde_type; enum ddt_type dde_type;
enum ddt_class dde_class; enum ddt_class dde_class;
uint8_t dde_loading; uint8_t dde_loading;

View File

@ -416,6 +416,9 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
/*
 * Non-zero when the block pointer's level is above 0 or its object type
 * satisfies DMU_OT_IS_METADATA().  Note that bp is expanded twice, so the
 * argument must not have side effects.
 */
#define	BP_IS_METADATA(bp)	\
	((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp))))
#define BP_GET_ASIZE(bp) \ #define BP_GET_ASIZE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \ (BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
@ -423,8 +426,7 @@ _NOTE(CONSTCOND) } while (0)
DVA_GET_ASIZE(&(bp)->blk_dva[2])) DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \ #define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \ #define BP_GET_NDVAS(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \ (BP_IS_EMBEDDED(bp) ? 0 : \
@ -569,8 +571,7 @@ _NOTE(CONSTCOND) } while (0)
} }
#define BP_GET_BUFC_TYPE(bp) \ #define BP_GET_BUFC_TYPE(bp) \
(((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
ARC_BUFC_METADATA : ARC_BUFC_DATA)
typedef enum spa_import_type { typedef enum spa_import_type {
SPA_IMPORT_EXISTING, SPA_IMPORT_EXISTING,

View File

@ -53,6 +53,7 @@ extern "C" {
typedef struct vdev_queue vdev_queue_t; typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t; typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern int zfs_vdev_queue_depth_pct; extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active; extern uint32_t zfs_vdev_async_write_max_active;
@ -87,7 +88,7 @@ typedef const struct vdev_ops {
* Virtual device properties * Virtual device properties
*/ */
struct vdev_cache_entry { struct vdev_cache_entry {
char *ve_data; struct abd *ve_abd;
uint64_t ve_offset; uint64_t ve_offset;
clock_t ve_lastused; clock_t ve_lastused;
avl_node_t ve_offset_node; avl_node_t ve_offset_node;

View File

@ -28,6 +28,7 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/debug.h> #include <sys/debug.h>
#include <sys/kstat.h> #include <sys/kstat.h>
#include <sys/abd.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -104,7 +105,7 @@ typedef struct raidz_col {
size_t rc_devidx; /* child device index for I/O */ size_t rc_devidx; /* child device index for I/O */
size_t rc_offset; /* device offset */ size_t rc_offset; /* device offset */
size_t rc_size; /* I/O size */ size_t rc_size; /* I/O size */
void *rc_data; /* I/O data */ abd_t *rc_abd; /* I/O data */
void *rc_gdata; /* used to store the "good" version */ void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */ int rc_error; /* I/O error for this device */
unsigned int rc_tried; /* Did we attempt this I/O column? */ unsigned int rc_tried; /* Did we attempt this I/O column? */
@ -121,7 +122,7 @@ typedef struct raidz_map {
size_t rm_firstdatacol; /* First data column/parity count */ size_t rm_firstdatacol; /* First data column/parity count */
size_t rm_nskip; /* Skipped sectors for padding */ size_t rm_nskip; /* Skipped sectors for padding */
size_t rm_skipstart; /* Column index of padding start */ size_t rm_skipstart; /* Column index of padding start */
void *rm_datacopy; /* rm_asize-buffer of copied data */ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
size_t rm_reports; /* # of referencing checksum reports */ size_t rm_reports; /* # of referencing checksum reports */
unsigned int rm_freed; /* map no longer has referencing ZIO */ unsigned int rm_freed; /* map no longer has referencing ZIO */
unsigned int rm_ecksuminjected; /* checksum error was injected */ unsigned int rm_ecksuminjected; /* checksum error was injected */

View File

@ -301,6 +301,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size);
struct zio_bad_cksum; /* defined in zio_checksum.h */ struct zio_bad_cksum; /* defined in zio_checksum.h */
struct dnode_phys; struct dnode_phys;
struct abd;
struct zio_cksum_report { struct zio_cksum_report {
struct zio_cksum_report *zcr_next; struct zio_cksum_report *zcr_next;
@ -333,12 +334,12 @@ typedef struct zio_gang_node {
} zio_gang_node_t; } zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
zio_gang_node_t *gn, void *data); zio_gang_node_t *gn, struct abd *data, uint64_t offset);
typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
typedef struct zio_transform { typedef struct zio_transform {
void *zt_orig_data; struct abd *zt_orig_abd;
uint64_t zt_orig_size; uint64_t zt_orig_size;
uint64_t zt_bufsize; uint64_t zt_bufsize;
zio_transform_func_t *zt_transform; zio_transform_func_t *zt_transform;
@ -396,8 +397,8 @@ struct zio {
uint64_t io_lsize; uint64_t io_lsize;
/* Data represented by this I/O */ /* Data represented by this I/O */
void *io_data; struct abd *io_abd;
void *io_orig_data; struct abd *io_orig_abd;
uint64_t io_size; uint64_t io_size;
uint64_t io_orig_size; uint64_t io_orig_size;
@ -455,19 +456,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
extern zio_t *zio_root(spa_t *spa, extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *private, enum zio_flag flags); zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
uint64_t lsize, zio_done_func_t *done, void *private, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done, zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags, void *private, zio_priority_t priority, enum zio_flag flags,
const zbookmark_phys_t *zb); const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private, struct abd *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
@ -483,12 +484,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, enum zio_flag flags); zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum, uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority, zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels); enum zio_flag flags, boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum, uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority, zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels); enum zio_flag flags, boolean_t labels);
@ -517,21 +518,20 @@ extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size); extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size); extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size); extern void zio_data_buf_free(void *buf, size_t size);
extern void *zio_buf_alloc_flags(size_t size, int flags);
extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform); uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio); extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *); extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
uint64_t offset, void *data, uint64_t size, int type, uint64_t offset, struct abd *data, uint64_t size, int type,
zio_priority_t priority, enum zio_flag flags, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private); zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority, struct abd *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private); enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_bypass(zio_t *zio);

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2015 by Delphix. All rights reserved. * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
* Copyright Saso Kiselkov 2013, All rights reserved. * Copyright Saso Kiselkov 2013, All rights reserved.
*/ */
@ -34,12 +34,12 @@
extern "C" { extern "C" {
#endif #endif
struct abd;
/* /*
* Signature for checksum functions. * Signature for checksum functions.
*/ */
typedef void zio_checksum_func_t(const void *, uint64_t, const void *, typedef void zio_checksum_t(struct abd *abd, uint64_t size,
zio_cksum_t *);
typedef void zio_checksum_t(const void *data, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp); const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template); typedef void zio_checksum_tmpl_free_t(void *ctx_template);
@ -83,28 +83,28 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/* /*
* Checksum routines. * Checksum routines.
*/ */
extern zio_checksum_t zio_checksum_SHA256; extern zio_checksum_t abd_checksum_SHA256;
extern zio_checksum_t zio_checksum_SHA512_native; extern zio_checksum_t abd_checksum_SHA512_native;
extern zio_checksum_t zio_checksum_SHA512_byteswap; extern zio_checksum_t abd_checksum_SHA512_byteswap;
/* Skein */ /* Skein */
extern zio_checksum_t zio_checksum_skein_native; extern zio_checksum_t abd_checksum_skein_native;
extern zio_checksum_t zio_checksum_skein_byteswap; extern zio_checksum_t abd_checksum_skein_byteswap;
extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init; extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free; extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
/* Edon-R */ /* Edon-R */
extern zio_checksum_t zio_checksum_edonr_native; extern zio_checksum_t abd_checksum_edonr_native;
extern zio_checksum_t zio_checksum_edonr_byteswap; extern zio_checksum_t abd_checksum_edonr_byteswap;
extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *); void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, extern void zio_checksum_compute(zio_t *, enum zio_checksum,
void *data, uint64_t size); struct abd *, uint64_t);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *); struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa);

View File

@ -22,12 +22,14 @@
/* /*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
* Copyright (c) 2015 by Delphix. All rights reserved. * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/ */
#ifndef _SYS_ZIO_COMPRESS_H #ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H
#include <sys/abd.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
@ -59,14 +61,21 @@ typedef size_t zio_compress_func_t(void *src, void *dst,
typedef int zio_decompress_func_t(void *src, void *dst, typedef int zio_decompress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int); size_t s_len, size_t d_len, int);
/*
* Common signature for all zio decompress functions using an ABD as input.
* This is helpful if you have both compressed ARC and scatter ABDs enabled,
* but is not a requirement for all compression algorithms.
*/
typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
size_t s_len, size_t d_len, int);
/* /*
* Information about each compression function. * Information about each compression function.
*/ */
typedef const struct zio_compress_info { typedef const struct zio_compress_info {
zio_compress_func_t *ci_compress; /* compression function */ char *ci_name;
zio_decompress_func_t *ci_decompress; /* decompression function */ int ci_level;
int ci_level; /* level parameter */ zio_compress_func_t *ci_compress;
char *ci_name; /* algorithm name */ zio_decompress_func_t *ci_decompress;
} zio_compress_info_t; } zio_compress_info_t;
extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
@ -96,13 +105,16 @@ extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level); int level);
extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len,
int level); int level);
extern int lz4_decompress_abd(abd_t *src, void *dst, size_t s_len, size_t d_len,
int level);
/* /*
* Compress and decompress data if necessary. * Compress and decompress data if necessary.
*/ */
extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len); size_t s_len);
extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len, size_t d_len);
extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len); size_t s_len, size_t d_len);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -48,15 +48,16 @@ extern "C" {
* checksum method is added. This method will ignore last (size % 4) bytes of * checksum method is added. This method will ignore last (size % 4) bytes of
* the data buffer. * the data buffer.
*/ */
void fletcher_init(zio_cksum_t *);
void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
int fletcher_2_incremental_native(void *, size_t, void *);
int fletcher_2_incremental_byteswap(void *, size_t, void *);
void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *);
void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
void fletcher_4_incremental_native(const void *, uint64_t, int fletcher_4_incremental_native(void *, size_t, void *);
zio_cksum_t *); int fletcher_4_incremental_byteswap(void *, size_t, void *);
void fletcher_4_incremental_byteswap(const void *, uint64_t,
zio_cksum_t *);
int fletcher_4_impl_set(const char *selector); int fletcher_4_impl_set(const char *selector);
void fletcher_4_init(void); void fletcher_4_init(void);
void fletcher_4_fini(void); void fletcher_4_fini(void);

View File

@ -366,11 +366,12 @@ cksummer(void *arg)
if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
zero_cksum) || zero_cksum) ||
!DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
SHA256_CTX ctx; SHA2_CTX ctx;
zio_cksum_t tmpsha256; zio_cksum_t tmpsha256;
zio_checksum_SHA256(buf, SHA2Init(SHA256, &ctx);
payload_size, &ctx, &tmpsha256); SHA2Update(&ctx, buf, payload_size);
SHA2Final(&tmpsha256, &ctx);
drrw->drr_key.ddk_cksum.zc_word[0] = drrw->drr_key.ddk_cksum.zc_word[0] =
BE_64(tmpsha256.zc_word[0]); BE_64(tmpsha256.zc_word[0]);

View File

@ -33,6 +33,7 @@ KERNEL_C = \
zfs_uio.c \ zfs_uio.c \
zpool_prop.c \ zpool_prop.c \
zprop_common.c \ zprop_common.c \
abd.c \
arc.c \ arc.c \
blkptr.c \ blkptr.c \
bplist.c \ bplist.c \

View File

@ -27,6 +27,10 @@
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
*/ */
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
/* /*
* Fletcher Checksums * Fletcher Checksums
* ------------------ * ------------------
@ -219,14 +223,26 @@ static boolean_t fletcher_4_initialized = B_FALSE;
/*ARGSUSED*/ /*ARGSUSED*/
void void
fletcher_2_native(const void *buf, uint64_t size, fletcher_init(zio_cksum_t *zcp)
const void *ctx_template, zio_cksum_t *zcp)
{ {
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
int
fletcher_2_incremental_native(void *buf, size_t size, void *data)
{
zio_cksum_t *zcp = data;
const uint64_t *ip = buf; const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t)); const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1; uint64_t a0, b0, a1, b1;
for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { a0 = zcp->zc_word[0];
a1 = zcp->zc_word[1];
b0 = zcp->zc_word[2];
b1 = zcp->zc_word[3];
for (; ip < ipend; ip += 2) {
a0 += ip[0]; a0 += ip[0];
a1 += ip[1]; a1 += ip[1];
b0 += a0; b0 += a0;
@ -234,18 +250,33 @@ fletcher_2_native(const void *buf, uint64_t size,
} }
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
return (0);
} }
/*ARGSUSED*/ /*ARGSUSED*/
void void
fletcher_2_byteswap(const void *buf, uint64_t size, fletcher_2_native(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
fletcher_init(zcp);
(void) fletcher_2_incremental_native((void *) buf, size, zcp);
}
int
fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
{
zio_cksum_t *zcp = data;
const uint64_t *ip = buf; const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t)); const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1; uint64_t a0, b0, a1, b1;
for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { a0 = zcp->zc_word[0];
a1 = zcp->zc_word[1];
b0 = zcp->zc_word[2];
b1 = zcp->zc_word[3];
for (; ip < ipend; ip += 2) {
a0 += BSWAP_64(ip[0]); a0 += BSWAP_64(ip[0]);
a1 += BSWAP_64(ip[1]); a1 += BSWAP_64(ip[1]);
b0 += a0; b0 += a0;
@ -253,6 +284,16 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
} }
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
return (0);
}
/*ARGSUSED*/
void
fletcher_2_byteswap(const void *buf, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
fletcher_init(zcp);
(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
} }
static void static void
@ -523,25 +564,28 @@ fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
} }
} }
void int
fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp) fletcher_4_incremental_native(void *buf, size_t size, void *data)
{ {
zio_cksum_t *zcp = data;
/* Use scalar impl to directly update cksum of small blocks */ /* Use scalar impl to directly update cksum of small blocks */
if (size < SPA_MINBLOCKSIZE) if (size < SPA_MINBLOCKSIZE)
fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
else else
fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
return (0);
} }
void int
fletcher_4_incremental_byteswap(const void *buf, uint64_t size, fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
zio_cksum_t *zcp)
{ {
zio_cksum_t *zcp = data;
/* Use scalar impl to directly update cksum of small blocks */ /* Use scalar impl to directly update cksum of small blocks */
if (size < SPA_MINBLOCKSIZE) if (size < SPA_MINBLOCKSIZE)
fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
else else
fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
return (0);
} }
@ -607,6 +651,9 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
#define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ #define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */
typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
zio_cksum_t *);
static void static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{ {
@ -618,8 +665,9 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
zio_cksum_t zc; zio_cksum_t zc;
uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native :
fletcher_4_byteswap; fletcher_checksum_func_t *fletcher_4_test = native ?
fletcher_4_native : fletcher_4_byteswap;
for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
@ -769,6 +817,9 @@ module_param_call(zfs_fletcher_4_impl,
fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
EXPORT_SYMBOL(fletcher_init);
EXPORT_SYMBOL(fletcher_2_incremental_native);
EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
EXPORT_SYMBOL(fletcher_4_init); EXPORT_SYMBOL(fletcher_4_init);
EXPORT_SYMBOL(fletcher_4_fini); EXPORT_SYMBOL(fletcher_4_fini);
EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_native);

View File

@ -7,6 +7,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
obj-$(CONFIG_ZFS) := $(MODULE).o obj-$(CONFIG_ZFS) := $(MODULE).o
$(MODULE)-objs += abd.o
$(MODULE)-objs += arc.o $(MODULE)-objs += arc.o
$(MODULE)-objs += blkptr.o $(MODULE)-objs += blkptr.o
$(MODULE)-objs += bplist.o $(MODULE)-objs += bplist.o

1008
module/zfs/abd.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -136,14 +136,14 @@
* the arc_buf_hdr_t that will point to the data block in memory. A block can * the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
* caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
* *
* The L1ARC's data pointer may or may not be uncompressed. The ARC has the * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
* ability to store the physical data (b_pdata) associated with the DVA of the * ability to store the physical data (b_pabd) associated with the DVA of the
* arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
* it will match its on-disk compression characteristics. This behavior can be * it will match its on-disk compression characteristics. This behavior can be
* disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
* compressed ARC functionality is disabled, the b_pdata will point to an * compressed ARC functionality is disabled, the b_pabd will point to an
* uncompressed version of the on-disk data. * uncompressed version of the on-disk data.
* *
* Data in the L1ARC is not accessed by consumers of the ARC directly. Each * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
@ -182,7 +182,7 @@
* | l1arc_buf_hdr_t * | l1arc_buf_hdr_t
* | | arc_buf_t * | | arc_buf_t
* | b_buf +------------>+-----------+ arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t
* | b_pdata +-+ |b_next +---->+-----------+ * | b_pabd +-+ |b_next +---->+-----------+
* +-----------+ | |-----------| |b_next +-->NULL * +-----------+ | |-----------| |b_next +-->NULL
* | |b_comp = T | +-----------+ * | |b_comp = T | +-----------+
* | |b_data +-+ |b_comp = F | * | |b_data +-+ |b_comp = F |
@ -199,8 +199,8 @@
* When a consumer reads a block, the ARC must first look to see if the * When a consumer reads a block, the ARC must first look to see if the
* arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
* arc_buf_t and either copies uncompressed data into a new data buffer from an * arc_buf_t and either copies uncompressed data into a new data buffer from an
* existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
* new data buffer, or shares the hdr's b_pdata buffer, depending on whether the * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
* hdr is compressed and the desired compression characteristics of the * hdr is compressed and the desired compression characteristics of the
* arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
* arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
@ -224,7 +224,7 @@
* | | arc_buf_t (shared) * | | arc_buf_t (shared)
* | b_buf +------------>+---------+ arc_buf_t * | b_buf +------------>+---------+ arc_buf_t
* | | |b_next +---->+---------+ * | | |b_next +---->+---------+
* | b_pdata +-+ |---------| |b_next +-->NULL * | b_pabd +-+ |---------| |b_next +-->NULL
* +-----------+ | | | +---------+ * +-----------+ | | | +---------+
* | |b_data +-+ | | * | |b_data +-+ | |
* | +---------+ | |b_data +-+ * | +---------+ | |b_data +-+
@ -238,19 +238,19 @@
* | +------+ | * | +------+ |
* +---------------------------------+ * +---------------------------------+
* *
* Writing to the ARC requires that the ARC first discard the hdr's b_pdata * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
* since the physical block is about to be rewritten. The new data contents * since the physical block is about to be rewritten. The new data contents
* will be contained in the arc_buf_t. As the I/O pipeline performs the write, * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
* it may compress the data before writing it to disk. The ARC will be called * it may compress the data before writing it to disk. The ARC will be called
* with the transformed data and will bcopy the transformed on-disk block into * with the transformed data and will bcopy the transformed on-disk block into
* a newly allocated b_pdata. Writes are always done into buffers which have * a newly allocated b_pabd. Writes are always done into buffers which have
* either been loaned (and hence are new and don't have other readers) or * either been loaned (and hence are new and don't have other readers) or
* buffers which have been released (and hence have their own hdr, if there * buffers which have been released (and hence have their own hdr, if there
* were originally other readers of the buf's original hdr). This ensures that * were originally other readers of the buf's original hdr). This ensures that
* the ARC only needs to update a single buf and its hdr after a write occurs. * the ARC only needs to update a single buf and its hdr after a write occurs.
* *
* When the L2ARC is in use, it will also take advantage of the b_pdata. The * When the L2ARC is in use, it will also take advantage of the b_pabd. The
* L2ARC will always write the contents of b_pdata to the L2ARC. This means * L2ARC will always write the contents of b_pabd to the L2ARC. This means
* that when compressed ARC is enabled that the L2ARC blocks are identical * that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant * to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the * advantage since the ARC can leverage the bp's checksum when reading from the
@ -271,7 +271,9 @@
#include <sys/vdev.h> #include <sys/vdev.h>
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/dsl_pool.h> #include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h> #include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL #ifdef _KERNEL
#include <sys/vmsystm.h> #include <sys/vmsystm.h>
#include <vm/anon.h> #include <vm/anon.h>
@ -315,7 +317,7 @@ int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */ /* number of seconds before growing cache again */
static int arc_grow_retry = 5; static int arc_grow_retry = 5;
/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8; int zfs_arc_overflow_shift = 8;
/* shift of arc_c for calculating both min and max arc_p */ /* shift of arc_c for calculating both min and max arc_p */
@ -455,13 +457,13 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max; kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size; kstat_named_t arcstat_size;
/* /*
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
* Note that the compressed bytes may match the uncompressed bytes * Note that the compressed bytes may match the uncompressed bytes
* if the block is either not compressed or compressed arc is disabled. * if the block is either not compressed or compressed arc is disabled.
*/ */
kstat_named_t arcstat_compressed_size; kstat_named_t arcstat_compressed_size;
/* /*
* Uncompressed size of the data stored in b_pdata. If compressed * Uncompressed size of the data stored in b_pabd. If compressed
* arc is disabled then this value will be identical to the stat * arc is disabled then this value will be identical to the stat
* above. * above.
*/ */
@ -960,7 +962,7 @@ typedef struct l2arc_read_callback {
typedef struct l2arc_data_free { typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */ /* protected by l2arc_free_on_write_mtx */
void *l2df_data; abd_t *l2df_abd;
size_t l2df_size; size_t l2df_size;
arc_buf_contents_t l2df_type; arc_buf_contents_t l2df_type;
list_node_t l2df_list_node; list_node_t l2df_list_node;
@ -970,10 +972,14 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv; static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit; static uint8_t l2arc_thread_exit;
static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_hdr_free_pabd(arc_buf_hdr_t *);
static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing(void); static boolean_t arc_is_overflowing(void);
static void arc_buf_watch(arc_buf_t *); static void arc_buf_watch(arc_buf_t *);
@ -1336,7 +1342,9 @@ static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf) arc_buf_is_shared(arc_buf_t *buf)
{ {
boolean_t shared = (buf->b_data != NULL && boolean_t shared = (buf->b_data != NULL &&
buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); buf->b_hdr->b_l1hdr.b_pabd != NULL &&
abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
@ -1376,8 +1384,6 @@ arc_cksum_verify(arc_buf_t *buf)
return; return;
if (ARC_BUF_COMPRESSED(buf)) { if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
hdr->b_l1hdr.b_bufcnt > 1);
return; return;
} }
@ -1424,7 +1430,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
lsize = HDR_GET_LSIZE(hdr); lsize = HDR_GET_LSIZE(hdr);
csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
if (csize < HDR_GET_PSIZE(hdr)) { if (csize < HDR_GET_PSIZE(hdr)) {
/* /*
@ -1459,7 +1466,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
* logical I/O size and not just a gang fragment. * logical I/O size and not just a gang fragment.
*/ */
valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
zio->io_offset, NULL) == 0); zio->io_offset, NULL) == 0);
zio_pop_transforms(zio); zio_pop_transforms(zio);
return (valid_cksum); return (valid_cksum);
@ -1483,18 +1490,9 @@ arc_cksum_compute(arc_buf_t *buf)
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) { if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
ASSERT(!ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_bufcnt > 1);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock); mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return; return;
} else if (ARC_BUF_COMPRESSED(buf)) { } else if (ARC_BUF_COMPRESSED(buf)) {
/*
* Since the checksum doesn't apply to compressed buffers, we
* only keep a checksum if there are uncompressed buffers.
* Therefore there must be another buffer, which is
* uncompressed.
*/
IMPLY(hdr->b_l1hdr.b_freeze_cksum != NULL,
hdr->b_l1hdr.b_bufcnt > 1);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock); mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return; return;
} }
@ -1589,8 +1587,6 @@ arc_buf_thaw(arc_buf_t *buf)
* allocate b_thawed. * allocate b_thawed.
*/ */
if (ARC_BUF_COMPRESSED(buf)) { if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
hdr->b_l1hdr.b_bufcnt > 1);
return; return;
} }
@ -1609,8 +1605,6 @@ arc_buf_freeze(arc_buf_t *buf)
return; return;
if (ARC_BUF_COMPRESSED(buf)) { if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
hdr->b_l1hdr.b_bufcnt > 1);
return; return;
} }
@ -1740,7 +1734,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
if (hdr_compressed == compressed) { if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) { if (!arc_buf_is_shared(buf)) {
bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf)); arc_buf_size(buf));
} }
} else { } else {
@ -1792,7 +1786,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
return (0); return (0);
} else { } else {
int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
hdr->b_l1hdr.b_pdata, buf->b_data, hdr->b_l1hdr.b_pabd, buf->b_data,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
/* /*
@ -1829,7 +1823,7 @@ arc_decompress(arc_buf_t *buf)
} }
/* /*
* Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
*/ */
static uint64_t static uint64_t
arc_hdr_size(arc_buf_hdr_t *hdr) arc_hdr_size(arc_buf_hdr_t *hdr)
@ -1862,14 +1856,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) { if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_add_many(&state->arcs_esize[type], (void) refcount_add_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr); HDR_GET_LSIZE(hdr), hdr);
return; return;
} }
ASSERT(!GHOST_STATE(state)); ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pdata != NULL) { if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&state->arcs_esize[type], (void) refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr); arc_hdr_size(hdr), hdr);
} }
@ -1897,14 +1891,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) { if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type], (void) refcount_remove_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr); HDR_GET_LSIZE(hdr), hdr);
return; return;
} }
ASSERT(!GHOST_STATE(state)); ASSERT(!GHOST_STATE(state));
if (hdr->b_l1hdr.b_pdata != NULL) { if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_remove_many(&state->arcs_esize[type], (void) refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr); arc_hdr_size(hdr), hdr);
} }
@ -2051,7 +2045,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
old_state = hdr->b_l1hdr.b_state; old_state = hdr->b_l1hdr.b_state;
refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt; bufcnt = hdr->b_l1hdr.b_bufcnt;
update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
} else { } else {
old_state = arc_l2c_only; old_state = arc_l2c_only;
refcnt = 0; refcnt = 0;
@ -2120,7 +2114,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/ */
(void) refcount_add_many(&new_state->arcs_size, (void) refcount_add_many(&new_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr); HDR_GET_LSIZE(hdr), hdr);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
} else { } else {
arc_buf_t *buf; arc_buf_t *buf;
uint32_t buffers = 0; uint32_t buffers = 0;
@ -2150,7 +2144,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
} }
ASSERT3U(bufcnt, ==, buffers); ASSERT3U(bufcnt, ==, buffers);
if (hdr->b_l1hdr.b_pdata != NULL) { if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&new_state->arcs_size, (void) refcount_add_many(&new_state->arcs_size,
arc_hdr_size(hdr), hdr); arc_hdr_size(hdr), hdr);
} else { } else {
@ -2163,7 +2157,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) { if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt); ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
/* /*
* When moving a header off of a ghost state, * When moving a header off of a ghost state,
@ -2204,7 +2198,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf); buf);
} }
ASSERT3U(bufcnt, ==, buffers); ASSERT3U(bufcnt, ==, buffers);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
(void) refcount_remove_many( (void) refcount_remove_many(
&old_state->arcs_size, arc_hdr_size(hdr), hdr); &old_state->arcs_size, arc_hdr_size(hdr), hdr);
} }
@ -2302,7 +2296,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
/* /*
* Given a hdr and a buf, returns whether that buf can share its b_data buffer * Given a hdr and a buf, returns whether that buf can share its b_data buffer
* with the hdr's b_pdata. * with the hdr's b_pabd.
*/ */
static boolean_t static boolean_t
arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
@ -2397,17 +2391,20 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
* set the appropriate bit in the hdr's b_flags to indicate the hdr is * set the appropriate bit in the hdr's b_flags to indicate the hdr is
* sharing its data with the arc_buf_t. Otherwise, we * sharing its data with the arc_buf_t. Otherwise, we
* allocate a new buffer to store the buf's data. * allocate a new buffer to store the buf's data.
* *
* There is one additional restriction here because we're sharing * There are two additional restrictions here because we're sharing
* hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
* involved in an L2ARC write, because if this buf is used by an * actively involved in an L2ARC write, because if this buf is used by
* arc_write() then the hdr's data buffer will be released when the * an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it. * write completes, even though the L2ARC write might still be using it.
* Second, the hdr's ABD must be linear so that the buf's user doesn't
* need to be ABD-aware.
*/ */
can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
abd_is_linear(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */ /* Set up b_data and sharing */
if (can_share) { if (can_share) {
buf->b_data = hdr->b_l1hdr.b_pdata; buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
buf->b_flags |= ARC_BUF_FLAG_SHARED; buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else { } else {
@ -2492,11 +2489,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
} }
static void static void
l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
{ {
l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
df->l2df_data = data; df->l2df_abd = abd;
df->l2df_size = size; df->l2df_size = size;
df->l2df_type = type; df->l2df_type = type;
mutex_enter(&l2arc_free_on_write_mtx); mutex_enter(&l2arc_free_on_write_mtx);
@ -2521,7 +2518,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
} }
(void) refcount_remove_many(&state->arcs_size, size, hdr); (void) refcount_remove_many(&state->arcs_size, size, hdr);
l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
} }
/* /*
@ -2533,7 +2530,7 @@ static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{ {
ASSERT(arc_can_share(hdr, buf)); ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/* /*
@ -2542,7 +2539,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* the refcount whenever an arc_buf_t is shared. * the refcount whenever an arc_buf_t is shared.
*/ */
refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr);
hdr->b_l1hdr.b_pdata = buf->b_data; hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
HDR_ISTYPE_METADATA(hdr));
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
buf->b_flags |= ARC_BUF_FLAG_SHARED; buf->b_flags |= ARC_BUF_FLAG_SHARED;
@ -2560,7 +2559,7 @@ static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{ {
ASSERT(arc_buf_is_shared(buf)); ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/* /*
@ -2569,7 +2568,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/ */
refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
hdr->b_l1hdr.b_pdata = NULL; abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
abd_put(hdr->b_l1hdr.b_pabd);
hdr->b_l1hdr.b_pabd = NULL;
buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/* /*
@ -2665,7 +2666,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/* /*
* If the current arc_buf_t is sharing its data buffer with the * If the current arc_buf_t is sharing its data buffer with the
* hdr, then reassign the hdr's b_pdata to share it with the new * hdr, then reassign the hdr's b_pabd to share it with the new
* buffer at the end of the list. The shared buffer is always * buffer at the end of the list. The shared buffer is always
* the last one on the hdr's buffer list. * the last one on the hdr's buffer list.
* *
@ -2680,8 +2681,8 @@ arc_buf_destroy_impl(arc_buf_t *buf)
/* hdr is uncompressed so can't have compressed buf */ /* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
arc_hdr_free_pdata(hdr); arc_hdr_free_pabd(hdr);
/* /*
* We must setup a new shared block between the * We must setup a new shared block between the
@ -2714,26 +2715,26 @@ arc_buf_destroy_impl(arc_buf_t *buf)
} }
static void static void
arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{ {
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
} }
static void static void
arc_hdr_free_pdata(arc_buf_hdr_t *hdr) arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{ {
ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
/* /*
* If the hdr is currently being written to the l2arc then * If the hdr is currently being written to the l2arc then
@ -2745,10 +2746,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
arc_hdr_free_on_write(hdr); arc_hdr_free_on_write(hdr);
ARCSTAT_BUMP(arcstat_l2_free_on_write); ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else { } else {
arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr); arc_hdr_size(hdr), hdr);
} }
hdr->b_l1hdr.b_pdata = NULL; hdr->b_l1hdr.b_pabd = NULL;
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
@ -2784,7 +2785,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
* the compressed or uncompressed data depending on the block * the compressed or uncompressed data depending on the block
* it references and compressed arc enablement. * it references and compressed arc enablement.
*/ */
arc_hdr_alloc_pdata(hdr); arc_hdr_alloc_pabd(hdr);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr); return (hdr);
@ -2824,7 +2825,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
nhdr->b_l1hdr.b_state = arc_l2c_only; nhdr->b_l1hdr.b_state = arc_l2c_only;
/* Verify previous threads set to NULL before freeing */ /* Verify previous threads set to NULL before freeing */
ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
} else { } else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT0(hdr->b_l1hdr.b_bufcnt);
@ -2842,11 +2843,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
/* /*
* A buffer must not be moved into the arc_l2c_only * A buffer must not be moved into the arc_l2c_only
* state if it's not finished being written out to the * state if it's not finished being written out to the
* l2arc device. Otherwise, the b_l1hdr.b_pdata field * l2arc device. Otherwise, the b_l1hdr.b_pabd field
* might try to be accessed, even though it was removed. * might try to be accessed, even though it was removed.
*/ */
VERIFY(!HDR_L2_WRITING(hdr)); VERIFY(!HDR_L2_WRITING(hdr));
VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
} }
@ -2931,6 +2932,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_thaw(buf); arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
if (!arc_buf_is_shared(buf)) {
/*
* To ensure that the hdr has the correct data in it if we call
* arc_decompress() on this buf before it's been written to
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
arc_hdr_free_pabd(hdr);
arc_share_buf(hdr, buf);
}
return (buf); return (buf);
} }
@ -2999,9 +3012,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
while (hdr->b_l1hdr.b_buf != NULL) while (hdr->b_l1hdr.b_buf != NULL)
arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
if (hdr->b_l1hdr.b_pdata != NULL) { if (hdr->b_l1hdr.b_pabd != NULL)
arc_hdr_free_pdata(hdr); arc_hdr_free_pabd(hdr);
}
} }
ASSERT3P(hdr->b_hash_next, ==, NULL); ASSERT3P(hdr->b_hash_next, ==, NULL);
@ -3068,7 +3080,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/* /*
* l2arc_write_buffers() relies on a header's L1 portion * l2arc_write_buffers() relies on a header's L1 portion
* (i.e. its b_pdata field) during its write phase. * (i.e. its b_pabd field) during its write phase.
* Thus, we cannot push a header onto the arc_l2c_only * Thus, we cannot push a header onto the arc_l2c_only
* state (removing its L1 piece) until the header is * state (removing its L1 piece) until the header is
* done being written to the l2arc. * done being written to the l2arc.
@ -3084,7 +3096,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
if (HDR_HAS_L2HDR(hdr)) { if (HDR_HAS_L2HDR(hdr)) {
ASSERT(hdr->b_l1hdr.b_pdata == NULL); ASSERT(hdr->b_l1hdr.b_pabd == NULL);
/* /*
* This buffer is cached on the 2nd Level ARC; * This buffer is cached on the 2nd Level ARC;
* don't destroy the header. * don't destroy the header.
@ -3149,9 +3161,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If this hdr is being evicted and has a compressed * If this hdr is being evicted and has a compressed
* buffer then we discard it here before we change states. * buffer then we discard it here before we change states.
* This ensures that the accounting is updated correctly * This ensures that the accounting is updated correctly
* in arc_free_data_buf(). * in arc_free_data_impl().
*/ */
arc_hdr_free_pdata(hdr); arc_hdr_free_pabd(hdr);
arc_change_state(evicted_state, hdr, hash_lock); arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr)); ASSERT(HDR_IN_HASH_TABLE(hdr));
@ -3249,7 +3261,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* thread. If we used cv_broadcast, we could * thread. If we used cv_broadcast, we could
* wake up "too many" threads causing arc_size * wake up "too many" threads causing arc_size
* to significantly overflow arc_c; since * to significantly overflow arc_c; since
* arc_get_data_buf() doesn't check for overflow * arc_get_data_impl() doesn't check for overflow
* when it's woken up (it doesn't because it's * when it's woken up (it doesn't because it's
* possible for the ARC to be overflowing while * possible for the ARC to be overflowing while
* full of un-evictable buffers, and the * full of un-evictable buffers, and the
@ -4154,13 +4166,13 @@ arc_kmem_reap_now(void)
} }
/* /*
* Threads can block in arc_get_data_buf() waiting for this thread to evict * Threads can block in arc_get_data_impl() waiting for this thread to evict
* enough data and signal them to proceed. When this happens, the threads in * enough data and signal them to proceed. When this happens, the threads in
* arc_get_data_buf() are sleeping while holding the hash lock for their * arc_get_data_impl() are sleeping while holding the hash lock for their
* particular arc header. Thus, we must be careful to never sleep on a * particular arc header. Thus, we must be careful to never sleep on a
* hash lock in this thread. This is to prevent the following deadlock: * hash lock in this thread. This is to prevent the following deadlock:
* *
* - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
* waiting for the reclaim thread to signal it. * waiting for the reclaim thread to signal it.
* *
* - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
@ -4509,16 +4521,43 @@ arc_is_overflowing(void)
return (arc_size >= arc_c + overflow); return (arc_size >= arc_c + overflow);
} }
/*
 * Account for `size` bytes against the ARC on behalf of hdr (through
 * arc_get_data_impl()) and then allocate an ABD of that size.  The hdr's
 * buffer type decides whether the ABD is allocated as metadata or as data.
 */
static abd_t *
arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	/* Read the content type before the accounting call is made. */
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_get_data_impl(hdr, size, tag);

	if (type != ARC_BUFC_METADATA) {
		/* Anything that is not metadata is expected to be data. */
		ASSERT(type == ARC_BUFC_DATA);
		return (abd_alloc(size, B_FALSE));
	}

	return (abd_alloc(size, B_TRUE));
}
/*
 * Raw-buffer counterpart of arc_get_data_abd(): account for `size` bytes
 * against the ARC through arc_get_data_impl(), then hand back a buffer from
 * zio_buf_alloc() for metadata or zio_data_buf_alloc() for data.
 */
static void *
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	/* Read the content type before the accounting call is made. */
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_get_data_impl(hdr, size, tag);

	if (type != ARC_BUFC_METADATA) {
		/* Anything that is not metadata is expected to be data. */
		ASSERT(type == ARC_BUFC_DATA);
		return (zio_data_buf_alloc(size));
	}

	return (zio_buf_alloc(size));
}
/* /*
* Allocate a block and return it to the caller. If we are hitting the * Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction * hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard * thread to catch up. If we're past the target size but below the hard
* limit, we'll only signal the reclaim thread and continue on. * limit, we'll only signal the reclaim thread and continue on.
*/ */
static void * static void
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{ {
void *datap = NULL;
arc_state_t *state = hdr->b_l1hdr.b_state; arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr); arc_buf_contents_t type = arc_buf_type(hdr);
@ -4562,11 +4601,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type); VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {
datap = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_META); arc_space_consume(size, ARC_SPACE_META);
} else { } else {
ASSERT(type == ARC_BUFC_DATA);
datap = zio_data_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA); arc_space_consume(size, ARC_SPACE_DATA);
} }
@ -4602,14 +4638,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
refcount_count(&arc_mru->arcs_size) > arc_p)) refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size); arc_p = MIN(arc_c, arc_p + size);
} }
return (datap); }
/*
 * Release an ABD belonging to hdr: the accounting for `size` bytes is
 * returned first through arc_free_data_impl(), after which the ABD itself
 * is freed.
 */
static void
arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
{
	/* Accounting is undone before the memory is given back. */
	arc_free_data_impl(hdr, size, tag);

	abd_free(abd);
}
/*
 * Release a raw data buffer belonging to hdr.  The accounting for `size`
 * bytes is returned through arc_free_data_impl(), and the buffer is then
 * freed with zio_buf_free() for metadata or zio_data_buf_free() for data.
 */
static void
arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
{
/* Read the content type before the accounting call is made. */
arc_buf_contents_t type = arc_buf_type(hdr);
arc_free_data_impl(hdr, size, tag);
/* Metadata and data buffers are released through different zio calls. */
if (type == ARC_BUFC_METADATA) {
zio_buf_free(buf, size);
} else {
/* Anything that is not metadata is expected to be data. */
ASSERT(type == ARC_BUFC_DATA);
zio_data_buf_free(buf, size);
}
} }
/* /*
* Free the arc data buffer. * Free the arc data buffer.
*/ */
static void static void
arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{ {
arc_state_t *state = hdr->b_l1hdr.b_state; arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr); arc_buf_contents_t type = arc_buf_type(hdr);
@ -4626,11 +4682,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type); VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) { if (type == ARC_BUFC_METADATA) {
zio_buf_free(data, size);
arc_space_return(size, ARC_SPACE_META); arc_space_return(size, ARC_SPACE_META);
} else { } else {
ASSERT(type == ARC_BUFC_DATA); ASSERT(type == ARC_BUFC_DATA);
zio_data_buf_free(data, size);
arc_space_return(size, ARC_SPACE_DATA); arc_space_return(size, ARC_SPACE_DATA);
} }
} }
@ -4912,7 +4966,7 @@ arc_read_done(zio_t *zio)
if (callback_cnt == 0) { if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr)); ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
} }
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
@ -5009,7 +5063,7 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock); hdr = buf_hash_find(guid, bp, &hash_lock);
} }
if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
arc_buf_t *buf = NULL; arc_buf_t *buf = NULL;
*arc_flags |= ARC_FLAG_CACHED; *arc_flags |= ARC_FLAG_CACHED;
@ -5161,7 +5215,7 @@ top:
hdr_full_cache); hdr_full_cache);
} }
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@ -5179,9 +5233,9 @@ top:
* avoid hitting an assert in remove_reference(). * avoid hitting an assert in remove_reference().
*/ */
arc_access(hdr, hash_lock); arc_access(hdr, hash_lock);
arc_hdr_alloc_pdata(hdr); arc_hdr_alloc_pabd(hdr);
} }
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
size = arc_hdr_size(hdr); size = arc_hdr_size(hdr);
/* /*
@ -5285,7 +5339,7 @@ top:
ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
ZIO_COMPRESS_EMPTY); ZIO_COMPRESS_EMPTY);
rzio = zio_read_phys(pio, vd, addr, rzio = zio_read_phys(pio, vd, addr,
size, hdr->b_l1hdr.b_pdata, size, hdr->b_l1hdr.b_pabd,
ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority, l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE | zio_flags | ZIO_FLAG_DONT_CACHE |
@ -5325,7 +5379,7 @@ top:
} }
} }
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb); arc_read_done, hdr, priority, zio_flags, zb);
if (*arc_flags & ARC_FLAG_WAIT) { if (*arc_flags & ARC_FLAG_WAIT) {
@ -5557,16 +5611,17 @@ arc_release(arc_buf_t *buf, void *tag)
arc_unshare_buf(hdr, buf); arc_unshare_buf(hdr, buf);
/* /*
* Now we need to recreate the hdr's b_pdata. Since we * Now we need to recreate the hdr's b_pabd. Since we
* have lastbuf handy, we try to share with it, but if * have lastbuf handy, we try to share with it, but if
* we can't then we allocate a new b_pdata and copy the * we can't then we allocate a new b_pabd and copy the
* data from buf into it. * data from buf into it.
*/ */
if (arc_can_share(hdr, lastbuf)) { if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf); arc_share_buf(hdr, lastbuf);
} else { } else {
arc_hdr_alloc_pdata(hdr); arc_hdr_alloc_pabd(hdr);
bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
buf->b_data, psize);
} }
VERIFY3P(lastbuf->b_data, !=, NULL); VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) { } else if (HDR_SHARED_DATA(hdr)) {
@ -5582,7 +5637,7 @@ arc_release(arc_buf_t *buf, void *tag)
HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf)); ASSERT(!ARC_BUF_SHARED(buf));
} }
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only); ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size, (void) refcount_remove_many(&state->arcs_size,
@ -5601,7 +5656,7 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(hash_lock); mutex_exit(hash_lock);
/* /*
* Allocate a new hdr. The new hdr will contain a b_pdata * Allocate a new hdr. The new hdr will contain a b_pabd
* buffer which will be freed in arc_write(). * buffer which will be freed in arc_write().
*/ */
nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
@ -5677,6 +5732,7 @@ arc_write_ready(zio_t *zio)
arc_buf_hdr_t *hdr = buf->b_hdr; arc_buf_hdr_t *hdr = buf->b_hdr;
uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
enum zio_compress compress; enum zio_compress compress;
fstrans_cookie_t cookie = spl_fstrans_mark();
ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
@ -5690,15 +5746,15 @@ arc_write_ready(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_REEXECUTED) { if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
arc_cksum_free(hdr); arc_cksum_free(hdr);
arc_buf_unwatch(buf); arc_buf_unwatch(buf);
if (hdr->b_l1hdr.b_pdata != NULL) { if (hdr->b_l1hdr.b_pabd != NULL) {
if (arc_buf_is_shared(buf)) { if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf); arc_unshare_buf(hdr, buf);
} else { } else {
arc_hdr_free_pdata(hdr); arc_hdr_free_pabd(hdr);
} }
} }
} }
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT(!arc_buf_is_shared(buf)); ASSERT(!arc_buf_is_shared(buf));
@ -5720,33 +5776,47 @@ arc_write_ready(zio_t *zio)
arc_hdr_set_compress(hdr, compress); arc_hdr_set_compress(hdr, compress);
/* /*
* If the hdr is compressed, then copy the compressed * Fill the hdr with data. If the hdr is compressed, the data we want
* zio contents into arc_buf_hdr_t. Otherwise, copy the original * is available from the zio, otherwise we can take it from the buf.
* data buf into the hdr. Ideally, we would like to always copy the *
* io_data into b_pdata but the user may have disabled compressed * We might be able to share the buf's data with the hdr here. However,
* arc thus the on-disk block may or may not match what we maintain * doing so would cause the ARC to be full of linear ABDs if we write a
* in the hdr's b_pdata field. * lot of shareable data. As a compromise, we check whether scattered
* ABDs are allowed, and assume that if they are then the user wants
* the ARC to be primarily filled with them regardless of the data being
* written. Therefore, if they're allowed then we allocate one and copy
* the data into it; otherwise, we share the data directly if we can.
*/ */
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
!ARC_BUF_COMPRESSED(buf)) { arc_hdr_alloc_pabd(hdr);
ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
/*
* Ideally, we would always copy the io_abd into b_pabd, but the
* user may have disabled compressed ARC, thus we must check the
* hdr's compression setting rather than the io_bp's.
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
ZIO_COMPRESS_OFF);
ASSERT3U(psize, >, 0); ASSERT3U(psize, >, 0);
arc_hdr_alloc_pdata(hdr);
bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else { } else {
ASSERT3P(buf->b_data, ==, zio->io_orig_data); ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
} else {
ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
/*
* This hdr is not compressed so we're able to share
* the arc_buf_t data buffer with the hdr.
*/
arc_share_buf(hdr, buf); arc_share_buf(hdr, buf);
ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
HDR_GET_LSIZE(hdr)));
} }
arc_hdr_verify(hdr, zio->io_bp); arc_hdr_verify(hdr, zio->io_bp);
spl_fstrans_unmark(cookie);
} }
static void static void
@ -5850,6 +5920,7 @@ arc_write_done(zio_t *zio)
ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private); callback->awcb_done(zio, buf, callback->awcb_private);
abd_put(zio->io_abd);
kmem_free(callback, sizeof (arc_write_callback_t)); kmem_free(callback, sizeof (arc_write_callback_t));
} }
@ -5886,10 +5957,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
callback->awcb_buf = buf; callback->awcb_buf = buf;
/* /*
* The hdr's b_pdata is now stale, free it now. A new data block * The hdr's b_pabd is now stale, free it now. A new data block
* will be allocated when the zio pipeline calls arc_write_ready(). * will be allocated when the zio pipeline calls arc_write_ready().
*/ */
if (hdr->b_l1hdr.b_pdata != NULL) { if (hdr->b_l1hdr.b_pabd != NULL) {
/* /*
* If the buf is currently sharing the data block with * If the buf is currently sharing the data block with
* the hdr then we need to break that relationship here. * the hdr then we need to break that relationship here.
@ -5899,15 +5970,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
if (arc_buf_is_shared(buf)) { if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf); arc_unshare_buf(hdr, buf);
} else { } else {
arc_hdr_free_pdata(hdr); arc_hdr_free_pabd(hdr);
} }
VERIFY3P(buf->b_data, !=, NULL); VERIFY3P(buf->b_data, !=, NULL);
arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
} }
ASSERT(!arc_buf_is_shared(buf)); ASSERT(!arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
zio = zio_write(pio, spa, txg, bp, buf->b_data, zio = zio_write(pio, spa, txg, bp,
abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp,
arc_write_ready, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL, (children_ready != NULL) ? arc_write_children_ready : NULL,
@ -6768,13 +6840,8 @@ l2arc_do_free_on_write(void)
for (df = list_tail(buflist); df; df = df_prev) { for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df); df_prev = list_prev(buflist, df);
ASSERT3P(df->l2df_data, !=, NULL); ASSERT3P(df->l2df_abd, !=, NULL);
if (df->l2df_type == ARC_BUFC_METADATA) { abd_free(df->l2df_abd);
zio_buf_free(df->l2df_data, df->l2df_size);
} else {
ASSERT(df->l2df_type == ARC_BUFC_DATA);
zio_data_buf_free(df->l2df_data, df->l2df_size);
}
list_remove(buflist, df); list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t)); kmem_free(df, sizeof (l2arc_data_free_t));
} }
@ -6928,12 +6995,12 @@ l2arc_read_done(zio_t *zio)
mutex_enter(hash_lock); mutex_enter(hash_lock);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT3P(zio->io_data, !=, NULL); ASSERT3P(zio->io_abd, !=, NULL);
/* /*
* Check this survived the L2ARC journey. * Check this survived the L2ARC journey.
*/ */
ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
@ -6967,7 +7034,7 @@ l2arc_read_done(zio_t *zio)
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags, hdr, zio->io_priority, cb->l2rcb_flags,
&cb->l2rcb_zb)); &cb->l2rcb_zb));
} }
@ -7191,7 +7258,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
for (; hdr; hdr = hdr_prev) { for (; hdr; hdr = hdr_prev) {
kmutex_t *hash_lock; kmutex_t *hash_lock;
uint64_t asize, size; uint64_t asize, size;
void *to_write; abd_t *to_write;
if (arc_warm == B_FALSE) if (arc_warm == B_FALSE)
hdr_prev = multilist_sublist_next(mls, hdr); hdr_prev = multilist_sublist_next(mls, hdr);
@ -7264,7 +7331,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0);
size = arc_hdr_size(hdr); size = arc_hdr_size(hdr);
@ -7280,18 +7347,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* add it to the l2arc_free_on_write queue. * add it to the l2arc_free_on_write queue.
*/ */
if (!HDR_SHARED_DATA(hdr)) { if (!HDR_SHARED_DATA(hdr)) {
to_write = hdr->b_l1hdr.b_pdata; to_write = hdr->b_l1hdr.b_pabd;
} else { } else {
arc_buf_contents_t type = arc_buf_type(hdr); to_write = abd_alloc_for_io(size,
if (type == ARC_BUFC_METADATA) { HDR_ISTYPE_METADATA(hdr));
to_write = zio_buf_alloc(size); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
} else { l2arc_free_abd_on_write(to_write, size,
ASSERT3U(type, ==, ARC_BUFC_DATA); arc_buf_type(hdr));
to_write = zio_data_buf_alloc(size);
}
bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
l2arc_free_data_on_write(to_write, size, type);
} }
wzio = zio_write_phys(pio, dev->l2ad_vdev, wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, size, to_write, hdr->b_l2hdr.b_daddr, size, to_write,

View File

@ -14,7 +14,7 @@
*/ */
/* /*
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>

View File

@ -46,6 +46,7 @@
#include <sys/range_tree.h> #include <sys/range_tree.h>
#include <sys/trace_dbuf.h> #include <sys/trace_dbuf.h>
#include <sys/callb.h> #include <sys/callb.h>
#include <sys/abd.h>
struct dbuf_hold_impl_data { struct dbuf_hold_impl_data {
/* Function arguments */ /* Function arguments */
@ -3709,6 +3710,9 @@ dbuf_write_override_done(zio_t *zio)
mutex_exit(&db->db_mtx); mutex_exit(&db->db_mtx);
dbuf_write_done(zio, NULL, db); dbuf_write_done(zio, NULL, db);
if (zio->io_abd != NULL)
abd_put(zio->io_abd);
} }
/* Issue I/O to commit a dirty buffer to disk. */ /* Issue I/O to commit a dirty buffer to disk. */
@ -3801,7 +3805,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* The BP for this block has been provided by open context * The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()). * (by dmu_sync() or dmu_buf_write_embedded()).
*/ */
void *contents = (data != NULL) ? data->b_data : NULL; abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg, dr->dr_zio = zio_write(zio, os->os_spa, txg,
&dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size,

View File

@ -21,7 +21,7 @@
/* /*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -36,6 +36,7 @@
#include <sys/zio_checksum.h> #include <sys/zio_checksum.h>
#include <sys/zio_compress.h> #include <sys/zio_compress.h>
#include <sys/dsl_scan.h> #include <sys/dsl_scan.h>
#include <sys/abd.h>
static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache; static kmem_cache_t *ddt_entry_cache;
@ -706,9 +707,8 @@ ddt_free(ddt_entry_t *dde)
for (p = 0; p < DDT_PHYS_TYPES; p++) for (p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT(dde->dde_lead_zio[p] == NULL); ASSERT(dde->dde_lead_zio[p] == NULL);
if (dde->dde_repair_data != NULL) if (dde->dde_repair_abd != NULL)
zio_buf_free(dde->dde_repair_data, abd_free(dde->dde_repair_abd);
DDK_GET_PSIZE(&dde->dde_key));
cv_destroy(&dde->dde_cv); cv_destroy(&dde->dde_cv);
kmem_cache_free(ddt_entry_cache, dde); kmem_cache_free(ddt_entry_cache, dde);
@ -1002,7 +1002,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
ddt_enter(ddt); ddt_enter(ddt);
if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where); avl_insert(&ddt->ddt_repair_tree, dde, where);
else else
@ -1040,7 +1040,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
continue; continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
} }

View File

@ -47,6 +47,7 @@
#include <sys/zio_compress.h> #include <sys/zio_compress.h>
#include <sys/sa.h> #include <sys/sa.h>
#include <sys/zfeature.h> #include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL #ifdef _KERNEL
#include <sys/vmsystm.h> #include <sys/vmsystm.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
@ -1513,6 +1514,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
dsa->dsa_done(dsa->dsa_zgd, zio->io_error); dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
abd_put(zio->io_abd);
kmem_free(dsa, sizeof (*dsa)); kmem_free(dsa, sizeof (*dsa));
} }
@ -1537,11 +1539,11 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
dsa->dsa_zgd = zgd; dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx; dsa->dsa_tx = tx;
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
ZIO_FLAG_CANFAIL, zb)); dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0); return (0);
} }
@ -2062,6 +2064,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
void void
dmu_init(void) dmu_init(void)
{ {
abd_init();
zfs_dbgmsg_init(); zfs_dbgmsg_init();
sa_cache_init(); sa_cache_init();
xuio_stat_init(); xuio_stat_init();
@ -2087,6 +2090,7 @@ dmu_fini(void)
xuio_stat_fini(); xuio_stat_fini();
sa_cache_fini(); sa_cache_fini();
zfs_dbgmsg_fini(); zfs_dbgmsg_fini();
abd_fini();
} }
#if defined(_KERNEL) && defined(HAVE_SPL) #if defined(_KERNEL) && defined(HAVE_SPL)

View File

@ -166,7 +166,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{ {
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
fletcher_4_incremental_native(dsp->dsa_drr, (void) fletcher_4_incremental_native(dsp->dsa_drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
&dsp->dsa_zc); &dsp->dsa_zc);
if (dsp->dsa_drr->drr_type == DRR_BEGIN) { if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
@ -179,13 +179,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
if (dsp->dsa_drr->drr_type == DRR_END) { if (dsp->dsa_drr->drr_type == DRR_END) {
dsp->dsa_sent_end = B_TRUE; dsp->dsa_sent_end = B_TRUE;
} }
fletcher_4_incremental_native(&dsp->dsa_drr-> (void) fletcher_4_incremental_native(&dsp->dsa_drr->
drr_u.drr_checksum.drr_checksum, drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), &dsp->dsa_zc); sizeof (zio_cksum_t), &dsp->dsa_zc);
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));
if (payload_len != 0) { if (payload_len != 0) {
fletcher_4_incremental_native(payload, payload_len, (void) fletcher_4_incremental_native(payload, payload_len,
&dsp->dsa_zc); &dsp->dsa_zc);
if (dump_bytes(dsp, payload, payload_len) != 0) if (dump_bytes(dsp, payload, payload_len) != 0)
return (SET_ERROR(EINTR)); return (SET_ERROR(EINTR));
@ -1786,11 +1786,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE; drc->drc_byteswap = B_TRUE;
fletcher_4_incremental_byteswap(drr_begin, (void) fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum); sizeof (dmu_replay_record_t), &drc->drc_cksum);
byteswap_record(drr_begin); byteswap_record(drr_begin);
} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
fletcher_4_incremental_native(drr_begin, (void) fletcher_4_incremental_native(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum); sizeof (dmu_replay_record_t), &drc->drc_cksum);
} else { } else {
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
@ -2470,9 +2470,9 @@ static void
receive_cksum(struct receive_arg *ra, int len, void *buf) receive_cksum(struct receive_arg *ra, int len, void *buf)
{ {
if (ra->byteswap) { if (ra->byteswap) {
fletcher_4_incremental_byteswap(buf, len, &ra->cksum); (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
} else { } else {
fletcher_4_incremental_native(buf, len, &ra->cksum); (void) fletcher_4_incremental_native(buf, len, &ra->cksum);
} }
} }

View File

@ -20,7 +20,7 @@
*/ */
/* /*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills * Copyright 2016 Gary Mills
*/ */
@ -47,6 +47,7 @@
#include <sys/sa.h> #include <sys/sa.h>
#include <sys/sa_impl.h> #include <sys/sa_impl.h>
#include <sys/zfeature.h> #include <sys/zfeature.h>
#include <sys/abd.h>
#ifdef _KERNEL #ifdef _KERNEL
#include <sys/zfs_vfsops.h> #include <sys/zfs_vfsops.h>
#endif #endif
@ -1820,7 +1821,7 @@ dsl_scan_scrub_done(zio_t *zio)
{ {
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio_data_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--; spa->spa_scrub_inflight--;
@ -1904,7 +1905,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (needs_io && !zfs_no_scrub_io) { if (needs_io && !zfs_no_scrub_io) {
vdev_t *rvd = spa->spa_root_vdev; vdev_t *rvd = spa->spa_root_vdev;
uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= maxinflight) while (spa->spa_scrub_inflight >= maxinflight)
@ -1919,9 +1919,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
delay(scan_delay); delay(scan_delay);
zio_nowait(zio_read(NULL, spa, bp, data, size, zio_nowait(zio_read(NULL, spa, bp,
dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
zio_flags, zb)); NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
} }
/* do not relocate this block */ /* do not relocate this block */

View File

@ -22,20 +22,32 @@
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/*
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/edonr.h> #include <sys/edonr.h>
#include <sys/zfs_context.h> /* For CTASSERT() */ #include <sys/zfs_context.h> /* For CTASSERT() */
#include <sys/abd.h>
#define EDONR_MODE 512 #define EDONR_MODE 512
#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE #define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
static int
edonr_incremental(void *buf, size_t size, void *arg)
{
EdonRState *ctx = arg;
EdonRUpdate(ctx, buf, size * 8);
return (0);
}
/* /*
* Native zio_checksum interface for the Edon-R hash function. * Native zio_checksum interface for the Edon-R hash function.
*/ */
/*ARGSUSED*/ /*ARGSUSED*/
void void
zio_checksum_edonr_native(const void *buf, uint64_t size, abd_checksum_edonr_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
uint8_t digest[EDONR_MODE / 8]; uint8_t digest[EDONR_MODE / 8];
@ -43,7 +55,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
ASSERT(ctx_template != NULL); ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx)); bcopy(ctx_template, &ctx, sizeof (ctx));
EdonRUpdate(&ctx, buf, size * 8); (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
EdonRFinal(&ctx, digest); EdonRFinal(&ctx, digest);
bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
} }
@ -52,12 +64,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
* Byteswapped zio_checksum interface for the Edon-R hash function. * Byteswapped zio_checksum interface for the Edon-R hash function.
*/ */
void void
zio_checksum_edonr_byteswap(const void *buf, uint64_t size, abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
zio_cksum_t tmp; zio_cksum_t tmp;
zio_checksum_edonr_native(buf, size, ctx_template, &tmp); abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
@ -65,7 +77,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
} }
void * void *
zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
{ {
EdonRState *ctx; EdonRState *ctx;
uint8_t salt_block[EDONR_BLOCK_SIZE]; uint8_t salt_block[EDONR_BLOCK_SIZE];
@ -94,7 +106,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
} }
void void
zio_checksum_edonr_tmpl_free(void *ctx_template) abd_checksum_edonr_tmpl_free(void *ctx_template)
{ {
EdonRState *ctx = ctx_template; EdonRState *ctx = ctx_template;

View File

@ -24,30 +24,39 @@
*/ */
/* /*
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/sha2.h> #include <sys/sha2.h>
#include <sys/abd.h>
/*
 * Per-chunk callback handed to abd_iterate_func() by the SHA-2 checksum
 * routines: feeds one chunk of the buffer into the running SHA2 context.
 *
 *	buf	start of the current chunk
 *	size	length of the chunk, as supplied by the iterator
 *	arg	the caller's SHA2_CTX, passed through opaquely
 *
 * Always returns 0.
 */
static int
sha_incremental(void *buf, size_t size, void *arg)
{
	SHA2_CTX *sha = arg;

	SHA2Update(sha, buf, size);

	return (0);
}
/*ARGSUSED*/ /*ARGSUSED*/
void void
zio_checksum_SHA256(const void *buf, uint64_t size, abd_checksum_SHA256(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
SHA2_CTX ctx; SHA2_CTX ctx;
zio_cksum_t tmp; zio_cksum_t tmp;
SHA2Init(SHA256, &ctx); SHA2Init(SHA256, &ctx);
SHA2Update(&ctx, buf, size); (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
SHA2Final(&tmp, &ctx); SHA2Final(&tmp, &ctx);
/* /*
* A prior implementation of this function had a * A prior implementation of this function had a
* private SHA256 implementation always wrote things out in * private SHA256 implementation always wrote things out in
* Big Endian and there wasn't a byteswap variant of it. * Big Endian and there wasn't a byteswap variant of it.
* To preseve on disk compatibility we need to force that * To preserve on disk compatibility we need to force that
* behaviour. * behavior.
*/ */
zcp->zc_word[0] = BE_64(tmp.zc_word[0]); zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
zcp->zc_word[1] = BE_64(tmp.zc_word[1]); zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
@ -57,24 +66,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size,
/*ARGSUSED*/ /*ARGSUSED*/
void void
zio_checksum_SHA512_native(const void *buf, uint64_t size, abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
SHA2_CTX ctx; SHA2_CTX ctx;
SHA2Init(SHA512_256, &ctx); SHA2Init(SHA512_256, &ctx);
SHA2Update(&ctx, buf, size); (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
SHA2Final(zcp, &ctx); SHA2Final(zcp, &ctx);
} }
/*ARGSUSED*/ /*ARGSUSED*/
void void
zio_checksum_SHA512_byteswap(const void *buf, uint64_t size, abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
zio_cksum_t tmp; zio_cksum_t tmp;
zio_checksum_SHA512_native(buf, size, ctx_template, &tmp); abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);

View File

@ -20,42 +20,52 @@
*/ */
/* /*
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/skein.h> #include <sys/skein.h>
#include <sys/abd.h>
/*
 * Per-chunk callback handed to abd_iterate_func() by the Skein checksum
 * code: feeds one chunk of the buffer into the running Skein-512 context.
 *
 *	buf	start of the current chunk
 *	size	length of the chunk, as supplied by the iterator
 *	arg	the caller's Skein_512_Ctxt_t, passed through opaquely
 *
 * The status returned by Skein_512_Update() is deliberately discarded;
 * this callback always returns 0.
 */
static int
skein_incremental(void *buf, size_t size, void *arg)
{
	Skein_512_Ctxt_t *skein = arg;

	(void) Skein_512_Update(skein, buf, size);

	return (0);
}
/* /*
* Computes a native 256-bit skein MAC checksum. Please note that this * Computes a native 256-bit skein MAC checksum. Please note that this
* function requires the presence of a ctx_template that should be allocated * function requires the presence of a ctx_template that should be allocated
* using zio_checksum_skein_tmpl_init. * using abd_checksum_skein_tmpl_init.
*/ */
/*ARGSUSED*/ /*ARGSUSED*/
void void
zio_checksum_skein_native(const void *buf, uint64_t size, abd_checksum_skein_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
Skein_512_Ctxt_t ctx; Skein_512_Ctxt_t ctx;
ASSERT(ctx_template != NULL); ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx)); bcopy(ctx_template, &ctx, sizeof (ctx));
(void) Skein_512_Update(&ctx, buf, size); (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
(void) Skein_512_Final(&ctx, (uint8_t *)zcp); (void) Skein_512_Final(&ctx, (uint8_t *)zcp);
bzero(&ctx, sizeof (ctx)); bzero(&ctx, sizeof (ctx));
} }
/* /*
* Byteswapped version of zio_checksum_skein_native. This just invokes * Byteswapped version of abd_checksum_skein_native. This just invokes
* the native checksum function and byteswaps the resulting checksum (since * the native checksum function and byteswaps the resulting checksum (since
* skein is internally endian-insensitive). * skein is internally endian-insensitive).
*/ */
void void
zio_checksum_skein_byteswap(const void *buf, uint64_t size, abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
zio_cksum_t tmp; zio_cksum_t tmp;
zio_checksum_skein_native(buf, size, ctx_template, &tmp); abd_checksum_skein_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
@ -67,7 +77,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size,
* computations and returns a pointer to it. * computations and returns a pointer to it.
*/ */
void * void *
zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
{ {
Skein_512_Ctxt_t *ctx; Skein_512_Ctxt_t *ctx;
@ -82,7 +92,7 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
* zio_checksum_skein_tmpl_init. * abd_checksum_skein_tmpl_init.
*/ */
void void
zio_checksum_skein_tmpl_free(void *ctx_template) abd_checksum_skein_tmpl_free(void *ctx_template)
{ {
Skein_512_Ctxt_t *ctx = ctx_template; Skein_512_Ctxt_t *ctx = ctx_template;

View File

@ -1963,6 +1963,7 @@ spa_load_verify_done(zio_t *zio)
int error = zio->io_error; int error = zio->io_error;
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
abd_free(zio->io_abd);
if (error) { if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
type != DMU_OT_INTENT_LOG) type != DMU_OT_INTENT_LOG)
@ -1970,7 +1971,6 @@ spa_load_verify_done(zio_t *zio)
else else
atomic_inc_64(&sle->sle_data_count); atomic_inc_64(&sle->sle_data_count);
} }
zio_data_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--; spa->spa_scrub_inflight--;
@ -1993,7 +1993,6 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
{ {
zio_t *rio; zio_t *rio;
size_t size; size_t size;
void *data;
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0); return (0);
@ -2004,12 +2003,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/ */
if (!spa_load_verify_metadata) if (!spa_load_verify_metadata)
return (0); return (0);
if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
return (0); return (0);
rio = arg; rio = arg;
size = BP_GET_PSIZE(bp); size = BP_GET_PSIZE(bp);
data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
@ -2017,7 +2015,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++; spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock); mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(rio, spa, bp, data, size, zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));

View File

@ -43,6 +43,7 @@
#include <sys/arc.h> #include <sys/arc.h>
#include <sys/zil.h> #include <sys/zil.h>
#include <sys/dsl_scan.h> #include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/zvol.h> #include <sys/zvol.h>
#include <sys/zfs_ratelimit.h> #include <sys/zfs_ratelimit.h>
@ -999,16 +1000,16 @@ vdev_probe_done(zio_t *zio)
vps->vps_readable = 1; vps->vps_readable = 1;
if (zio->io_error == 0 && spa_writeable(spa)) { if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
zio->io_offset, zio->io_size, zio->io_data, zio->io_offset, zio->io_size, zio->io_abd,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
} else { } else {
zio_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
} }
} else if (zio->io_type == ZIO_TYPE_WRITE) { } else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_error == 0) if (zio->io_error == 0)
vps->vps_writeable = 1; vps->vps_writeable = 1;
zio_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
} else if (zio->io_type == ZIO_TYPE_NULL) { } else if (zio->io_type == ZIO_TYPE_NULL) {
zio_t *pio; zio_t *pio;
zio_link_t *zl; zio_link_t *zl;
@ -1126,8 +1127,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (l = 1; l < VDEV_LABELS; l++) { for (l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd, zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l, vdev_label_offset(vd->vdev_psize, l,
offsetof(vdev_label_t, vl_pad2)), offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
} }

View File

@ -23,7 +23,7 @@
* Use is subject to license terms. * Use is subject to license terms.
*/ */
/* /*
* Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -31,6 +31,7 @@
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/kstat.h> #include <sys/kstat.h>
#include <sys/abd.h>
/* /*
* Virtual device read-ahead caching. * Virtual device read-ahead caching.
@ -136,12 +137,12 @@ static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{ {
ASSERT(MUTEX_HELD(&vc->vc_lock)); ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT(ve->ve_fill_io == NULL); ASSERT3P(ve->ve_fill_io, ==, NULL);
ASSERT(ve->ve_data != NULL); ASSERT3P(ve->ve_abd, !=, NULL);
avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve); avl_remove(&vc->vc_offset_tree, ve);
zio_buf_free(ve->ve_data, VCBS); abd_free(ve->ve_abd);
kmem_free(ve, sizeof (vdev_cache_entry_t)); kmem_free(ve, sizeof (vdev_cache_entry_t));
} }
@ -171,14 +172,14 @@ vdev_cache_allocate(zio_t *zio)
ve = avl_first(&vc->vc_lastused_tree); ve = avl_first(&vc->vc_lastused_tree);
if (ve->ve_fill_io != NULL) if (ve->ve_fill_io != NULL)
return (NULL); return (NULL);
ASSERT(ve->ve_hits != 0); ASSERT3U(ve->ve_hits, !=, 0);
vdev_cache_evict(vc, ve); vdev_cache_evict(vc, ve);
} }
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset; ve->ve_offset = offset;
ve->ve_lastused = ddi_get_lbolt(); ve->ve_lastused = ddi_get_lbolt();
ve->ve_data = zio_buf_alloc(VCBS); ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_offset_tree, ve);
avl_add(&vc->vc_lastused_tree, ve); avl_add(&vc->vc_lastused_tree, ve);
@ -192,7 +193,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
ASSERT(MUTEX_HELD(&vc->vc_lock)); ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT(ve->ve_fill_io == NULL); ASSERT3P(ve->ve_fill_io, ==, NULL);
if (ve->ve_lastused != ddi_get_lbolt()) { if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_lastused_tree, ve);
@ -201,7 +202,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
} }
ve->ve_hits++; ve->ve_hits++;
bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
} }
/* /*
@ -216,16 +217,16 @@ vdev_cache_fill(zio_t *fio)
zio_t *pio; zio_t *pio;
zio_link_t *zl; zio_link_t *zl;
ASSERT(fio->io_size == VCBS); ASSERT3U(fio->io_size, ==, VCBS);
/* /*
* Add data to the cache. * Add data to the cache.
*/ */
mutex_enter(&vc->vc_lock); mutex_enter(&vc->vc_lock);
ASSERT(ve->ve_fill_io == fio); ASSERT3P(ve->ve_fill_io, ==, fio);
ASSERT(ve->ve_offset == fio->io_offset); ASSERT3U(ve->ve_offset, ==, fio->io_offset);
ASSERT(ve->ve_data == fio->io_data); ASSERT3P(ve->ve_abd, ==, fio->io_abd);
ve->ve_fill_io = NULL; ve->ve_fill_io = NULL;
@ -256,7 +257,7 @@ vdev_cache_read(zio_t *zio)
zio_t *fio; zio_t *fio;
ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS)); ASSERTV(uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS));
ASSERT(zio->io_type == ZIO_TYPE_READ); ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE) if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
return (B_FALSE); return (B_FALSE);
@ -270,7 +271,7 @@ vdev_cache_read(zio_t *zio)
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (B_FALSE); return (B_FALSE);
ASSERT(cache_phase + zio->io_size <= VCBS); ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
mutex_enter(&vc->vc_lock); mutex_enter(&vc->vc_lock);
@ -309,7 +310,7 @@ vdev_cache_read(zio_t *zio)
} }
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio; ve->ve_fill_io = fio;
@ -337,7 +338,7 @@ vdev_cache_write(zio_t *zio)
uint64_t max_offset = P2ROUNDUP(io_end, VCBS); uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
avl_index_t where; avl_index_t where;
ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
mutex_enter(&vc->vc_lock); mutex_enter(&vc->vc_lock);
@ -354,8 +355,8 @@ vdev_cache_write(zio_t *zio)
if (ve->ve_fill_io != NULL) { if (ve->ve_fill_io != NULL) {
ve->ve_missed_update = 1; ve->ve_missed_update = 1;
} else { } else {
bcopy((char *)zio->io_data + start - io_start, abd_copy_off(ve->ve_abd, zio->io_abd, start - io_start,
ve->ve_data + start - ve->ve_offset, end - start); start - ve->ve_offset, end - start);
} }
ve = AVL_NEXT(&vc->vc_offset_tree, ve); ve = AVL_NEXT(&vc->vc_offset_tree, ve);
} }

View File

@ -30,6 +30,7 @@
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/vdev_disk.h> #include <sys/vdev_disk.h>
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/sunldi.h> #include <sys/sunldi.h>
@ -42,6 +43,7 @@ static void *zfs_vdev_holder = VDEV_HOLDER;
*/ */
typedef struct dio_request { typedef struct dio_request {
zio_t *dr_zio; /* Parent ZIO */ zio_t *dr_zio; /* Parent ZIO */
void *dr_loanbuf; /* borrowed abd buffer */
atomic_t dr_ref; /* References */ atomic_t dr_ref; /* References */
int dr_error; /* Bio error */ int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */ int dr_bio_count; /* Count of bio's */
@ -402,6 +404,7 @@ vdev_disk_dio_put(dio_request_t *dr)
*/ */
if (rc == 0) { if (rc == 0) {
zio_t *zio = dr->dr_zio; zio_t *zio = dr->dr_zio;
void *loanbuf = dr->dr_loanbuf;
int error = dr->dr_error; int error = dr->dr_error;
vdev_disk_dio_free(dr); vdev_disk_dio_free(dr);
@ -411,6 +414,15 @@ vdev_disk_dio_put(dio_request_t *dr)
ASSERT3S(zio->io_error, >=, 0); ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error) if (zio->io_error)
vdev_disk_error(zio); vdev_disk_error(zio);
/* ABD placeholder */
if (loanbuf != NULL) {
if (zio->io_type == ZIO_TYPE_READ) {
abd_copy_from_buf(zio->io_abd, loanbuf,
zio->io_size);
}
zio_buf_free(loanbuf, zio->io_size);
}
zio_delay_interrupt(zio); zio_delay_interrupt(zio);
} }
} }
@ -547,7 +559,30 @@ retry:
* their volume block size to match the maximum request size and * their volume block size to match the maximum request size and
* the common case will be one bio per vdev IO request. * the common case will be one bio per vdev IO request.
*/ */
if (zio != NULL) {
abd_t *abd = zio->io_abd;
/*
* ABD placeholder
* We can't use abd_borrow_buf routines here since our
* completion context is interrupt and abd refcounts
* take a mutex (in debug mode).
*/
if (abd_is_linear(abd)) {
bio_ptr = abd_to_buf(abd);
dr->dr_loanbuf = NULL;
} else {
bio_ptr = zio_buf_alloc(zio->io_size);
dr->dr_loanbuf = bio_ptr;
if (zio->io_type != ZIO_TYPE_READ)
abd_copy_to_buf(bio_ptr, abd, zio->io_size);
}
} else {
bio_ptr = kbuf_ptr; bio_ptr = kbuf_ptr;
dr->dr_loanbuf = NULL;
}
bio_offset = kbuf_offset; bio_offset = kbuf_offset;
bio_size = kbuf_size; bio_size = kbuf_size;
for (i = 0; i <= dr->dr_bio_count; i++) { for (i = 0; i <= dr->dr_bio_count; i++) {
@ -562,6 +597,8 @@ retry:
* are needed we allocate a larger dio and warn the user. * are needed we allocate a larger dio and warn the user.
*/ */
if (dr->dr_bio_count == i) { if (dr->dr_bio_count == i) {
if (dr->dr_loanbuf)
zio_buf_free(dr->dr_loanbuf, zio->io_size);
vdev_disk_dio_free(dr); vdev_disk_dio_free(dr);
bio_count *= 2; bio_count *= 2;
goto retry; goto retry;
@ -571,6 +608,8 @@ retry:
dr->dr_bio[i] = bio_alloc(GFP_NOIO, dr->dr_bio[i] = bio_alloc(GFP_NOIO,
MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES)); MIN(bio_nr_pages(bio_ptr, bio_size), BIO_MAX_PAGES));
if (unlikely(dr->dr_bio[i] == NULL)) { if (unlikely(dr->dr_bio[i] == NULL)) {
if (dr->dr_loanbuf)
zio_buf_free(dr->dr_loanbuf, zio->io_size);
vdev_disk_dio_free(dr); vdev_disk_dio_free(dr);
return (ENOMEM); return (ENOMEM);
} }
@ -730,7 +769,7 @@ vdev_disk_io_start(zio_t *zio)
} }
zio->io_target_timestamp = zio_handle_io_delay(zio); zio->io_target_timestamp = zio_handle_io_delay(zio);
error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, error = __vdev_disk_physio(vd->vd_bdev, zio, NULL,
zio->io_size, zio->io_offset, rw, flags); zio->io_size, zio->io_offset, rw, flags);
if (error) { if (error) {
zio->io_error = error; zio->io_error = error;

View File

@ -31,6 +31,7 @@
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h> #include <sys/fm/fs/zfs.h>
#include <sys/abd.h>
/* /*
* Virtual device vector for files. * Virtual device vector for files.
@ -150,11 +151,21 @@ vdev_file_io_strategy(void *arg)
vdev_t *vd = zio->io_vd; vdev_t *vd = zio->io_vd;
vdev_file_t *vf = vd->vdev_tsd; vdev_file_t *vf = vd->vdev_tsd;
ssize_t resid; ssize_t resid;
void *buf;
if (zio->io_type == ZIO_TYPE_READ)
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
else
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size,
zio->io_size, zio->io_offset, UIO_SYSSPACE, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
0, RLIM64_INFINITY, kcred, &resid);
if (zio->io_type == ZIO_TYPE_READ)
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
else
abd_return_buf(zio->io_abd, buf, zio->io_size);
if (resid != 0 && zio->io_error == 0) if (resid != 0 && zio->io_error == 0)
zio->io_error = SET_ERROR(ENOSPC); zio->io_error = SET_ERROR(ENOSPC);

View File

@ -145,6 +145,7 @@
#include <sys/metaslab.h> #include <sys/metaslab.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/dsl_scan.h> #include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
/* /*
@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset)
} }
static void static void
vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags) uint64_t size, zio_done_func_t *done, void *private, int flags)
{ {
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
} }
static void static void
vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags) uint64_t size, zio_done_func_t *done, void *private, int flags)
{ {
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
@ -587,6 +588,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL; nvlist_t *config = NULL;
vdev_phys_t *vp; vdev_phys_t *vp;
abd_t *vp_abd;
zio_t *zio; zio_t *zio;
uint64_t best_txg = 0; uint64_t best_txg = 0;
int error = 0; int error = 0;
@ -599,7 +601,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd)) if (!vdev_readable(vd))
return (NULL); return (NULL);
vp = zio_buf_alloc(sizeof (vdev_phys_t)); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
vp = abd_to_buf(vp_abd);
retry: retry:
for (l = 0; l < VDEV_LABELS; l++) { for (l = 0; l < VDEV_LABELS; l++) {
@ -607,7 +610,7 @@ retry:
zio = zio_root(spa, NULL, NULL, flags); zio = zio_root(spa, NULL, NULL, flags);
vdev_label_read(zio, vd, l, vp, vdev_label_read(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys), offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags); sizeof (vdev_phys_t), NULL, NULL, flags);
@ -646,7 +649,7 @@ retry:
goto retry; goto retry;
} }
zio_buf_free(vp, sizeof (vdev_phys_t)); abd_free(vp_abd);
return (config); return (config);
} }
@ -782,8 +785,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
spa_t *spa = vd->vdev_spa; spa_t *spa = vd->vdev_spa;
nvlist_t *label; nvlist_t *label;
vdev_phys_t *vp; vdev_phys_t *vp;
char *pad2; abd_t *vp_abd;
abd_t *pad2;
uberblock_t *ub; uberblock_t *ub;
abd_t *ub_abd;
zio_t *zio; zio_t *zio;
char *buf; char *buf;
size_t buflen; size_t buflen;
@ -867,8 +872,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/* /*
* Initialize its label. * Initialize its label.
*/ */
vp = zio_buf_alloc(sizeof (vdev_phys_t)); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
bzero(vp, sizeof (vdev_phys_t)); abd_zero(vp_abd, sizeof (vdev_phys_t));
vp = abd_to_buf(vp_abd);
/* /*
* Generate a label describing the pool and our top-level vdev. * Generate a label describing the pool and our top-level vdev.
@ -928,7 +934,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
if (error != 0) { if (error != 0) {
nvlist_free(label); nvlist_free(label);
zio_buf_free(vp, sizeof (vdev_phys_t)); abd_free(vp_abd);
/* EFAULT means nvlist_pack ran out of room */ /* EFAULT means nvlist_pack ran out of room */
return (error == EFAULT ? ENAMETOOLONG : EINVAL); return (error == EFAULT ? ENAMETOOLONG : EINVAL);
} }
@ -936,14 +942,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/* /*
* Initialize uberblock template. * Initialize uberblock template.
*/ */
ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
bzero(ub, VDEV_UBERBLOCK_RING); abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
*ub = spa->spa_uberblock; abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
ub = abd_to_buf(ub_abd);
ub->ub_txg = 0; ub->ub_txg = 0;
/* Initialize the 2nd padding area. */ /* Initialize the 2nd padding area. */
pad2 = zio_buf_alloc(VDEV_PAD_SIZE); pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
bzero(pad2, VDEV_PAD_SIZE); abd_zero(pad2, VDEV_PAD_SIZE);
/* /*
* Write everything in parallel. * Write everything in parallel.
@ -953,7 +960,7 @@ retry:
for (l = 0; l < VDEV_LABELS; l++) { for (l = 0; l < VDEV_LABELS; l++) {
vdev_label_write(zio, vd, l, vp, vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys), offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags); sizeof (vdev_phys_t), NULL, NULL, flags);
@ -966,7 +973,7 @@ retry:
offsetof(vdev_label_t, vl_pad2), offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags); VDEV_PAD_SIZE, NULL, NULL, flags);
vdev_label_write(zio, vd, l, ub, vdev_label_write(zio, vd, l, ub_abd,
offsetof(vdev_label_t, vl_uberblock), offsetof(vdev_label_t, vl_uberblock),
VDEV_UBERBLOCK_RING, NULL, NULL, flags); VDEV_UBERBLOCK_RING, NULL, NULL, flags);
} }
@ -979,9 +986,9 @@ retry:
} }
nvlist_free(label); nvlist_free(label);
zio_buf_free(pad2, VDEV_PAD_SIZE); abd_free(pad2);
zio_buf_free(ub, VDEV_UBERBLOCK_RING); abd_free(ub_abd);
zio_buf_free(vp, sizeof (vdev_phys_t)); abd_free(vp_abd);
/* /*
* If this vdev hasn't been previously identified as a spare, then we * If this vdev hasn't been previously identified as a spare, then we
@ -1039,7 +1046,7 @@ vdev_uberblock_load_done(zio_t *zio)
vdev_t *vd = zio->io_vd; vdev_t *vd = zio->io_vd;
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private; zio_t *rio = zio->io_private;
uberblock_t *ub = zio->io_data; uberblock_t *ub = abd_to_buf(zio->io_abd);
struct ubl_cbdata *cbp = rio->io_private; struct ubl_cbdata *cbp = rio->io_private;
ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
@ -1060,7 +1067,7 @@ vdev_uberblock_load_done(zio_t *zio)
mutex_exit(&rio->io_lock); mutex_exit(&rio->io_lock);
} }
zio_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
} }
static void static void
@ -1076,8 +1083,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (l = 0; l < VDEV_LABELS; l++) { for (l = 0; l < VDEV_LABELS; l++) {
for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l, vdev_label_read(zio, vd, l,
zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
VDEV_UBERBLOCK_OFFSET(vd, n), B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
VDEV_UBERBLOCK_SIZE(vd), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_load_done, zio, flags); vdev_uberblock_load_done, zio, flags);
} }
@ -1144,7 +1151,7 @@ vdev_uberblock_sync_done(zio_t *zio)
static void static void
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{ {
uberblock_t *ubbuf; abd_t *ub_abd;
int c, l, n; int c, l, n;
for (c = 0; c < vd->vdev_children; c++) for (c = 0; c < vd->vdev_children; c++)
@ -1158,17 +1165,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); /* Copy the uberblock_t into the ABD */
bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
*ubbuf = *ub; abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
for (l = 0; l < VDEV_LABELS; l++) for (l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ubbuf, vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private, vdev_uberblock_sync_done, zio->io_private,
flags | ZIO_FLAG_DONT_PROPAGATE); flags | ZIO_FLAG_DONT_PROPAGATE);
zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); abd_free(ub_abd);
} }
/* Sync the uberblocks to all vdevs in svd[] */ /* Sync the uberblocks to all vdevs in svd[] */
@ -1245,6 +1253,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
{ {
nvlist_t *label; nvlist_t *label;
vdev_phys_t *vp; vdev_phys_t *vp;
abd_t *vp_abd;
char *buf; char *buf;
size_t buflen; size_t buflen;
int c; int c;
@ -1263,15 +1272,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
*/ */
label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
vp = zio_buf_alloc(sizeof (vdev_phys_t)); vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
bzero(vp, sizeof (vdev_phys_t)); abd_zero(vp_abd, sizeof (vdev_phys_t));
vp = abd_to_buf(vp_abd);
buf = vp->vp_nvlist; buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist); buflen = sizeof (vp->vp_nvlist);
if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) {
for (; l < VDEV_LABELS; l += 2) { for (; l < VDEV_LABELS; l += 2) {
vdev_label_write(zio, vd, l, vp, vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys), offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), sizeof (vdev_phys_t),
vdev_label_sync_done, zio->io_private, vdev_label_sync_done, zio->io_private,
@ -1279,7 +1289,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
} }
} }
zio_buf_free(vp, sizeof (vdev_phys_t)); abd_free(vp_abd);
nvlist_free(label); nvlist_free(label);
} }

View File

@ -31,6 +31,7 @@
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
/* /*
@ -272,13 +273,13 @@ vdev_mirror_scrub_done(zio_t *zio)
while ((pio = zio_walk_parents(zio, &zl)) != NULL) { while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock); mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size); ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size); abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
} }
mutex_exit(&zio->io_lock); mutex_exit(&zio->io_lock);
} }
zio_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
mc->mc_error = zio->io_error; mc->mc_error = zio->io_error;
mc->mc_tried = 1; mc->mc_tried = 1;
@ -433,7 +434,8 @@ vdev_mirror_io_start(zio_t *zio)
mc = &mm->mm_child[c]; mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp, zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, mc->mc_vd, mc->mc_offset,
zio_buf_alloc(zio->io_size), zio->io_size, abd_alloc_sametype(zio->io_abd,
zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0, zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc)); vdev_mirror_scrub_done, mc));
} }
@ -458,7 +460,7 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) { while (children--) {
mc = &mm->mm_child[c]; mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp, zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0, zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc)); vdev_mirror_child_done, mc));
c++; c++;
@ -543,7 +545,7 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c]; mc = &mm->mm_child[c];
zio_vdev_io_redone(zio); zio_vdev_io_redone(zio);
zio_nowait(zio_vdev_child_io(zio, zio->io_bp, zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, 0, ZIO_TYPE_READ, zio->io_priority, 0,
vdev_mirror_child_done, mc)); vdev_mirror_child_done, mc));
return; return;
@ -584,7 +586,7 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp, zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, mc->mc_vd, mc->mc_offset,
zio->io_data, zio->io_size, zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));

View File

@ -37,6 +37,7 @@
#include <sys/spa.h> #include <sys/spa.h>
#include <sys/spa_impl.h> #include <sys/spa_impl.h>
#include <sys/kstat.h> #include <sys/kstat.h>
#include <sys/abd.h>
/* /*
* ZFS I/O Scheduler * ZFS I/O Scheduler
@ -496,12 +497,12 @@ vdev_queue_agg_io_done(zio_t *aio)
zio_t *pio; zio_t *pio;
zio_link_t *zl = NULL; zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) { while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset - abd_copy_off(pio->io_abd, aio->io_abd,
aio->io_offset), pio->io_data, pio->io_size); 0, pio->io_offset - aio->io_offset, pio->io_size);
} }
} }
zio_buf_free(aio->io_data, aio->io_size); abd_free(aio->io_abd);
} }
/* /*
@ -523,7 +524,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
boolean_t stretch = B_FALSE; boolean_t stretch = B_FALSE;
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
void *buf; abd_t *abd;
limit = MAX(MIN(zfs_vdev_aggregation_limit, limit = MAX(MIN(zfs_vdev_aggregation_limit,
spa_maxblocksize(vq->vq_vdev->vdev_spa)), 0); spa_maxblocksize(vq->vq_vdev->vdev_spa)), 0);
@ -626,12 +627,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
size = IO_SPAN(first, last); size = IO_SPAN(first, last);
ASSERT3U(size, <=, limit); ASSERT3U(size, <=, limit);
buf = zio_buf_alloc_flags(size, KM_NOSLEEP); abd = abd_alloc_for_io(size, B_TRUE);
if (buf == NULL) if (abd == NULL)
return (NULL); return (NULL);
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
buf, size, first->io_type, zio->io_priority, abd, size, first->io_type, zio->io_priority,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL); vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp; aio->io_timestamp = first->io_timestamp;
@ -644,12 +645,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (dio->io_flags & ZIO_FLAG_NODATA) { if (dio->io_flags & ZIO_FLAG_NODATA) {
ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
bzero((char *)aio->io_data + (dio->io_offset - abd_zero_off(aio->io_abd,
aio->io_offset), dio->io_size); dio->io_offset - aio->io_offset, dio->io_size);
} else if (dio->io_type == ZIO_TYPE_WRITE) { } else if (dio->io_type == ZIO_TYPE_WRITE) {
bcopy(dio->io_data, (char *)aio->io_data + abd_copy_off(aio->io_abd, dio->io_abd,
(dio->io_offset - aio->io_offset), dio->io_offset - aio->io_offset, 0, dio->io_size);
dio->io_size);
} }
zio_add_child(dio, aio); zio_add_child(dio, aio);

View File

@ -30,6 +30,7 @@
#include <sys/vdev_impl.h> #include <sys/vdev_impl.h>
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zio_checksum.h> #include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h> #include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h> #include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h> #include <sys/vdev_raidz.h>
@ -136,7 +137,7 @@ vdev_raidz_map_free(raidz_map_t *rm)
size_t size; size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) { for (c = 0; c < rm->rm_firstdatacol; c++) {
zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); abd_free(rm->rm_col[c].rc_abd);
if (rm->rm_col[c].rc_gdata != NULL) if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata, zio_buf_free(rm->rm_col[c].rc_gdata,
@ -144,11 +145,13 @@ vdev_raidz_map_free(raidz_map_t *rm)
} }
size = 0; size = 0;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
abd_put(rm->rm_col[c].rc_abd);
size += rm->rm_col[c].rc_size; size += rm->rm_col[c].rc_size;
}
if (rm->rm_datacopy != NULL) if (rm->rm_abd_copy != NULL)
zio_buf_free(rm->rm_datacopy, size); abd_free(rm->rm_abd_copy);
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
} }
@ -185,7 +188,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
size_t x; size_t x;
const char *good = NULL; const char *good = NULL;
const char *bad = rm->rm_col[c].rc_data; char *bad;
if (good_data == NULL) { if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
@ -199,8 +202,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* data never changes for a given logical ZIO) * data never changes for a given logical ZIO)
*/ */
if (rm->rm_col[0].rc_gdata == NULL) { if (rm->rm_col[0].rc_gdata == NULL) {
char *bad_parity[VDEV_RAIDZ_MAXPARITY]; abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf; char *buf;
int offset;
/* /*
* Set up the rm_col[]s to generate the parity for * Set up the rm_col[]s to generate the parity for
@ -208,15 +212,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* replacing them with buffers to hold the result. * replacing them with buffers to hold the result.
*/ */
for (x = 0; x < rm->rm_firstdatacol; x++) { for (x = 0; x < rm->rm_firstdatacol; x++) {
bad_parity[x] = rm->rm_col[x].rc_data; bad_parity[x] = rm->rm_col[x].rc_abd;
rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = rm->rm_col[x].rc_gdata =
zio_buf_alloc(rm->rm_col[x].rc_size); zio_buf_alloc(rm->rm_col[x].rc_size);
rm->rm_col[x].rc_abd =
abd_get_from_buf(rm->rm_col[x].rc_gdata,
rm->rm_col[x].rc_size);
} }
/* fill in the data columns from good_data */ /* fill in the data columns from good_data */
buf = (char *)good_data; buf = (char *)good_data;
for (; x < rm->rm_cols; x++) { for (; x < rm->rm_cols; x++) {
rm->rm_col[x].rc_data = buf; abd_put(rm->rm_col[x].rc_abd);
rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
rm->rm_col[x].rc_size);
buf += rm->rm_col[x].rc_size; buf += rm->rm_col[x].rc_size;
} }
@ -226,13 +235,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
vdev_raidz_generate_parity(rm); vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */ /* restore everything back to its original state */
for (x = 0; x < rm->rm_firstdatacol; x++) for (x = 0; x < rm->rm_firstdatacol; x++) {
rm->rm_col[x].rc_data = bad_parity[x]; abd_put(rm->rm_col[x].rc_abd);
rm->rm_col[x].rc_abd = bad_parity[x];
}
buf = rm->rm_datacopy; offset = 0;
for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
rm->rm_col[x].rc_data = buf; abd_put(rm->rm_col[x].rc_abd);
buf += rm->rm_col[x].rc_size; rm->rm_col[x].rc_abd = abd_get_offset(
rm->rm_abd_copy, offset);
offset += rm->rm_col[x].rc_size;
} }
} }
@ -246,8 +259,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
good += rm->rm_col[x].rc_size; good += rm->rm_col[x].rc_size;
} }
bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */ /* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
} }
/* /*
@ -260,7 +275,7 @@ static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{ {
size_t c = (size_t)(uintptr_t)arg; size_t c = (size_t)(uintptr_t)arg;
caddr_t buf; size_t offset;
raidz_map_t *rm = zio->io_vsd; raidz_map_t *rm = zio->io_vsd;
size_t size; size_t size;
@ -274,7 +289,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
rm->rm_reports++; rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0); ASSERT3U(rm->rm_reports, >, 0);
if (rm->rm_datacopy != NULL) if (rm->rm_abd_copy != NULL)
return; return;
/* /*
@ -290,17 +305,20 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
size += rm->rm_col[c].rc_size; size += rm->rm_col[c].rc_size;
buf = rm->rm_datacopy = zio_buf_alloc(size); rm->rm_abd_copy =
abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c]; raidz_col_t *col = &rm->rm_col[c];
abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
bcopy(col->rc_data, buf, col->rc_size); abd_copy(tmp, col->rc_abd, col->rc_size);
col->rc_data = buf; abd_put(col->rc_abd);
col->rc_abd = tmp;
buf += col->rc_size; offset += col->rc_size;
} }
ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); ASSERT3U(offset, ==, size);
} }
static const zio_vsd_ops_t vdev_raidz_vsd_ops = { static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@ -329,6 +347,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
/* The starting byte offset on each child vdev. */ /* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift; uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
uint64_t off = 0;
/* /*
* "Quotient": The number of data sectors for this stripe on all but * "Quotient": The number of data sectors for this stripe on all but
@ -373,7 +392,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
rm->rm_missingdata = 0; rm->rm_missingdata = 0;
rm->rm_missingparity = 0; rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity; rm->rm_firstdatacol = nparity;
rm->rm_datacopy = NULL; rm->rm_abd_copy = NULL;
rm->rm_reports = 0; rm->rm_reports = 0;
rm->rm_freed = 0; rm->rm_freed = 0;
rm->rm_ecksuminjected = 0; rm->rm_ecksuminjected = 0;
@ -389,7 +408,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
} }
rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_offset = coff;
rm->rm_col[c].rc_data = NULL; rm->rm_col[c].rc_abd = NULL;
rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_tried = 0;
@ -412,13 +431,16 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
ASSERT3U(rm->rm_nskip, <=, nparity); ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++) for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); rm->rm_col[c].rc_abd =
abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
rm->rm_col[c].rc_data = zio->io_data; rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, 0);
off = rm->rm_col[c].rc_size;
for (c = c + 1; c < acols; c++) for (c = c + 1; c < acols; c++) {
rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + rm->rm_col[c].rc_abd = abd_get_offset(zio->io_abd, off);
rm->rm_col[c - 1].rc_size; off += rm->rm_col[c].rc_size;
}
/* /*
* If all data stored spans all columns, there's a danger that parity * If all data stored spans all columns, there's a danger that parity
@ -464,29 +486,84 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
return (rm); return (rm);
} }
/*
 * Cursor state handed (as the opaque "private" argument) to the
 * vdev_raidz_p_func / vdev_raidz_pq_func / vdev_raidz_pqr_func callbacks.
 * Each member points at the next 64-bit word of a parity buffer to be
 * updated; the callbacks advance the pointers as they consume source data.
 * Members that are not in use for a given parity level are left NULL
 * (the callbacks assert on exactly which ones are set).
 */
struct pqr_struct {
/* next P parity word to update */
uint64_t *p;
/* next Q parity word to update, or NULL */
uint64_t *q;
/* next R parity word to update, or NULL */
uint64_t *r;
};
/*
 * Iteration callback: fold one chunk of source data into the P parity.
 *
 * buf/size describe the source chunk; only whole 64-bit words are
 * processed (size / 8, any remainder is ignored).  private is a
 * struct pqr_struct with only p set.  Every word is XORed into *pqr->p
 * and pqr->p is advanced, so consecutive invocations continue where the
 * previous one stopped.  Always returns 0.
 */
static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *data = buf;
	int nwords = size / sizeof (data[0]);
	int w;

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (w = 0; w < nwords; w++) {
		*pqr->p ^= data[w];
		pqr->p++;
	}

	return (0);
}
/*
 * Iteration callback: fold one chunk of source data into the P and Q
 * parity.
 *
 * buf/size describe the source chunk; only whole 64-bit words are
 * processed.  private is a struct pqr_struct with p and q set (r NULL).
 * For each source word: P ^= src; Q = (Q * 2) ^ src, where the
 * multiplication is done by VDEV_RAIDZ_64MUL_2.  Both cursors are
 * advanced one word per source word.  Always returns 0.
 */
static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *data = buf;
	uint64_t mask;
	int nwords = size / sizeof (data[0]);
	int w;

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (w = 0; w < nwords; w++) {
		*pqr->p ^= data[w];

		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= data[w];

		pqr->p++;
		pqr->q++;
	}

	return (0);
}
/*
 * Iteration callback: fold one chunk of source data into the P, Q and R
 * parity.
 *
 * buf/size describe the source chunk; only whole 64-bit words are
 * processed.  private is a struct pqr_struct with p, q and r all set.
 * For each source word: P ^= src; Q = (Q * 2) ^ src; R = (R * 4) ^ src,
 * using VDEV_RAIDZ_64MUL_2 / VDEV_RAIDZ_64MUL_4 for the multiplications.
 * All three cursors are advanced one word per source word.
 * Always returns 0.
 */
static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *data = buf;
	uint64_t mask;
	int nwords = size / sizeof (data[0]);
	int w;

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (w = 0; w < nwords; w++) {
		*pqr->p ^= data[w];

		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= data[w];

		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= data[w];

		pqr->p++;
		pqr->q++;
		pqr->r++;
	}

	return (0);
}
static void static void
vdev_raidz_generate_parity_p(raidz_map_t *rm) vdev_raidz_generate_parity_p(raidz_map_t *rm)
{ {
uint64_t *p, *src, pcount, ccount, i; uint64_t *p;
int c; int c;
abd_t *src;
pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_data; src = rm->rm_col[c].rc_abd;
p = rm->rm_col[VDEV_RAIDZ_P].rc_data; p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
if (c == rm->rm_firstdatacol) { if (c == rm->rm_firstdatacol) {
ASSERT(ccount == pcount); abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
for (i = 0; i < ccount; i++, src++, p++) {
*p = *src;
}
} else { } else {
ASSERT(ccount <= pcount); struct pqr_struct pqr = { p, NULL, NULL };
for (i = 0; i < ccount; i++, src++, p++) { (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
*p ^= *src; vdev_raidz_p_func, &pqr);
}
} }
} }
} }
@ -494,50 +571,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
static void static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm) vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{ {
uint64_t *p, *q, *src, pcnt, ccnt, mask, i; uint64_t *p, *q, pcnt, ccnt, mask, i;
int c; int c;
abd_t *src;
pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size); rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_data; src = rm->rm_col[c].rc_abd;
p = rm->rm_col[VDEV_RAIDZ_P].rc_data; p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) { if (c == rm->rm_firstdatacol) {
ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
for (i = 0; i < ccnt; i++, src++, p++, q++) { (void) memcpy(q, p, rm->rm_col[c].rc_size);
*p = *src; } else {
*q = *src; struct pqr_struct pqr = { p, q, NULL };
(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
vdev_raidz_pq_func, &pqr);
} }
for (; i < pcnt; i++, src++, p++, q++) {
*p = 0; if (c == rm->rm_firstdatacol) {
*q = 0; for (i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
} }
} else { } else {
ASSERT(ccnt <= pcnt);
/*
* Apply the algorithm described above by multiplying
* the previous result and adding in the new value.
*/
for (i = 0; i < ccnt; i++, src++, p++, q++) {
*p ^= *src;
VDEV_RAIDZ_64MUL_2(*q, mask);
*q ^= *src;
}
/* /*
* Treat short columns as though they are full of 0s. * Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P. * Note that there's therefore nothing needed for P.
*/ */
for (; i < pcnt; i++, q++) { for (i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(*q, mask); VDEV_RAIDZ_64MUL_2(q[i], mask);
} }
} }
} }
@ -546,59 +616,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
static void static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm) vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{ {
uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
int c; int c;
abd_t *src;
pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size); rm->rm_col[VDEV_RAIDZ_Q].rc_size);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_R].rc_size); rm->rm_col[VDEV_RAIDZ_R].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_data; src = rm->rm_col[c].rc_abd;
p = rm->rm_col[VDEV_RAIDZ_P].rc_data; p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
r = rm->rm_col[VDEV_RAIDZ_R].rc_data; r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) { if (c == rm->rm_firstdatacol) {
ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { (void) memcpy(q, p, rm->rm_col[c].rc_size);
*p = *src; (void) memcpy(r, p, rm->rm_col[c].rc_size);
*q = *src; } else {
*r = *src; struct pqr_struct pqr = { p, q, r };
(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
vdev_raidz_pqr_func, &pqr);
} }
for (; i < pcnt; i++, src++, p++, q++, r++) {
*p = 0; if (c == rm->rm_firstdatacol) {
*q = 0; for (i = ccnt; i < pcnt; i++) {
*r = 0; p[i] = 0;
q[i] = 0;
r[i] = 0;
} }
} else { } else {
ASSERT(ccnt <= pcnt);
/*
* Apply the algorithm described above by multiplying
* the previous result and adding in the new value.
*/
for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
*p ^= *src;
VDEV_RAIDZ_64MUL_2(*q, mask);
*q ^= *src;
VDEV_RAIDZ_64MUL_4(*r, mask);
*r ^= *src;
}
/* /*
* Treat short columns as though they are full of 0s. * Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P. * Note that there's therefore nothing needed for P.
*/ */
for (; i < pcnt; i++, q++, r++) { for (i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(*q, mask); VDEV_RAIDZ_64MUL_2(q[i], mask);
VDEV_RAIDZ_64MUL_4(*r, mask); VDEV_RAIDZ_64MUL_4(r[i], mask);
} }
} }
} }
@ -630,40 +689,159 @@ vdev_raidz_generate_parity(raidz_map_t *rm)
} }
} }
/*
 * Two-buffer callback: XOR the source chunk into the destination chunk,
 * one 64-bit word at a time (dst[i] ^= src[i]).  Only size / 8 whole
 * words are touched; private is unused.  Always returns 0.
 */
/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dword = dbuf;
	uint64_t *sword = sbuf;
	int remaining = size / sizeof (sword[0]);

	while (remaining-- > 0)
		*dword++ ^= *sword++;

	return (0);
}
/*
 * Two-buffer callback: for every whole 64-bit word, multiply the
 * destination word by 2 (VDEV_RAIDZ_64MUL_2) and then XOR in the
 * matching source word.  private is unused.  Always returns 0.
 */
/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dwords = dbuf;
	uint64_t *swords = sbuf;
	uint64_t mask;
	int nwords = size / sizeof (dwords[0]);
	int w;

	for (w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(dwords[w], mask);
		dwords[w] ^= swords[w];
	}

	return (0);
}
/*
 * Single-buffer callback: multiply every whole 64-bit word of buf by 2
 * (VDEV_RAIDZ_64MUL_2).  This is the destination-side half of
 * vdev_raidz_reconst_q_pre_func(), with no source word to XOR in.
 * private is unused.  Always returns 0.
 */
/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *words = buf;
	uint64_t mask;
	int nwords = size / sizeof (words[0]);
	int w;

	for (w = 0; w < nwords; w++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(words[w], mask);
	}

	return (0);
}
/*
 * State handed to vdev_raidz_reconst_q_post_func() through its `private`
 * argument.
 */
struct reconst_q_struct {
/* next Q parity word to XOR in; the callback advances this cursor */
uint64_t *q;
/* exponent passed to vdev_raidz_exp2() for every reconstructed byte */
int exp;
};
/*
 * abd_iterate_func() callback for the final pass of Q reconstruction.
 * Each whole 64-bit word of buf is XORed with the next word read through
 * rq->q, and each of its eight bytes is then replaced by
 * vdev_raidz_exp2(byte, rq->exp).  rq->q is left pointing past the words
 * consumed, so a later call continues from the same place in the Q buffer.
 * Always returns 0.
 */
static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *word = buf;
	int nwords = size / sizeof (word[0]);
	int w;

	for (w = 0; w < nwords; w++, word++) {
		uint8_t *byte = (uint8_t *)word;
		int k;

		*word ^= *rq->q;
		rq->q++;

		for (k = 0; k < 8; k++)
			byte[k] = vdev_raidz_exp2(byte[k], rq->exp);
	}

	return (0);
}
/*
 * State shared by vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func() through their `private` argument.
 * All four pointers are byte cursors that the callbacks advance in step.
 */
struct reconst_pq_struct {
/* cursors into the saved P and Q parity buffers */
uint8_t *p;
uint8_t *q;
/*
 * cursors into the temporary P and Q buffers -- presumably parity
 * regenerated with the two target columns treated as empty; confirm in
 * vdev_raidz_reconstruct_pq()
 */
uint8_t *pxy;
uint8_t *qxy;
/* exponents passed to vdev_raidz_exp2() for the P and Q terms */
int aexp;
int bexp;
};
/*
 * abd_iterate_func2() callback used by vdev_raidz_reconstruct_pq() to
 * rebuild two data columns over their common length.  For each of the
 * `size` bytes:
 *
 *	x = exp2(p ^ pxy, aexp) ^ exp2(q ^ qxy, bexp)
 *	y = p ^ pxy ^ x
 *
 * where p, q, pxy and qxy are the current bytes behind the four cursors in
 * the reconst_pq_struct passed via `private`.  The cursors are advanced as
 * bytes are consumed so the next chunk (or the tail callback) continues
 * from the same position.  Always returns 0.
 *
 * The index is a size_t to match `size`: an int index is converted to
 * unsigned for the comparison and would overflow on a very large chunk.
 */
static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;
	size_t i;

	for (i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}
/*
 * abd_iterate_func() callback used by vdev_raidz_reconstruct_pq() for the
 * part of column x that extends past the end of column y.  It computes the
 * same x byte as vdev_raidz_reconst_pq_func(),
 *
 *	x = exp2(p ^ pxy, aexp) ^ exp2(q ^ qxy, bexp)
 *
 * but there is no y byte to store.  The four cursors in the
 * reconst_pq_struct passed via `private` are advanced as bytes are
 * consumed.  Always returns 0.
 *
 * The index is a size_t to match `size`: an int index is converted to
 * unsigned for the comparison and would overflow on a very large chunk.
 */
static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	size_t i;

	for (i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}
static int static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{ {
uint64_t *dst, *src, xcount, ccount, count, i;
int x = tgts[0]; int x = tgts[0];
int c; int c;
abd_t *dst, *src;
ASSERT(ntgts == 1); ASSERT(ntgts == 1);
ASSERT(x >= rm->rm_firstdatacol); ASSERT(x >= rm->rm_firstdatacol);
ASSERT(x < rm->rm_cols); ASSERT(x < rm->rm_cols);
xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(rm->rm_col[x].rc_size > 0);
ASSERT(xcount > 0);
src = rm->rm_col[VDEV_RAIDZ_P].rc_data; src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
dst = rm->rm_col[x].rc_data; dst = rm->rm_col[x].rc_abd;
for (i = 0; i < xcount; i++, dst++, src++) {
*dst = *src; abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
}
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_data; uint64_t size = MIN(rm->rm_col[x].rc_size,
dst = rm->rm_col[x].rc_data; rm->rm_col[c].rc_size);
src = rm->rm_col[c].rc_abd;
dst = rm->rm_col[x].rc_abd;
if (c == x) if (c == x)
continue; continue;
ccount = rm->rm_col[c].rc_size / sizeof (src[0]); (void) abd_iterate_func2(dst, src, 0, 0, size,
count = MIN(ccount, xcount); vdev_raidz_reconst_p_func, NULL);
for (i = 0; i < count; i++, dst++, src++) {
*dst ^= *src;
}
} }
return (1 << VDEV_RAIDZ_P); return (1 << VDEV_RAIDZ_P);
@ -672,57 +850,46 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
/*
 * Rebuild one missing data column, tgts[0] (x), from the Q parity column.
 * Exactly one target is expected (ntgts == 1) and column x must not be
 * larger than the Q column.
 *
 * NOTE(review): each line of this function carries the pre-ABD text (raw
 * uint64_t pointers, rc_data) followed by the ABD text (abd_t, rc_abd,
 * abd_iterate_func*); both describe the same computation.
 *
 * Step 1 walks the data columns in order and accumulates into column x.
 * The first data column is copied in and the remainder of x is zeroed.
 * For each later column the accumulator has VDEV_RAIDZ_64MUL_2 applied and
 * the column's data XORed in over their common length; past that length
 * only VDEV_RAIDZ_64MUL_2 is applied.  Column x itself is treated as
 * zero-length, so it contributes no data.
 *
 * Step 2 XORs the Q column into x and maps every byte through
 * vdev_raidz_exp2() with exp = 255 - (rm_cols - 1 - x).
 *
 * Returns 1 << VDEV_RAIDZ_Q.
 */
static int static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{ {
uint64_t *dst, *src, xcount, ccount, count, mask, i;
uint8_t *b;
int x = tgts[0]; int x = tgts[0];
int c, j, exp; int c, exp;
abd_t *dst, *src;
struct reconst_q_struct rq;
ASSERT(ntgts == 1); ASSERT(ntgts == 1);
xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
/* step 1: accumulate every data column into column x */
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
src = rm->rm_col[c].rc_data; uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
dst = rm->rm_col[x].rc_data; rm->rm_col[c].rc_size);
if (c == x) src = rm->rm_col[c].rc_abd;
ccount = 0; dst = rm->rm_col[x].rc_abd;
else
ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
count = MIN(ccount, xcount);
if (c == rm->rm_firstdatacol) { if (c == rm->rm_firstdatacol) {
for (i = 0; i < count; i++, dst++, src++) { abd_copy(dst, src, size);
*dst = *src; if (rm->rm_col[x].rc_size > size)
} abd_zero_off(dst, size,
for (; i < xcount; i++, dst++) { rm->rm_col[x].rc_size - size);
*dst = 0;
}
} else { } else {
for (i = 0; i < count; i++, dst++, src++) { ASSERT3U(size, <=, rm->rm_col[x].rc_size);
VDEV_RAIDZ_64MUL_2(*dst, mask); (void) abd_iterate_func2(dst, src, 0, 0, size,
*dst ^= *src; vdev_raidz_reconst_q_pre_func, NULL);
} (void) abd_iterate_func(dst,
size, rm->rm_col[x].rc_size - size,
for (; i < xcount; i++, dst++) { vdev_raidz_reconst_q_pre_tail_func, NULL);
VDEV_RAIDZ_64MUL_2(*dst, mask);
}
} }
} }
/* step 2: fold in the stored Q column and scale each byte */
src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
dst = rm->rm_col[x].rc_data; dst = rm->rm_col[x].rc_abd;
exp = 255 - (rm->rm_cols - 1 - x); exp = 255 - (rm->rm_cols - 1 - x);
rq.q = abd_to_buf(src);
rq.exp = exp;
for (i = 0; i < xcount; i++, dst++, src++) { (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
*dst ^= *src; vdev_raidz_reconst_q_post_func, &rq);
for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
*b = vdev_raidz_exp2(*b, exp);
}
}
return (1 << VDEV_RAIDZ_Q); return (1 << VDEV_RAIDZ_Q);
} }
@ -730,11 +897,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
static int static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{ {
uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
void *pdata, *qdata; abd_t *pdata, *qdata;
uint64_t xsize, ysize, i; uint64_t xsize, ysize;
int x = tgts[0]; int x = tgts[0];
int y = tgts[1]; int y = tgts[1];
abd_t *xd, *yd;
struct reconst_pq_struct rpq;
ASSERT(ntgts == 2); ASSERT(ntgts == 2);
ASSERT(x < y); ASSERT(x < y);
@ -750,15 +919,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* parity so we make those columns appear to be full of zeros by * parity so we make those columns appear to be full of zeros by
* setting their lengths to zero. * setting their lengths to zero.
*/ */
pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
xsize = rm->rm_col[x].rc_size; xsize = rm->rm_col[x].rc_size;
ysize = rm->rm_col[y].rc_size; ysize = rm->rm_col[y].rc_size;
rm->rm_col[VDEV_RAIDZ_P].rc_data = rm->rm_col[VDEV_RAIDZ_P].rc_abd =
zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
rm->rm_col[VDEV_RAIDZ_Q].rc_data = rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
rm->rm_col[x].rc_size = 0; rm->rm_col[x].rc_size = 0;
rm->rm_col[y].rc_size = 0; rm->rm_col[y].rc_size = 0;
@ -767,12 +936,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
rm->rm_col[x].rc_size = xsize; rm->rm_col[x].rc_size = xsize;
rm->rm_col[y].rc_size = ysize; rm->rm_col[y].rc_size = ysize;
p = pdata; p = abd_to_buf(pdata);
q = qdata; q = abd_to_buf(qdata);
pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
xd = rm->rm_col[x].rc_data; xd = rm->rm_col[x].rc_abd;
yd = rm->rm_col[y].rc_data; yd = rm->rm_col[y].rc_abd;
/* /*
* We now have: * We now have:
@ -796,24 +965,27 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { ASSERT3U(xsize, >=, ysize);
*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ rpq.p = p;
vdev_raidz_exp2(*q ^ *qxy, bexp); rpq.q = q;
rpq.pxy = pxy;
rpq.qxy = qxy;
rpq.aexp = aexp;
rpq.bexp = bexp;
if (i < ysize) (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
*yd = *p ^ *pxy ^ *xd; vdev_raidz_reconst_pq_func, &rpq);
} (void) abd_iterate_func(xd, ysize, xsize - ysize,
vdev_raidz_reconst_pq_tail_func, &rpq);
zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
rm->rm_col[VDEV_RAIDZ_P].rc_size); abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
/* /*
* Restore the saved parity data. * Restore the saved parity data.
*/ */
rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
} }
@ -1131,7 +1303,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
c = used[i]; c = used[i];
ASSERT3U(c, <, rm->rm_cols); ASSERT3U(c, <, rm->rm_cols);
src = rm->rm_col[c].rc_data; src = abd_to_buf(rm->rm_col[c].rc_abd);
ccount = rm->rm_col[c].rc_size; ccount = rm->rm_col[c].rc_size;
for (j = 0; j < nmissing; j++) { for (j = 0; j < nmissing; j++) {
cc = missing[j] + rm->rm_firstdatacol; cc = missing[j] + rm->rm_firstdatacol;
@ -1139,7 +1311,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, <, rm->rm_cols);
ASSERT3U(cc, !=, c); ASSERT3U(cc, !=, c);
dst[j] = rm->rm_col[cc].rc_data; dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
dcount[j] = rm->rm_col[cc].rc_size; dcount[j] = rm->rm_col[cc].rc_size;
} }
@ -1187,8 +1359,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
uint8_t *used; uint8_t *used;
abd_t **bufs = NULL;
int code = 0; int code = 0;
/*
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs.
*/
if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
bufs[c] = col->rc_abd;
col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
abd_copy(col->rc_abd, bufs[c], col->rc_size);
}
}
n = rm->rm_cols - rm->rm_firstdatacol; n = rm->rm_cols - rm->rm_firstdatacol;
@ -1275,6 +1464,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
kmem_free(p, psize); kmem_free(p, psize);
/*
* copy back from temporary linear abds and free them
*/
if (bufs) {
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
abd_copy(bufs[c], col->rc_abd, col->rc_size);
abd_free(col->rc_abd);
col->rc_abd = bufs[c];
}
kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
}
return (code); return (code);
} }
@ -1321,7 +1524,6 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
dt = &tgts[nbadparity]; dt = &tgts[nbadparity];
/* Reconstruct using the new math implementation */ /* Reconstruct using the new math implementation */
ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
if (ret != RAIDZ_ORIGINAL_IMPL) if (ret != RAIDZ_ORIGINAL_IMPL)
@ -1479,7 +1681,7 @@ vdev_raidz_io_start(zio_t *zio)
rc = &rm->rm_col[c]; rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx]; cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size, rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0, zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc)); vdev_raidz_child_done, rc));
} }
@ -1536,7 +1738,7 @@ vdev_raidz_io_start(zio_t *zio)
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size, rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0, zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc)); vdev_raidz_child_done, rc));
} }
@ -1552,6 +1754,7 @@ vdev_raidz_io_start(zio_t *zio)
static void static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{ {
void *buf;
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@ -1565,9 +1768,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
zbc.zbc_has_cksum = 0; zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected; zbc.zbc_injected = rm->rm_ecksuminjected;
buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
zfs_ereport_post_checksum(zio->io_spa, vd, zio, zfs_ereport_post_checksum(zio->io_spa, vd, zio,
rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, rc->rc_offset, rc->rc_size, buf, bad_data,
&zbc); &zbc);
abd_return_buf(rc->rc_abd, buf, rc->rc_size);
} }
} }
@ -1616,7 +1821,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
if (!rc->rc_tried || rc->rc_error != 0) if (!rc->rc_tried || rc->rc_error != 0)
continue; continue;
orig[c] = zio_buf_alloc(rc->rc_size); orig[c] = zio_buf_alloc(rc->rc_size);
bcopy(rc->rc_data, orig[c], rc->rc_size); abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
} }
vdev_raidz_generate_parity(rm); vdev_raidz_generate_parity(rm);
@ -1625,7 +1830,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
rc = &rm->rm_col[c]; rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0) if (!rc->rc_tried || rc->rc_error != 0)
continue; continue;
if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) {
raidz_checksum_error(zio, rc, orig[c]); raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM); rc->rc_error = SET_ERROR(ECKSUM);
ret++; ret++;
@ -1728,7 +1933,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
ASSERT3S(c, >=, 0); ASSERT3S(c, >=, 0);
ASSERT3S(c, <, rm->rm_cols); ASSERT3S(c, <, rm->rm_cols);
rc = &rm->rm_col[c]; rc = &rm->rm_col[c];
bcopy(rc->rc_data, orig[i], rc->rc_size); abd_copy_to_buf(orig[i], rc->rc_abd,
rc->rc_size);
} }
/* /*
@ -1758,7 +1964,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
c = tgts[i]; c = tgts[i];
rc = &rm->rm_col[c]; rc = &rm->rm_col[c];
bcopy(orig[i], rc->rc_data, rc->rc_size); abd_copy_from_buf(rc->rc_abd, orig[i],
rc->rc_size);
} }
do { do {
@ -1997,7 +2204,7 @@ vdev_raidz_io_done(zio_t *zio)
continue; continue;
zio_nowait(zio_vdev_child_io(zio, NULL, zio_nowait(zio_vdev_child_io(zio, NULL,
vd->vdev_child[rc->rc_devidx], vd->vdev_child[rc->rc_devidx],
rc->rc_offset, rc->rc_data, rc->rc_size, rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0, zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc)); vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols); } while (++c < rm->rm_cols);
@ -2077,7 +2284,7 @@ done:
continue; continue;
zio_nowait(zio_vdev_child_io(zio, NULL, cvd, zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size, rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));

View File

@ -44,6 +44,16 @@ static raidz_impl_ops_t vdev_raidz_fastest_impl = {
.name = "fastest" .name = "fastest"
}; };
/*
 * ABD BRINGUP -- not ready yet.
 * While the outer "#if 1" is in place, HAVE_SSSE3 and HAVE_AVX2 are
 * undefined from this point on, so anything later in this file that is
 * guarded by those macros is compiled out.
 */
#if 1
#ifdef HAVE_SSSE3
#undef HAVE_SSSE3
#endif
#ifdef HAVE_AVX2
#undef HAVE_AVX2
#endif
#endif
/* All compiled in implementations */ /* All compiled in implementations */
const raidz_impl_ops_t *raidz_all_maths[] = { const raidz_impl_ops_t *raidz_all_maths[] = {
&vdev_raidz_original_impl, &vdev_raidz_original_impl,
@ -149,6 +159,8 @@ vdev_raidz_math_generate(raidz_map_t *rm)
{ {
raidz_gen_f gen_parity = NULL; raidz_gen_f gen_parity = NULL;
/* ABD Bringup -- vector code not ready */
#if 0
switch (raidz_parity(rm)) { switch (raidz_parity(rm)) {
case 1: case 1:
gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
@ -165,6 +177,7 @@ vdev_raidz_math_generate(raidz_map_t *rm)
raidz_parity(rm)); raidz_parity(rm));
break; break;
} }
#endif
/* if method is NULL execute the original implementation */ /* if method is NULL execute the original implementation */
if (gen_parity == NULL) if (gen_parity == NULL)
@ -175,6 +188,8 @@ vdev_raidz_math_generate(raidz_map_t *rm)
return (0); return (0);
} }
/* ABD Bringup -- vector code not ready */
#if 0
static raidz_rec_f static raidz_rec_f
reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
const int nbaddata) const int nbaddata)
@ -229,6 +244,7 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
} }
return ((raidz_rec_f) NULL); return ((raidz_rec_f) NULL);
} }
#endif
/* /*
* Select data reconstruction method for raidz_map * Select data reconstruction method for raidz_map
@ -242,6 +258,8 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
{ {
raidz_rec_f rec_data = NULL; raidz_rec_f rec_data = NULL;
/* ABD Bringup -- vector code not ready */
#if 0
switch (raidz_parity(rm)) { switch (raidz_parity(rm)) {
case PARITY_P: case PARITY_P:
rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); rec_data = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
@ -257,6 +275,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
raidz_parity(rm)); raidz_parity(rm));
break; break;
} }
#endif
if (rec_data == NULL) if (rec_data == NULL)
return (RAIDZ_ORIGINAL_IMPL); return (RAIDZ_ORIGINAL_IMPL);
@ -471,13 +490,12 @@ vdev_raidz_math_init(void)
return; return;
#endif #endif
/* Fake an zio and run the benchmark on it */ /* Fake a zio and run the benchmark on a warmed up buffer */
bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
bench_zio->io_offset = 0; bench_zio->io_offset = 0;
bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
bench_zio->io_data = zio_data_buf_alloc(BENCH_ZIO_SIZE); bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
VERIFY(bench_zio->io_data); memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
memset(bench_zio->io_data, 0xAA, BENCH_ZIO_SIZE); /* warm up */
/* Benchmark parity generation methods */ /* Benchmark parity generation methods */
for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
@ -501,7 +519,7 @@ vdev_raidz_math_init(void)
vdev_raidz_map_free(bench_rm); vdev_raidz_map_free(bench_rm);
/* cleanup the bench zio */ /* cleanup the bench zio */
zio_data_buf_free(bench_zio->io_data, BENCH_ZIO_SIZE); abd_free(bench_zio->io_abd);
kmem_free(bench_zio, sizeof (zio_t)); kmem_free(bench_zio, sizeof (zio_t));
/* install kstats for all impl */ /* install kstats for all impl */

View File

@ -21,7 +21,6 @@
/* /*
* Copyright (C) 2016 Gvozden Nešković. All rights reserved. * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
*/ */
#include <sys/isa_defs.h> #include <sys/isa_defs.h>
#if defined(__x86_64) && defined(HAVE_AVX2) #if defined(__x86_64) && defined(HAVE_AVX2)
@ -401,7 +400,12 @@ DEFINE_REC_METHODS(avx2);
/*
 * Answers whether the AVX2 raidz code will work on this system.
 * While the "#if 1" below is in place this always returns B_FALSE; the
 * zfs_avx_available() / zfs_avx2_available() probe is compiled out.
 */
static boolean_t static boolean_t
raidz_will_avx2_work(void) raidz_will_avx2_work(void)
{ {
/* ABD Bringup -- vector code not ready, so report it as unusable */
#if 1
return (B_FALSE);
#else
return (zfs_avx_available() && zfs_avx2_available()); return (zfs_avx_available() && zfs_avx2_available());
#endif
} }
const raidz_impl_ops_t vdev_raidz_avx2_impl = { const raidz_impl_ops_t vdev_raidz_avx2_impl = {

View File

@ -33,7 +33,8 @@
#endif #endif
/* Calculate data offset in raidz column, offset is in bytes */ /* Calculate data offset in raidz column, offset is in bytes */
#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_data) + (off))) /* ABD BRINGUP -- needs to be refactored for ABD */
#define COL_OFF(col, off) ((v_t *)(((char *)(col)->rc_abd) + (off)))
/* /*
* PARITY CALCULATION * PARITY CALCULATION
@ -83,6 +84,8 @@ raidz_generate_p_impl(raidz_map_t * const rm)
const size_t psize = raidz_big_size(rm); const size_t psize = raidz_big_size(rm);
const size_t short_size = raidz_short_size(rm); const size_t short_size = raidz_short_size(rm);
panic("not ABD ready");
raidz_math_begin(); raidz_math_begin();
/* short_size */ /* short_size */
@ -141,6 +144,8 @@ raidz_generate_pq_impl(raidz_map_t * const rm)
const size_t psize = raidz_big_size(rm); const size_t psize = raidz_big_size(rm);
const size_t short_size = raidz_short_size(rm); const size_t short_size = raidz_short_size(rm);
panic("not ABD ready");
raidz_math_begin(); raidz_math_begin();
/* short_size */ /* short_size */
@ -208,6 +213,8 @@ raidz_generate_pqr_impl(raidz_map_t * const rm)
const size_t psize = raidz_big_size(rm); const size_t psize = raidz_big_size(rm);
const size_t short_size = raidz_short_size(rm); const size_t short_size = raidz_short_size(rm);
panic("not ABD ready");
raidz_math_begin(); raidz_math_begin();
/* short_size */ /* short_size */

View File

@ -24,7 +24,6 @@
*/ */
#include <sys/vdev_raidz_impl.h> #include <sys/vdev_raidz_impl.h>
/* /*
* Provide native CPU scalar routines. * Provide native CPU scalar routines.
* Support 32bit and 64bit CPUs. * Support 32bit and 64bit CPUs.

View File

@ -403,8 +403,13 @@ DEFINE_REC_METHODS(ssse3);
/*
 * Answers whether the SSSE3 raidz code will work on this system.
 * While the "#if 1" below is in place this always returns B_FALSE; the
 * zfs_sse_available() / zfs_sse2_available() / zfs_ssse3_available() probe
 * is compiled out.
 */
static boolean_t static boolean_t
raidz_will_ssse3_work(void) raidz_will_ssse3_work(void)
{ {
/* ABD Bringup -- vector code not ready, so report it as unusable */
#if 1
return (B_FALSE);
#else
return (zfs_sse_available() && zfs_sse2_available() && return (zfs_sse_available() && zfs_sse2_available() &&
zfs_ssse3_available()); zfs_ssse3_available());
#endif
} }
const raidz_impl_ops_t vdev_raidz_ssse3_impl = { const raidz_impl_ops_t vdev_raidz_ssse3_impl = {

View File

@ -40,6 +40,7 @@
#include <sys/dsl_pool.h> #include <sys/dsl_pool.h>
#include <sys/metaslab.h> #include <sys/metaslab.h>
#include <sys/trace_zil.h> #include <sys/trace_zil.h>
#include <sys/abd.h>
/* /*
* The zfs intent log (ZIL) saves transaction records of system calls * The zfs intent log (ZIL) saves transaction records of system calls
@ -878,6 +879,7 @@ zil_lwb_write_done(zio_t *zio)
* one in zil_commit_writer(). zil_sync() will only remove * one in zil_commit_writer(). zil_sync() will only remove
* the lwb if lwb_buf is null. * the lwb if lwb_buf is null.
*/ */
abd_put(zio->io_abd);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lock);
lwb->lwb_zio = NULL; lwb->lwb_zio = NULL;
@ -914,12 +916,14 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lock);
if (lwb->lwb_zio == NULL) { if (lwb->lwb_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
if (!lwb->lwb_fastwrite) { if (!lwb->lwb_fastwrite) {
metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
lwb->lwb_fastwrite = 1; lwb->lwb_fastwrite = 1;
} }
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_FASTWRITE, &zb); ZIO_FLAG_FASTWRITE, &zb);

View File

@ -42,6 +42,7 @@
#include <sys/metaslab_impl.h> #include <sys/metaslab_impl.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/trace_zio.h> #include <sys/trace_zio.h>
#include <sys/abd.h>
/* /*
* ========================================================================== * ==========================================================================
@ -67,6 +68,11 @@ kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache; kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
/*
 * ZFS_DEBUG builds outside the kernel only: per-size-class counters,
 * indexed the same way as zio_buf_cache[].  zio_buf_alloc() and
 * zio_buf_free() each bump their counter, and zio_fini() prints every class
 * whose two counts disagree.
 */
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#endif
int zio_delay_max = ZIO_DELAY_MAX; int zio_delay_max = ZIO_DELAY_MAX;
#define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_CONTINUE 0x100
@ -211,6 +217,13 @@ zio_fini(void)
*/ */
if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize)
break; break;
#endif
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c])
(void) printf("zio_fini: [%d] %llu != %llu\n",
(int)((c + 1) << SPA_MINBLOCKSHIFT),
(long long unsigned)zio_buf_cache_allocs[c],
(long long unsigned)zio_buf_cache_frees[c]);
#endif #endif
if (zio_buf_cache[c] != last_cache) { if (zio_buf_cache[c] != last_cache) {
last_cache = zio_buf_cache[c]; last_cache = zio_buf_cache[c];
@ -251,6 +264,9 @@ zio_buf_alloc(size_t size)
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
atomic_add_64(&zio_buf_cache_allocs[c], 1);
#endif
return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
} }
@ -271,26 +287,15 @@ zio_data_buf_alloc(size_t size)
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
} }
/*
* Use zio_buf_alloc_flags when specific allocation flags are needed. e.g.
* passing KM_NOSLEEP when it is acceptable for an allocation to fail.
*/
void *
zio_buf_alloc_flags(size_t size, int flags)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
return (kmem_cache_alloc(zio_buf_cache[c], flags));
}
/*
 * Hand `buf` back to the zio_buf_cache[] entry for its size class.
 * The class index is (size - 1) >> SPA_MINBLOCKSHIFT and is verified to be
 * inside the cache table first.  ZFS_DEBUG builds outside the kernel also
 * bump the per-class free counter.
 */
void void
zio_buf_free(void *buf, size_t size) zio_buf_free(void *buf, size_t size)
{ {
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
atomic_add_64(&zio_buf_cache_frees[c], 1);
#endif
kmem_cache_free(zio_buf_cache[c], buf); kmem_cache_free(zio_buf_cache[c], buf);
} }
@ -311,12 +316,18 @@ zio_data_buf_free(void *buf, size_t size)
* ========================================================================== * ==========================================================================
*/ */
void void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform) zio_transform_func_t *transform)
{ {
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
zt->zt_orig_data = zio->io_data; /*
* Ensure that anyone expecting this zio to contain a linear ABD isn't
* going to get a nasty surprise when they try to access the data.
*/
IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
zt->zt_orig_abd = zio->io_abd;
zt->zt_orig_size = zio->io_size; zt->zt_orig_size = zio->io_size;
zt->zt_bufsize = bufsize; zt->zt_bufsize = bufsize;
zt->zt_transform = transform; zt->zt_transform = transform;
@ -324,7 +335,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zt->zt_next = zio->io_transform_stack; zt->zt_next = zio->io_transform_stack;
zio->io_transform_stack = zt; zio->io_transform_stack = zt;
zio->io_data = data; zio->io_abd = data;
zio->io_size = size; zio->io_size = size;
} }
@ -336,12 +347,12 @@ zio_pop_transforms(zio_t *zio)
while ((zt = zio->io_transform_stack) != NULL) { while ((zt = zio->io_transform_stack) != NULL) {
if (zt->zt_transform != NULL) if (zt->zt_transform != NULL)
zt->zt_transform(zio, zt->zt_transform(zio,
zt->zt_orig_data, zt->zt_orig_size); zt->zt_orig_abd, zt->zt_orig_size);
if (zt->zt_bufsize != 0) if (zt->zt_bufsize != 0)
zio_buf_free(zio->io_data, zt->zt_bufsize); abd_free(zio->io_abd);
zio->io_data = zt->zt_orig_data; zio->io_abd = zt->zt_orig_abd;
zio->io_size = zt->zt_orig_size; zio->io_size = zt->zt_orig_size;
zio->io_transform_stack = zt->zt_next; zio->io_transform_stack = zt->zt_next;
@ -355,21 +366,26 @@ zio_pop_transforms(zio_t *zio)
* ========================================================================== * ==========================================================================
*/ */
/*
 * If the zio is a read, copy the first `size` bytes of its current buffer
 * (io_data in the old text, io_abd in the new) into `data`; for any other
 * I/O type nothing is copied.  Asserts that the zio's current size is
 * strictly larger than `size`.
 * NOTE(review): the signature matches the callbacks invoked by
 * zio_pop_transforms() -- confirm that is how it is installed.
 */
static void static void
zio_subblock(zio_t *zio, void *data, uint64_t size) zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{ {
ASSERT(zio->io_size > size); ASSERT(zio->io_size > size);
if (zio->io_type == ZIO_TYPE_READ) if (zio->io_type == ZIO_TYPE_READ)
bcopy(zio->io_data, data, size); abd_copy(data, zio->io_abd, size);
} }
/*
 * If the zio has no error so far, decompress its current buffer
 * (zio->io_size bytes) into `data` (`size` bytes) using the compression
 * type recorded in the block pointer.  A non-zero result from
 * zio_decompress_data() sets io_error to EIO.
 * In the ABD text the destination is borrowed as a flat buffer with
 * abd_borrow_buf() and handed back with abd_return_buf_copy() before the
 * result is checked.
 */
static void static void
zio_decompress(zio_t *zio, void *data, uint64_t size) zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{ {
if (zio->io_error == 0 && if (zio->io_error == 0) {
zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), void *tmp = abd_borrow_buf(data, size);
zio->io_data, data, zio->io_size, size) != 0) int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
zio->io_abd, tmp, zio->io_size, size);
abd_return_buf_copy(data, tmp, size);
if (ret != 0)
zio->io_error = SET_ERROR(EIO); zio->io_error = SET_ERROR(EIO);
}
} }
/* /*
@ -552,7 +568,7 @@ zio_timestamp_compare(const void *x1, const void *x2)
*/ */
static zio_t * static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
void *private, zio_type_t type, zio_priority_t priority, void *private, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, vdev_t *vd, uint64_t offset, enum zio_flag flags, vdev_t *vd, uint64_t offset,
const zbookmark_phys_t *zb, enum zio_stage stage, const zbookmark_phys_t *zb, enum zio_stage stage,
@ -611,7 +627,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_priority = priority; zio->io_priority = priority;
zio->io_vd = vd; zio->io_vd = vd;
zio->io_offset = offset; zio->io_offset = offset;
zio->io_orig_data = zio->io_data = data; zio->io_orig_abd = zio->io_abd = data;
zio->io_orig_size = zio->io_size = psize; zio->io_orig_size = zio->io_size = psize;
zio->io_lsize = lsize; zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_flags = zio->io_flags = flags;
@ -755,7 +771,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
zio_t * zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private, abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{ {
zio_t *zio; zio_t *zio;
@ -773,7 +789,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t * zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done, zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags, void *private, zio_priority_t priority, enum zio_flag flags,
@ -814,7 +830,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
} }
zio_t * zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
uint64_t size, zio_done_func_t *done, void *private, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{ {
@ -967,7 +983,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_t * zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private, abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels) zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{ {
zio_t *zio; zio_t *zio;
@ -988,7 +1004,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t * zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private, abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels) zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{ {
zio_t *zio; zio_t *zio;
@ -1011,8 +1027,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
* Therefore, we must make a local copy in case the data is * Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel. * being written to multiple places in parallel.
*/ */
void *wbuf = zio_buf_alloc(size); abd_t *wbuf = abd_alloc_sametype(data, size);
bcopy(data, wbuf, size); abd_copy(wbuf, data, size);
zio_push_transform(zio, wbuf, size, size, NULL); zio_push_transform(zio, wbuf, size, size, NULL);
} }
@ -1024,7 +1041,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/ */
zio_t * zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
void *data, uint64_t size, int type, zio_priority_t priority, abd_t *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private) enum zio_flag flags, zio_done_func_t *done, void *private)
{ {
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
@ -1090,7 +1107,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
} }
zio_t * zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
int type, zio_priority_t priority, enum zio_flag flags, int type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private) zio_done_func_t *done, void *private)
{ {
@ -1151,14 +1168,17 @@ zio_read_bp_init(zio_t *zio)
!(zio->io_flags & ZIO_FLAG_RAW)) { !(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize = uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize); zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
psize, psize, zio_decompress);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
} }
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
int psize = BPE_GET_PSIZE(bp);
void *data = abd_borrow_buf(zio->io_abd, psize);
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
decode_embedded_bp_compressed(bp, zio->io_data); decode_embedded_bp_compressed(bp, data);
abd_return_buf_copy(zio->io_abd, data, psize);
} else { } else {
ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(!BP_IS_EMBEDDED(bp));
} }
@ -1299,7 +1319,7 @@ zio_write_compress(zio_t *zio)
/* If it's a compressed write that is not raw, compress the buffer. */ /* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && psize == lsize) { if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize); void *cbuf = zio_buf_alloc(lsize);
psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) { if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF; compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize); zio_buf_free(cbuf, lsize);
@ -1337,9 +1357,11 @@ zio_write_compress(zio_t *zio)
zio_buf_free(cbuf, lsize); zio_buf_free(cbuf, lsize);
psize = lsize; psize = lsize;
} else { } else {
bzero((char *)cbuf + psize, rounded - psize); abd_t *cdata = abd_get_from_buf(cbuf, lsize);
abd_take_ownership_of_buf(cdata, B_TRUE);
abd_zero_off(cdata, psize, rounded - psize);
psize = rounded; psize = rounded;
zio_push_transform(zio, cbuf, zio_push_transform(zio, cdata,
psize, lsize, NULL); psize, lsize, NULL);
} }
} }
@ -1942,26 +1964,38 @@ zio_resume_wait(spa_t *spa)
* ========================================================================== * ==========================================================================
*/ */
/*
 * Done callback for the child I/Os created by the gang issue functions.
 * Those children are handed an ABD obtained from abd_get_offset() or
 * abd_get_from_buf(); this callback releases that ABD with abd_put() once
 * the child completes.
 */
static void
zio_gang_issue_func_done(zio_t *zio)
{
	abd_t *child_abd = zio->io_abd;

	abd_put(child_abd);
}
/*
 * Gang issue function for reads.  (Each line shows the pre-change text
 * followed by the post-change text.)
 *
 * When 'gn' is non-NULL nothing is issued and 'pio' itself is returned.
 * Otherwise a child read of BP_GET_PSIZE(bp) bytes is created under 'pio'.
 * In the post-change version the destination is the ABD returned by
 * abd_get_offset(data, offset), and zio_gang_issue_func_done is installed
 * as the done callback so that ABD is abd_put() when the child finishes.
 */
static zio_t * static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{ {
if (gn != NULL) if (gn != NULL)
return (pio); return (pio);
return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), BP_GET_PSIZE(bp), zio_gang_issue_func_done,
NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark)); &pio->io_bookmark));
} }
zio_t * static zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{ {
zio_t *zio; zio_t *zio;
if (gn != NULL) { if (gn != NULL) {
abd_t *gbh_abd =
abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/* /*
* As we rewrite each gang header, the pipeline will compute * As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will * a new gang block header checksum for it; but no one will
@ -1972,8 +2006,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
* this is just good hygiene.) * this is just good hygiene.)
*/ */
if (gn != pio->io_gang_leader->io_gang_tree) { if (gn != pio->io_gang_leader->io_gang_tree) {
abd_t *buf = abd_get_offset(data, offset);
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
data, BP_GET_PSIZE(bp)); buf, BP_GET_PSIZE(bp));
abd_put(buf);
} }
/* /*
* If we are here to damage data for testing purposes, * If we are here to damage data for testing purposes,
@ -1983,7 +2021,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else { } else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, abd_get_offset(data, offset), BP_GET_PSIZE(bp),
zio_gang_issue_func_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
} }
@ -1991,16 +2030,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
} }
/*
 * Gang issue function for frees.  (Each line shows the pre-change text
 * followed by the post-change text.)
 *
 * Returns the zio produced by zio_free_sync() for 'bp' in pio's txg, using
 * the gang-child flags derived from 'pio'.  'gn', 'data' and (post-change)
 * 'offset' are not referenced; they exist so the signature matches the
 * other gang issue functions.  The post-change column also makes the
 * function static.
 */
/* ARGSUSED */ /* ARGSUSED */
zio_t * static zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{ {
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
ZIO_GANG_CHILD_FLAGS(pio))); ZIO_GANG_CHILD_FLAGS(pio)));
} }
/* ARGSUSED */ /* ARGSUSED */
zio_t * static zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
uint64_t offset)
{ {
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
@ -2064,13 +2105,14 @@ static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{ {
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
ASSERT(gio->io_gang_leader == gio); ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp)); ASSERT(BP_IS_GANG(bp));
zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, zio_gang_tree_assemble_done, gn, gio->io_priority,
gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
} }
static void static void
@ -2087,13 +2129,16 @@ zio_gang_tree_assemble_done(zio_t *zio)
if (zio->io_error) if (zio->io_error)
return; return;
/* this ABD was created from a linear buf in zio_gang_tree_assemble */
if (BP_SHOULD_BYTESWAP(bp)) if (BP_SHOULD_BYTESWAP(bp))
byteswap_uint64_array(zio->io_data, zio->io_size); byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT(zio->io_data == gn->gn_gbh); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
abd_put(zio->io_abd);
for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp)) if (!BP_IS_GANG(gbp))
@ -2103,7 +2148,8 @@ zio_gang_tree_assemble_done(zio_t *zio)
} }
static void static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
uint64_t offset)
{ {
zio_t *gio = pio->io_gang_leader; zio_t *gio = pio->io_gang_leader;
zio_t *zio; zio_t *zio;
@ -2117,7 +2163,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
* If you're a gang header, your data is in gn->gn_gbh. * If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL. * If you're a gang member, your data is in 'data' and gn == NULL.
*/ */
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) { if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
@ -2126,13 +2172,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp)) if (BP_IS_HOLE(gbp))
continue; continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
data = (char *)data + BP_GET_PSIZE(gbp); offset);
offset += BP_GET_PSIZE(gbp);
} }
} }
if (gn == gio->io_gang_tree) if (gn == gio->io_gang_tree)
ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); ASSERT3U(gio->io_size, ==, offset);
if (zio != pio) if (zio != pio)
zio_nowait(zio); zio_nowait(zio);
@ -2165,7 +2212,8 @@ zio_gang_issue(zio_t *zio)
ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
if (zio->io_child_error[ZIO_CHILD_GANG] == 0) if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
0);
else else
zio_gang_tree_free(&zio->io_gang_tree); zio_gang_tree_free(&zio->io_gang_tree);
@ -2205,6 +2253,12 @@ zio_write_gang_member_ready(zio_t *zio)
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
} }
/*
 * Done callback for the gang header rewrite and the gang member writes
 * issued by zio_write_gang_block().  Each of those I/Os is given an ABD
 * created with abd_get_from_buf() or abd_get_offset(); release it with
 * abd_put() when the I/O completes.
 */
static void
zio_write_gang_done(zio_t *zio)
{
	abd_t *gang_abd = zio->io_abd;

	abd_put(gang_abd);
}
static int static int
zio_write_gang_block(zio_t *pio) zio_write_gang_block(zio_t *pio)
{ {
@ -2215,6 +2269,7 @@ zio_write_gang_block(zio_t *pio)
zio_t *zio; zio_t *zio;
zio_gang_node_t *gn, **gnpp; zio_gang_node_t *gn, **gnpp;
zio_gbh_phys_t *gbh; zio_gbh_phys_t *gbh;
abd_t *gbh_abd;
uint64_t txg = pio->io_txg; uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size; uint64_t resid = pio->io_size;
uint64_t lsize; uint64_t lsize;
@ -2275,12 +2330,14 @@ zio_write_gang_block(zio_t *pio)
gn = zio_gang_node_alloc(gnpp); gn = zio_gang_node_alloc(gnpp);
gbh = gn->gn_gbh; gbh = gn->gn_gbh;
bzero(gbh, SPA_GANGBLOCKSIZE); bzero(gbh, SPA_GANGBLOCKSIZE);
gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
/* /*
* Create the gang header. * Create the gang header.
*/ */
zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
/* /*
* Create and nowait the gang children. * Create and nowait the gang children.
@ -2302,9 +2359,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_nopwrite = B_FALSE; zp.zp_nopwrite = B_FALSE;
cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
lsize, &zp, zio_write_gang_member_ready, NULL, NULL, NULL, lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
&gn->gn_child[g], pio->io_priority, zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
@ -2320,7 +2377,6 @@ zio_write_gang_block(zio_t *pio)
zp.zp_copies, cio, flags)); zp.zp_copies, cio, flags));
} }
zio_nowait(cio); zio_nowait(cio);
} }
/* /*
@ -2423,10 +2479,11 @@ zio_ddt_child_read_done(zio_t *zio)
ddp = ddt_phys_select(dde, bp); ddp = ddt_phys_select(dde, bp);
if (zio->io_error == 0) if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
if (zio->io_error == 0 && dde->dde_repair_data == NULL)
dde->dde_repair_data = zio->io_data; if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
dde->dde_repair_abd = zio->io_abd;
else else
zio_buf_free(zio->io_data, zio->io_size); abd_free(zio->io_abd);
mutex_exit(&pio->io_lock); mutex_exit(&pio->io_lock);
} }
@ -2459,16 +2516,16 @@ zio_ddt_read_start(zio_t *zio)
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk); &blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_nowait(zio_read(zio, zio->io_spa, &blk,
zio_buf_alloc(zio->io_size), zio->io_size, abd_alloc_for_io(zio->io_size, B_TRUE),
zio_ddt_child_read_done, dde, zio->io_priority, zio->io_size, zio_ddt_child_read_done, dde,
ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
&zio->io_bookmark)); ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
} }
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
zio_nowait(zio_read(zio, zio->io_spa, bp, zio_nowait(zio_read(zio, zio->io_spa, bp,
zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
@ -2498,8 +2555,9 @@ zio_ddt_read_done(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP); return (ZIO_PIPELINE_STOP);
} }
if (dde->dde_repair_data != NULL) { if (dde->dde_repair_abd != NULL) {
bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); abd_copy(zio->io_abd, dde->dde_repair_abd,
zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0; zio->io_child_error[ZIO_CHILD_DDT] = 0;
} }
ddt_repair_done(ddt, dde); ddt_repair_done(ddt, dde);
@ -2537,12 +2595,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (lio != NULL && do_raw) { if (lio != NULL && do_raw) {
return (lio->io_size != zio->io_size || return (lio->io_size != zio->io_size ||
bcmp(zio->io_data, lio->io_data, abd_cmp(zio->io_abd, lio->io_abd) != 0);
zio->io_size) != 0);
} else if (lio != NULL) { } else if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size || return (lio->io_orig_size != zio->io_orig_size ||
bcmp(zio->io_orig_data, lio->io_orig_data, abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
zio->io_orig_size) != 0);
} }
} }
@ -2552,7 +2608,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (ddp->ddp_phys_birth != 0 && do_raw) { if (ddp->ddp_phys_birth != 0 && do_raw) {
blkptr_t blk = *zio->io_bp; blkptr_t blk = *zio->io_bp;
uint64_t psize; uint64_t psize;
void *tmpbuf; abd_t *tmpabd;
int error; int error;
ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
@ -2563,19 +2619,19 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
ddt_exit(ddt); ddt_exit(ddt);
tmpbuf = zio_buf_alloc(psize); tmpabd = abd_alloc_for_io(psize, B_TRUE);
error = zio_wait(zio_read(NULL, spa, &blk, tmpbuf, error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_RAW, &zio->io_bookmark)); ZIO_FLAG_RAW, &zio->io_bookmark));
if (error == 0) { if (error == 0) {
if (bcmp(tmpbuf, zio->io_data, psize) != 0) if (abd_cmp(tmpabd, zio->io_abd) != 0)
error = SET_ERROR(ENOENT); error = SET_ERROR(ENOENT);
} }
zio_buf_free(tmpbuf, psize); abd_free(tmpabd);
ddt_enter(ddt); ddt_enter(ddt);
return (error != 0); return (error != 0);
} else if (ddp->ddp_phys_birth != 0) { } else if (ddp->ddp_phys_birth != 0) {
@ -2597,7 +2653,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
&aflags, &zio->io_bookmark); &aflags, &zio->io_bookmark);
if (error == 0) { if (error == 0) {
if (bcmp(abuf->b_data, zio->io_orig_data, if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
zio->io_orig_size) != 0) zio->io_orig_size) != 0)
error = SET_ERROR(ENOENT); error = SET_ERROR(ENOENT);
arc_buf_destroy(abuf, &abuf); arc_buf_destroy(abuf, &abuf);
@ -2762,12 +2818,12 @@ zio_ddt_write(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
} }
@ -2784,13 +2840,13 @@ zio_ddt_write(zio_t *zio)
ddt_phys_fill(ddp, bp); ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp); ddt_phys_addref(ddp);
} else { } else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp, zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority, zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[p] = cio; dde->dde_lead_zio[p] = cio;
} }
@ -3130,11 +3186,11 @@ zio_vdev_io_start(zio_t *zio)
P2PHASE(zio->io_size, align) != 0) { P2PHASE(zio->io_size, align) != 0) {
/* Transform logical writes to be a full physical block size. */ /* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align); uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = zio_buf_alloc(asize); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
ASSERT(vd == vd->vdev_top); ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_type == ZIO_TYPE_WRITE) {
bcopy(zio->io_data, abuf, zio->io_size); abd_copy(abuf, zio->io_abd, zio->io_size);
bzero(abuf + zio->io_size, asize - zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
} }
zio_push_transform(zio, abuf, asize, asize, zio_subblock); zio_push_transform(zio, abuf, asize, asize, zio_subblock);
} }
@ -3264,7 +3320,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{ {
void *buf = zio_buf_alloc(zio->io_size); void *buf = zio_buf_alloc(zio->io_size);
bcopy(zio->io_data, buf, zio->io_size); abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbinfo = zio->io_size;
zcr->zcr_cbdata = buf; zcr->zcr_cbdata = buf;
@ -3398,7 +3454,7 @@ zio_checksum_generate(zio_t *zio)
} }
} }
zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
return (ZIO_PIPELINE_CONTINUE); return (ZIO_PIPELINE_CONTINUE);
} }
@ -3537,7 +3593,7 @@ zio_ready(zio_t *zio)
if (BP_IS_GANG(bp)) { if (BP_IS_GANG(bp)) {
zio->io_flags &= ~ZIO_FLAG_NODATA; zio->io_flags &= ~ZIO_FLAG_NODATA;
} else { } else {
ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} }
} }
@ -3616,6 +3672,7 @@ zio_done(zio_t *zio)
* Always attempt to keep stack usage minimal here since * Always attempt to keep stack usage minimal here since
* we can be called recursively up to 19 levels deep. * we can be called recursively up to 19 levels deep.
*/ */
uint64_t psize = zio->io_size;
zio_t *pio, *pio_next; zio_t *pio, *pio_next;
int c, w; int c, w;
zio_link_t *zl = NULL; zio_link_t *zl = NULL;
@ -3696,28 +3753,35 @@ zio_done(zio_t *zio)
while (zio->io_cksum_report != NULL) { while (zio->io_cksum_report != NULL) {
zio_cksum_report_t *zcr = zio->io_cksum_report; zio_cksum_report_t *zcr = zio->io_cksum_report;
uint64_t align = zcr->zcr_align; uint64_t align = zcr->zcr_align;
uint64_t asize = P2ROUNDUP(zio->io_size, align); uint64_t asize = P2ROUNDUP(psize, align);
char *abuf = zio->io_data; char *abuf = NULL;
abd_t *adata = zio->io_abd;
if (asize != zio->io_size) { if (asize != psize) {
abuf = zio_buf_alloc(asize); adata = abd_alloc_linear(asize, B_TRUE);
bcopy(zio->io_data, abuf, zio->io_size); abd_copy(adata, zio->io_abd, psize);
bzero(abuf+zio->io_size, asize-zio->io_size); abd_zero_off(adata, psize, asize - psize);
} }
if (adata != NULL)
abuf = abd_borrow_buf_copy(adata, asize);
zio->io_cksum_report = zcr->zcr_next; zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_next = NULL; zcr->zcr_next = NULL;
zcr->zcr_finish(zcr, abuf); zcr->zcr_finish(zcr, abuf);
zfs_ereport_free_checksum(zcr); zfs_ereport_free_checksum(zcr);
if (asize != zio->io_size) if (adata != NULL)
zio_buf_free(abuf, asize); abd_return_buf(adata, abuf, asize);
if (asize != psize)
abd_free(adata);
} }
} }
zio_pop_transforms(zio); /* note: may set zio->io_error */ zio_pop_transforms(zio); /* note: may set zio->io_error */
vdev_stat_update(zio, zio->io_size); vdev_stat_update(zio, psize);
/* /*
* If this I/O is attached to a particular vdev and is slow, exceeding * If this I/O is attached to a particular vdev and is slow, exceeding
@ -4098,7 +4162,6 @@ zbookmark_subtree_completed(const dnode_phys_t *dnp,
EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_type_name);
EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_buf_alloc);
EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc);
EXPORT_SYMBOL(zio_buf_alloc_flags);
EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_buf_free);
EXPORT_SYMBOL(zio_data_buf_free); EXPORT_SYMBOL(zio_data_buf_free);

View File

@ -20,8 +20,8 @@
*/ */
/* /*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -30,6 +30,7 @@
#include <sys/zio.h> #include <sys/zio.h>
#include <sys/zio_checksum.h> #include <sys/zio_checksum.h>
#include <sys/zil.h> #include <sys/zil.h>
#include <sys/abd.h>
#include <zfs_fletcher.h> #include <zfs_fletcher.h>
/* /*
@ -92,45 +93,85 @@
/*
 * Checksum function used by the "off" and "noparity" table entries.  (Each
 * line shows the pre-change text followed by the post-change text; the
 * change renames zio_checksum_off to abd_checksum_off and makes the first
 * parameter an abd_t * instead of a raw buffer.)
 *
 * The input, its size and the context template are all ignored; the
 * output checksum *zcp is simply set to four zero words.
 */
/*ARGSUSED*/ /*ARGSUSED*/
static void static void
zio_checksum_off(const void *buf, uint64_t size, abd_checksum_off(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp) const void *ctx_template, zio_cksum_t *zcp)
{ {
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
} }
/*
 * ABD front end for the "native" fletcher-2 checksum.
 *
 * Initialises *zcp with fletcher_init() and then lets abd_iterate_func()
 * drive fletcher_2_incremental_native() over bytes [0, size) of 'abd',
 * accumulating into *zcp.  'ctx_template' is not used.
 */
/*ARGSUSED*/
void
abd_fletcher_2_native(abd_t *abd, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	zio_cksum_t *cksum = zcp;

	fletcher_init(cksum);

	/* The iteration status is intentionally discarded. */
	(void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_native,
	    cksum);
}
/*
 * ABD front end for the "byteswap" fletcher-2 checksum.
 *
 * Initialises *zcp with fletcher_init() and then lets abd_iterate_func()
 * drive fletcher_2_incremental_byteswap() over bytes [0, size) of 'abd',
 * accumulating into *zcp.  'ctx_template' is not used.
 */
/*ARGSUSED*/
void
abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	zio_cksum_t *cksum = zcp;

	fletcher_init(cksum);

	/* The iteration status is intentionally discarded. */
	(void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_byteswap,
	    cksum);
}
/*
 * ABD front end for the "native" fletcher-4 checksum.
 *
 * Initialises *zcp with fletcher_init() and then lets abd_iterate_func()
 * drive fletcher_4_incremental_native() over bytes [0, size) of 'abd',
 * accumulating into *zcp.  'ctx_template' is not used.
 */
/*ARGSUSED*/
void
abd_fletcher_4_native(abd_t *abd, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	zio_cksum_t *cksum = zcp;

	fletcher_init(cksum);

	/* The iteration status is intentionally discarded. */
	(void) abd_iterate_func(abd, 0, size, fletcher_4_incremental_native,
	    cksum);
}
/*
 * ABD front end for the "byteswap" fletcher-4 checksum.
 *
 * Initialises *zcp with fletcher_init() and then lets abd_iterate_func()
 * drive fletcher_4_incremental_byteswap() over bytes [0, size) of 'abd',
 * accumulating into *zcp.  'ctx_template' is not used.
 */
/*ARGSUSED*/
void
abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	zio_cksum_t *cksum = zcp;

	fletcher_init(cksum);

	/* The iteration status is intentionally discarded. */
	(void) abd_iterate_func(abd, 0, size, fletcher_4_incremental_byteswap,
	    cksum);
}
/*
 * Table of checksum algorithms, looked up elsewhere in this file as
 * zio_checksum_table[checksum].  (Each line shows the pre-change text
 * followed by the post-change text.)
 *
 * Every entry holds, in order: a pair of checksum functions (the callers
 * visible here invoke element 0 when computing a checksum), two optional
 * hooks that only the skein and edonr entries set (presumably template
 * init/free, going by their names -- confirm against zio_checksum_info_t),
 * a mask of ZCHECKSUM_FLAG_* bits, and the algorithm's name string.
 *
 * The post-change column swaps every function for its abd_* counterpart,
 * which takes an abd_t * rather than a raw buffer; the flags and names are
 * unchanged.
 */
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "inherit"},
{{NULL, NULL}, NULL, NULL, 0, "on"}, {{NULL, NULL}, NULL, NULL, 0, "on"},
{{zio_checksum_off, zio_checksum_off}, {{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "off"}, NULL, NULL, 0, "off"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"label"}, "label"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"gang_header"}, "gang_header"},
{{fletcher_2_native, fletcher_2_byteswap}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
{{fletcher_2_native, fletcher_2_byteswap}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, 0, "fletcher2"}, NULL, NULL, 0, "fletcher2"},
{{fletcher_4_native, fletcher_4_byteswap}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
{{zio_checksum_SHA256, zio_checksum_SHA256}, {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
{{fletcher_4_native, fletcher_4_byteswap}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
{{zio_checksum_off, zio_checksum_off}, {{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "noparity"}, NULL, NULL, 0, "noparity"},
{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
{{zio_checksum_skein_native, zio_checksum_skein_byteswap}, {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
}; };
@ -251,7 +292,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
*/ */
void void
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size) abd_t *abd, uint64_t size)
{ {
blkptr_t *bp = zio->io_bp; blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset; uint64_t offset = zio->io_offset;
@ -266,6 +307,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck; zio_eck_t *eck;
void *data = abd_to_buf(abd);
if (checksum == ZIO_CHECKSUM_ZILOG2) { if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data; zil_chain_t *zilc = data;
@ -283,18 +325,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
else else
bp->blk_cksum = eck->zec_cksum; bp->blk_cksum = eck->zec_cksum;
eck->zec_magic = ZEC_MAGIC; eck->zec_magic = ZEC_MAGIC;
ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&cksum); &cksum);
eck->zec_cksum = cksum; eck->zec_cksum = cksum;
} else { } else {
ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&bp->blk_cksum); &bp->blk_cksum);
} }
} }
int int
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{ {
zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_checksum_info_t *ci = &zio_checksum_table[checksum];
int byteswap; int byteswap;
@ -308,25 +350,32 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck; zio_eck_t *eck;
zio_cksum_t verifier; zio_cksum_t verifier;
size_t eck_offset;
uint64_t data_size = size;
void *data = abd_borrow_buf_copy(abd, data_size);
if (checksum == ZIO_CHECKSUM_ZILOG2) { if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data; zil_chain_t *zilc = data;
uint64_t nused; uint64_t nused;
eck = &zilc->zc_eck; eck = &zilc->zc_eck;
if (eck->zec_magic == ZEC_MAGIC) if (eck->zec_magic == ZEC_MAGIC) {
nused = zilc->zc_nused; nused = zilc->zc_nused;
else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
nused = BSWAP_64(zilc->zc_nused); nused = BSWAP_64(zilc->zc_nused);
else } else {
abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM)); return (SET_ERROR(ECKSUM));
}
if (nused > size) if (nused > data_size) {
abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM)); return (SET_ERROR(ECKSUM));
}
size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
} else { } else {
eck = (zio_eck_t *)((char *)data + size) - 1; eck = (zio_eck_t *)((char *)data + data_size) - 1;
} }
if (checksum == ZIO_CHECKSUM_GANG_HEADER) if (checksum == ZIO_CHECKSUM_GANG_HEADER)
@ -341,11 +390,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (byteswap) if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
expected_cksum = eck->zec_cksum; expected_cksum = eck->zec_cksum;
eck->zec_cksum = verifier; eck->zec_cksum = verifier;
ci->ci_func[byteswap](data, size, abd_return_buf_copy(abd, data, data_size);
ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum); spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum; abd_copy_from_buf_off(abd, &expected_cksum,
eck_offset, sizeof (zio_cksum_t));
if (byteswap) { if (byteswap) {
byteswap_uint64_array(&expected_cksum, byteswap_uint64_array(&expected_cksum,
@ -354,7 +407,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
} else { } else {
byteswap = BP_SHOULD_BYTESWAP(bp); byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum; expected_cksum = bp->blk_cksum;
ci->ci_func[byteswap](data, size, ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum); spa->spa_cksum_tmpls[checksum], &actual_cksum);
} }
@ -383,7 +436,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint64_t size = (bp == NULL ? zio->io_size : uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset; uint64_t offset = zio->io_offset;
void *data = zio->io_data; abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa; spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size, error = zio_checksum_error_impl(spa, bp, checksum, data, size,

View File

@ -28,7 +28,7 @@
*/ */
/* /*
* Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/ */
#include <sys/zfs_context.h> #include <sys/zfs_context.h>
@ -41,24 +41,23 @@
/* /*
* Compression vectors. * Compression vectors.
*/ */
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
{NULL, NULL, 0, "inherit"}, {"inherit", 0, NULL, NULL},
{NULL, NULL, 0, "on"}, {"on", 0, NULL, NULL},
{NULL, NULL, 0, "uncompressed"}, {"uncompressed", 0, NULL, NULL},
{lzjb_compress, lzjb_decompress, 0, "lzjb"}, {"lzjb", 0, lzjb_compress, lzjb_decompress},
{NULL, NULL, 0, "empty"}, {"empty", 0, NULL, NULL},
{gzip_compress, gzip_decompress, 1, "gzip-1"}, {"gzip-1", 1, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 2, "gzip-2"}, {"gzip-2", 2, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 3, "gzip-3"}, {"gzip-3", 3, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 4, "gzip-4"}, {"gzip-4", 4, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 5, "gzip-5"}, {"gzip-5", 5, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 6, "gzip-6"}, {"gzip-6", 6, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 7, "gzip-7"}, {"gzip-7", 7, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 8, "gzip-8"}, {"gzip-8", 8, gzip_compress, gzip_decompress},
{gzip_compress, gzip_decompress, 9, "gzip-9"}, {"gzip-9", 9, gzip_compress, gzip_decompress},
{zle_compress, zle_decompress, 64, "zle"}, {"zle", 64, zle_compress, zle_decompress},
{lz4_compress_zfs, lz4_decompress_zfs, 0, "lz4"}, {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs}
}; };
enum zio_compress enum zio_compress
@ -85,12 +84,26 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
return (result); return (result);
} }
/*
 * Callback handed to abd_iterate_func() by zio_compress_data() to find
 * out whether a chunk of data consists solely of zero bytes.
 *
 *	data	start of the chunk to scan
 *	len	number of bytes in the chunk
 *	private	unused
 *
 * Returns 0 when all len bytes are zero and 1 as soon as a non-zero
 * value is found.  zio_compress_data() treats an overall result of 0
 * from abd_iterate_func() as "the whole buffer is zero".
 *
 * The bulk of the chunk is compared one uint64_t at a time; any bytes
 * left over when len is not a multiple of sizeof (uint64_t) are checked
 * individually so that nothing beyond data + len is ever read.
 */
/*ARGSUSED*/
static int
zio_compress_zeroed_cb(void *data, size_t len, void *private)
{
	uint64_t *word = data;
	uint64_t *word_end = word + (len / sizeof (uint64_t));
	char *byte;
	char *byte_end = (char *)data + len;

	for (; word < word_end; word++)
		if (*word != 0)
			return (1);

	/* Trailing bytes that do not fill a whole word. */
	for (byte = (char *)word_end; byte < byte_end; byte++)
		if (*byte != 0)
			return (1);

	return (0);
}
size_t
zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
{ {
uint64_t *word, *word_end;
size_t c_len, d_len; size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c]; zio_compress_info_t *ci = &zio_compress_table[c];
void *tmp;
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
@ -99,12 +112,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
* If the data is all zeroes, we don't even need to allocate * If the data is all zeroes, we don't even need to allocate
* a block for it. We indicate this by returning zero size. * a block for it. We indicate this by returning zero size.
*/ */
word_end = (uint64_t *)((char *)src + s_len); if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0)
for (word = src; word < word_end; word++)
if (*word != 0)
break;
if (word == word_end)
return (0); return (0);
if (c == ZIO_COMPRESS_EMPTY) if (c == ZIO_COMPRESS_EMPTY)
@ -112,7 +120,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
/* Compress at least 12.5% */ /* Compress at least 12.5% */
d_len = s_len - (s_len >> 3); d_len = s_len - (s_len >> 3);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
/* No compression algorithms can read from ABDs directly */
tmp = abd_borrow_buf_copy(src, s_len);
c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
abd_return_buf(src, tmp, s_len);
if (c_len > d_len) if (c_len > d_len)
return (s_len); return (s_len);
@ -122,13 +134,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
} }
int int
zio_decompress_data(enum zio_compress c, void *src, void *dst, zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len) size_t s_len, size_t d_len)
{ {
zio_compress_info_t *ci = &zio_compress_table[c]; zio_compress_info_t *ci = &zio_compress_table[c];
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL)); return (SET_ERROR(EINVAL));
return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
} }
/*
 * ABD-aware wrapper around zio_decompress_data_buf().  A plain buffer
 * for src is obtained with abd_borrow_buf_copy(), decompressed into dst,
 * and then given back with abd_return_buf().
 *
 * Returns the value produced by zio_decompress_data_buf() unchanged.
 */
int
zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
    size_t s_len, size_t d_len)
{
	void *linear;
	int error;

	linear = abd_borrow_buf_copy(src, s_len);
	error = zio_decompress_data_buf(c, linear, dst, s_len, d_len);
	abd_return_buf(src, linear, s_len);

	return (error);
}