2014-10-22 00:59:33 +00:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2020-04-10 17:33:35 +00:00
|
|
|
* Copyright (c) 2013, Delphix. All rights reserved.
|
|
|
|
* Copyright (c) 2013, Saso Kiselkov. All rights reserved.
|
|
|
|
* Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
|
|
|
|
* Copyright (c) 2020, George Amanakis. All rights reserved.
|
2014-10-22 00:59:33 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _SYS_ARC_IMPL_H
|
|
|
|
#define _SYS_ARC_IMPL_H
|
|
|
|
|
|
|
|
#include <sys/arc.h>
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
#include <sys/zio_crypt.h>
|
2019-10-18 17:23:19 +00:00
|
|
|
#include <sys/zthr.h>
|
|
|
|
#include <sys/aggsum.h>
|
2021-06-17 00:19:34 +00:00
|
|
|
#include <sys/wmsum.h>
|
2014-10-22 00:59:33 +00:00
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that buffers can be in one of 6 states:
|
|
|
|
* ARC_anon - anonymous (discussed below)
|
|
|
|
* ARC_mru - recently used, currently cached
|
2019-08-30 16:53:15 +00:00
|
|
|
* ARC_mru_ghost - recently used, no longer in cache
|
2014-10-22 00:59:33 +00:00
|
|
|
* ARC_mfu - frequently used, currently cached
|
|
|
|
* ARC_mfu_ghost - frequently used, no longer in cache
|
|
|
|
* ARC_l2c_only - exists in L2ARC but not other states
|
|
|
|
* When there are no active references to the buffer, they are
|
|
|
|
* linked onto a list in one of these arc states. These are
|
|
|
|
* the only buffers that can be evicted or deleted. Within each
|
|
|
|
* state there are multiple lists, one for meta-data and one for
|
|
|
|
* non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
|
|
|
|
* etc.) is tracked separately so that it can be managed more
|
|
|
|
* explicitly: favored over data, limited explicitly.
|
|
|
|
*
|
|
|
|
* Anonymous buffers are buffers that are not associated with
|
|
|
|
* a DVA. These are buffers that hold dirty block copies
|
|
|
|
* before they are written to stable storage. By definition,
|
|
|
|
* they are "ref'd" and are considered part of arc_mru
|
2017-01-03 17:31:18 +00:00
|
|
|
* that cannot be freed. Generally, they will acquire a DVA
|
2014-10-22 00:59:33 +00:00
|
|
|
* as they are written and migrate onto the arc_mru list.
|
|
|
|
*
|
|
|
|
* The ARC_l2c_only state is for buffers that are in the second
|
|
|
|
* level ARC but no longer in any of the ARC_m* lists. The second
|
|
|
|
* level ARC itself may also contain buffers that are in any of
|
|
|
|
* the ARC_m* states - meaning that a buffer can exist in two
|
|
|
|
* places. The reason for the ARC_l2c_only state is to keep the
|
|
|
|
* buffer header in the hash table, so that reads that hit the
|
|
|
|
* second level ARC benefit from these fast lookups.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Bookkeeping for a single ARC state: the lists of evictable buffers
 * plus the size accounting for the data held in that state.
 */
typedef struct arc_state {
|
2015-01-13 03:52:19 +00:00
|
|
|
/*
|
|
|
|
* lists of evictable buffers; the array has ARC_BUFC_NUMTYPES entries
|
|
|
|
*/
|
2021-06-10 16:42:31 +00:00
|
|
|
multilist_t arcs_list[ARC_BUFC_NUMTYPES];
|
|
|
|
/*
|
|
|
|
* supports the "dbufs" kstat
|
|
|
|
*/
|
|
|
|
arc_state_type_t arcs_state;
|
2015-01-13 03:52:19 +00:00
|
|
|
/*
|
|
|
|
* total amount of evictable data in this state; the array has
* ARC_BUFC_NUMTYPES entries, the same bound as arcs_list
|
|
|
|
*/
|
2021-06-10 16:42:31 +00:00
|
|
|
zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned;
|
2015-01-13 03:52:19 +00:00
|
|
|
/*
|
|
|
|
* total amount of data in this state; this includes: evictable,
|
|
|
|
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
|
|
|
|
*/
|
2018-09-26 17:29:26 +00:00
|
|
|
zfs_refcount_t arcs_size;
|
2014-10-22 00:59:33 +00:00
|
|
|
} arc_state_t;
|
|
|
|
|
|
|
|
typedef struct arc_callback arc_callback_t;
|
|
|
|
|
|
|
|
struct arc_callback {
|
|
|
|
void *acb_private;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
arc_read_done_func_t *acb_done;
|
2014-10-22 00:59:33 +00:00
|
|
|
arc_buf_t *acb_buf;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
boolean_t acb_encrypted;
|
2016-07-11 17:45:52 +00:00
|
|
|
boolean_t acb_compressed;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
boolean_t acb_noauth;
|
2020-12-13 00:00:00 +00:00
|
|
|
boolean_t acb_nobuf;
|
2018-05-02 22:36:20 +00:00
|
|
|
zbookmark_phys_t acb_zb;
|
2014-10-22 00:59:33 +00:00
|
|
|
zio_t *acb_zio_dummy;
|
2017-12-21 17:13:06 +00:00
|
|
|
zio_t *acb_zio_head;
|
2014-10-22 00:59:33 +00:00
|
|
|
arc_callback_t *acb_next;
|
|
|
|
};
|
|
|
|
|
|
|
|
typedef struct arc_write_callback arc_write_callback_t;
|
|
|
|
|
|
|
|
struct arc_write_callback {
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
void *awcb_private;
|
|
|
|
arc_write_done_func_t *awcb_ready;
|
|
|
|
arc_write_done_func_t *awcb_children_ready;
|
|
|
|
arc_write_done_func_t *awcb_physdone;
|
|
|
|
arc_write_done_func_t *awcb_done;
|
|
|
|
arc_buf_t *awcb_buf;
|
2014-10-22 00:59:33 +00:00
|
|
|
};
|
|
|
|
|
2014-12-30 03:12:23 +00:00
|
|
|
/*
|
|
|
|
* ARC buffers are separated into multiple structs as a memory saving measure:
|
|
|
|
* - Common fields struct, always defined, and embedded within it:
|
|
|
|
* - L2-only fields, always allocated but undefined when not in L2ARC
|
|
|
|
* - L1-only fields, only allocated when in L1ARC
|
|
|
|
*
|
|
|
|
* Buffer in L1 Buffer only in L2
|
|
|
|
* +------------------------+ +------------------------+
|
|
|
|
* | arc_buf_hdr_t | | arc_buf_hdr_t |
|
|
|
|
* | | | |
|
|
|
|
* | | | |
|
|
|
|
* | | | |
|
|
|
|
* +------------------------+ +------------------------+
|
|
|
|
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
|
|
|
|
* | (undefined if L1-only) | | |
|
|
|
|
* +------------------------+ +------------------------+
|
|
|
|
* | l1arc_buf_hdr_t |
|
|
|
|
* | |
|
|
|
|
* | |
|
|
|
|
* | |
|
|
|
|
* | |
|
|
|
|
* +------------------------+
|
|
|
|
*
|
|
|
|
* Because it's possible for the L2ARC to become extremely large, we can wind
|
|
|
|
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
|
|
|
|
* is minimized by only allocating the fields necessary for an L1-cached buffer
|
|
|
|
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
|
|
|
|
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
|
|
|
|
* words in pointers. arc_hdr_realloc() is used to switch a header between
|
|
|
|
* these two allocation states.
|
|
|
|
*/
|
|
|
|
/*
 * L1-only portion of an ARC buffer header.  Per the layout description
 * preceding this struct, these fields are only allocated while the
 * header is in the L1 cache.
 */
typedef struct l1arc_buf_hdr {
|
2014-10-22 00:59:33 +00:00
|
|
|
kmutex_t b_freeze_lock;
|
2016-06-02 04:04:53 +00:00
|
|
|
zio_cksum_t *b_freeze_cksum;
|
2014-10-22 00:59:33 +00:00
|
|
|
|
|
|
|
arc_buf_t *b_buf;
|
2016-06-02 04:04:53 +00:00
|
|
|
/* NOTE(review): presumably the number of arc_buf_t's reachable from b_buf - confirm */
uint32_t b_bufcnt;
|
2014-12-30 03:12:23 +00:00
|
|
|
/* for waiting on writes to complete */
|
2014-10-22 00:59:33 +00:00
|
|
|
kcondvar_t b_cv;
|
2016-06-02 04:04:53 +00:00
|
|
|
uint8_t b_byteswap;
|
2014-10-22 00:59:33 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
/* protected by arc state mutex */
|
|
|
|
arc_state_t *b_state;
|
2015-01-13 03:52:19 +00:00
|
|
|
multilist_node_t b_arc_node;
|
2014-10-22 00:59:33 +00:00
|
|
|
|
|
|
|
/* updated atomically */
|
|
|
|
clock_t b_arc_access;
|
|
|
|
uint32_t b_mru_hits;
|
|
|
|
uint32_t b_mru_ghost_hits;
|
|
|
|
uint32_t b_mfu_hits;
|
|
|
|
uint32_t b_mfu_ghost_hits;
|
|
|
|
uint32_t b_l2_hits;
|
|
|
|
|
|
|
|
/* self protecting */
|
2018-09-26 17:29:26 +00:00
|
|
|
zfs_refcount_t b_refcnt;
|
2014-10-22 00:59:33 +00:00
|
|
|
|
2014-12-30 03:12:23 +00:00
|
|
|
/* NOTE(review): presumably the head of a chain linked through acb_next - confirm */
arc_callback_t *b_acb;
|
2016-07-22 15:52:49 +00:00
|
|
|
abd_t *b_pabd;
|
2014-12-30 03:12:23 +00:00
|
|
|
} l1arc_buf_hdr_t;
|
2014-10-22 00:59:33 +00:00
|
|
|
|
2020-04-10 17:33:35 +00:00
|
|
|
/*
 * Flag bits for the persistent L2ARC device header; the dh_flags field
 * of l2arc_dev_hdr_phys_t is annotated as holding this type.
 */
typedef enum l2arc_dev_hdr_flags_t {
|
|
|
|
L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
|
|
|
|
} l2arc_dev_hdr_flags_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pointer used in persistent L2ARC (for pointing to log blocks).
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_log_blkptr {
|
|
|
|
/*
|
|
|
|
* Offset of log block within the device, in bytes
|
|
|
|
*/
|
|
|
|
uint64_t lbp_daddr;
|
|
|
|
/*
|
|
|
|
* Aligned payload size (in bytes) of the log block
|
|
|
|
*/
|
|
|
|
uint64_t lbp_payload_asize;
|
|
|
|
/*
|
|
|
|
* Offset in bytes of the first buffer in the payload
|
|
|
|
*/
|
|
|
|
uint64_t lbp_payload_start;
|
|
|
|
/*
|
|
|
|
* lbp_prop has the following format:
|
|
|
|
* * logical size (in bytes)
|
2020-05-07 23:34:03 +00:00
|
|
|
* * aligned (after compression) size (in bytes)
|
2020-04-10 17:33:35 +00:00
|
|
|
* * compression algorithm (we always LZ4-compress l2arc logs)
|
|
|
|
* * checksum algorithm (used for lbp_cksum)
* The individual fields are read and written with the
* L2BLK_GET_* and L2BLK_SET_* macros defined later in this file.
|
|
|
|
*/
|
|
|
|
uint64_t lbp_prop;
|
|
|
|
zio_cksum_t lbp_cksum; /* checksum of log, using the algorithm recorded in lbp_prop */
|
|
|
|
} l2arc_log_blkptr_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The persistent L2ARC device header.
|
|
|
|
* Byte order of magic determines whether 64-bit bswap of fields is necessary.
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_dev_hdr_phys {
|
|
|
|
uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
|
|
|
|
uint64_t dh_version; /* Persistent L2ARC version */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Global L2ARC device state and metadata.
|
|
|
|
*/
|
|
|
|
uint64_t dh_spa_guid; /* NOTE(review): presumably the GUID of the owning pool - confirm */
|
|
|
|
uint64_t dh_vdev_guid; /* NOTE(review): presumably the GUID of the cache vdev - confirm */
|
2020-05-07 23:34:03 +00:00
|
|
|
uint64_t dh_log_entries; /* mirror of l2ad_log_entries */
|
2020-04-10 17:33:35 +00:00
|
|
|
uint64_t dh_evict; /* evicted offset in bytes */
|
|
|
|
uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
|
|
|
|
/*
|
|
|
|
* Used in zdb.c for determining if a log block is valid, in the same
|
|
|
|
* way that l2arc_rebuild() does.
|
|
|
|
*/
|
2020-05-07 23:34:03 +00:00
|
|
|
uint64_t dh_start; /* mirror of l2ad_start */
|
|
|
|
uint64_t dh_end; /* mirror of l2ad_end */
|
2020-04-10 17:33:35 +00:00
|
|
|
/*
|
|
|
|
* Start of log block chain. [0] -> newest log, [1] -> one older (used
|
|
|
|
* for initiating prefetch).
|
|
|
|
*/
|
|
|
|
l2arc_log_blkptr_t dh_start_lbps[2];
|
2020-05-07 23:34:03 +00:00
|
|
|
/*
|
|
|
|
* Aligned size of all log blocks as accounted by vdev_space_update().
|
|
|
|
*/
|
|
|
|
uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
|
|
|
|
uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
|
2020-06-09 17:15:08 +00:00
|
|
|
/*
|
|
|
|
* Mirrors of vdev_trim_action_time and vdev_trim_state, used to
|
|
|
|
* display when the cache device was fully trimmed for the last
|
|
|
|
* time.
|
|
|
|
*/
|
|
|
|
uint64_t dh_trim_action_time;
|
|
|
|
uint64_t dh_trim_state;
|
|
|
|
const uint64_t dh_pad[30]; /* pad to 512 bytes; the size is checked against SPA_MINBLOCKSIZE by the compile-time assertion that follows this struct */
|
2020-04-10 17:33:35 +00:00
|
|
|
zio_eck_t dh_tail; /* NOTE(review): presumably the embedded checksum trailer for this header - confirm */
|
|
|
|
} l2arc_dev_hdr_phys_t;
|
|
|
|
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A single ARC buffer header entry in a l2arc_log_blk_phys_t.
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_log_ent_phys {
|
|
|
|
dva_t le_dva; /* dva of buffer */
|
|
|
|
uint64_t le_birth; /* birth txg of buffer */
|
|
|
|
/*
|
|
|
|
* le_prop has the following format:
|
|
|
|
* * logical size (in bytes)
|
|
|
|
* * physical (compressed) size (in bytes)
|
|
|
|
* * compression algorithm
|
|
|
|
* * object type (used to restore arc_buf_contents_t)
|
|
|
|
* * protected status (used for encryption)
|
|
|
|
* * prefetch status (used in l2arc_read_done())
|
|
|
|
*/
|
|
|
|
uint64_t le_prop;
|
|
|
|
uint64_t le_daddr; /* buf location on l2dev */
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm. Available compression
levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 17:10:17 +00:00
|
|
|
uint64_t le_complevel;
|
2020-04-10 17:33:35 +00:00
|
|
|
/*
|
|
|
|
* We pad the size of each entry to a power of 2 so that the size of
|
|
|
|
* l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
|
|
|
|
* because of the L2BLK_SET_*SIZE macros.
|
|
|
|
*/
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm. Available compression
levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 17:10:17 +00:00
|
|
|
const uint64_t le_pad[2]; /* pad to 64 bytes */
|
2020-04-10 17:33:35 +00:00
|
|
|
} l2arc_log_ent_phys_t;
|
|
|
|
|
|
|
|
#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A log block of up to 1022 ARC buffer log entries, chained into the
|
|
|
|
* persistent L2ARC metadata linked list. Byte order of magic determines
|
|
|
|
* whether 64-bit bswap of fields is necessary.
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_log_blk_phys {
|
|
|
|
uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
|
|
|
|
/*
|
|
|
|
* There are 2 chains (headed by dh_start_lbps[2]), and this field
|
|
|
|
* points back to the previous block in this chain. We alternate
|
|
|
|
* which chain we append to, so they are time-wise and offset-wise
|
|
|
|
* interleaved, but that is an optimization rather than for
|
|
|
|
* correctness.
|
|
|
|
*/
|
|
|
|
l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
|
|
|
|
/*
|
|
|
|
* Pad header section to 128 bytes
* (the header section is lb_magic + lb_prev_lbp + lb_pad)
|
|
|
|
*/
|
|
|
|
uint64_t lb_pad[7];
|
|
|
|
/* Payload: L2ARC_LOG_BLK_MAX_ENTRIES log entries */
|
|
|
|
l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
|
|
|
|
} l2arc_log_blk_phys_t; /* 64K total: 128-byte header + 1022 entries padded to 64 bytes each */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
|
|
|
|
* SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
|
|
|
|
*/
|
|
|
|
CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
|
|
|
|
1ULL << SPA_MINBLOCKSHIFT));
|
|
|
|
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
|
|
|
|
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These structures hold in-flight abd buffers for log blocks as they're being
|
|
|
|
* written to the L2ARC device.
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_lb_abd_buf {
|
|
|
|
abd_t *abd; /* in-flight abd buffer of a log block */
|
|
|
|
list_node_t node; /* NOTE(review): presumably linkage for the list these are kept on - confirm */
|
|
|
|
} l2arc_lb_abd_buf_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These structures hold pointers to log blocks present on the L2ARC device.
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_lb_ptr_buf {
|
|
|
|
l2arc_log_blkptr_t *lb_ptr; /* pointer to a log block present on the L2ARC device */
|
|
|
|
list_node_t node; /* NOTE(review): presumably linkage for the list these are kept on - confirm */
|
|
|
|
} l2arc_lb_ptr_buf_t;
|
|
|
|
|
|
|
|
/*
 * Macros for getting and setting fields in le_prop and lbp_prop.
 *
 * Offsets and widths as passed to the BF64_* helpers below (assuming
 * their arguments are value, low bit, width - confirm against the
 * BF64_* definitions):
 *   LSIZE      offset  0, SPA_LSIZEBITS wide
 *   PSIZE      offset 16, SPA_PSIZEBITS wide
 *   COMPRESS   offset 32, SPA_COMPRESSBITS wide
 *   PREFETCH   offset 39, 1 bit
 *   CHECKSUM   offset 40, 8 bits
 *   TYPE       offset 48, 8 bits
 *   PROTECTED  offset 56, 1 bit
 */
|
|
|
|
#define L2BLK_GET_LSIZE(field) \
|
|
|
|
BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
|
|
|
|
#define L2BLK_SET_LSIZE(field, x) \
|
|
|
|
BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
|
|
|
|
#define L2BLK_GET_PSIZE(field) \
|
|
|
|
BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
|
|
|
|
#define L2BLK_SET_PSIZE(field, x) \
|
|
|
|
BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
|
|
|
|
#define L2BLK_GET_COMPRESS(field) \
|
|
|
|
BF64_GET((field), 32, SPA_COMPRESSBITS)
|
|
|
|
#define L2BLK_SET_COMPRESS(field, x) \
|
|
|
|
BF64_SET((field), 32, SPA_COMPRESSBITS, x)
|
|
|
|
#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
|
|
|
|
#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
|
|
|
|
#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
|
|
|
|
#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
|
|
|
|
#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
|
|
|
|
#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
|
|
|
|
#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
|
|
|
|
#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
|
Add L2ARC arcstats for MFU/MRU buffers and buffer content type
Currently the ARC state (MFU/MRU) of cached L2ARC buffer and their
content type is unknown. Knowing this information may prove beneficial
in adjusting the L2ARC caching policy.
This commit adds L2ARC arcstats that display the aligned size
(in bytes) of L2ARC buffers according to their content type
(data/metadata) and according to their ARC state (MRU/MFU or
prefetch). It also expands the existing evict_l2_eligible arcstat to
differentiate between MFU and MRU buffers.
L2ARC caches buffers from the MRU and MFU lists of ARC. Upon caching a
buffer, its ARC state (MRU/MFU) is stored in the L2 header
(b_arcs_state). The l2_m{f,r}u_asize arcstats reflect the aligned size
(in bytes) of L2ARC buffers according to their ARC state (based on
b_arcs_state). We also account for the case where an L2ARC and ARC
cached MRU or MRU_ghost buffer transitions to MFU. The l2_prefetch_asize
reflects the aligned size (in bytes) of L2ARC buffers that were cached
while they had the prefetch flag set in ARC. This is dynamically updated
as the prefetch flag of L2ARC buffers changes.
When buffers are evicted from ARC, if they are determined to be L2ARC
eligible then their logical size is recorded in
evict_l2_eligible_m{r,f}u arcstats according to their ARC state upon
eviction.
Persistent L2ARC:
When committing an L2ARC buffer to a log block (L2ARC metadata) its
b_arcs_state and prefetch flag is also stored. If the buffer changes
its arcstate or prefetch flag this is reflected in the above arcstats.
However, the L2ARC metadata cannot currently be updated to reflect this
change.
Example: L2ARC caches an MRU buffer. L2ARC metadata and arcstats count
this as an MRU buffer. The buffer transitions to MFU. The arcstats are
updated to reflect this. Upon pool re-import or on/offlining the L2ARC
device the arcstats are cleared and the buffer will now be counted as an
MRU buffer, as the L2ARC metadata were not updated.
Bug fix:
- If l2arc_noprefetch is set, arc_read_done clears the L2CACHE flag of
an ARC buffer. However, prefetches may be issued in a way that
arc_read_done() is bypassed. Instead, move the related code in
l2arc_write_eligible() to account for those cases too.
Also add a test and update manpages for l2arc_mfuonly module parameter,
and update the manpages and code comments for l2arc_noprefetch.
Move persist_l2arc tests to l2arc.
Reviewed-by: Ryan Moeller <freqlabs@FreeBSD.org>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #10743
2020-09-14 17:10:44 +00:00
|
|
|
/*
 * STATE: 4-bit field at offset 57 (bits 57-60) of the property word.
 * NOTE(review): presumably holds the ARC state recorded for the buffer
 * (cf. b_arcs_state in l2arc_buf_hdr_t) -- confirm against the callers.
 */
#define	L2BLK_GET_STATE(field)	\
	BF64_GET((field), 57, 4)
#define	L2BLK_SET_STATE(field, x)	\
	BF64_SET((field), 57, 4, x)
|
2020-04-10 17:33:35 +00:00
|
|
|
|
|
|
|
/*
 * Swap the values of two pointer lvalues.
 *
 * x, y: modifiable pointer lvalues.  Each argument is expanded more than
 * once, so neither may have side effects.
 *
 * The arguments are parenthesized at every use, and the temporary carries
 * a macro-specific name so that a caller variable (for instance one called
 * "tmp") is not captured by the macro's own local; with a plain "tmp" the
 * expansion of PTR_SWAP(tmp, p) would initialize the local from itself and
 * leave both caller pointers unswapped.
 */
#define	PTR_SWAP(x, y)					\
	do {						\
		void *ptr_swap_tmp_ = (x);		\
		(x) = (y);				\
		(y) = ptr_swap_tmp_;			\
	} while (0)
|
|
|
|
|
|
|
|
#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
|
|
|
|
#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* L2ARC Internals
|
|
|
|
*/
|
|
|
|
typedef struct l2arc_dev {
|
|
|
|
vdev_t *l2ad_vdev; /* vdev */
|
|
|
|
spa_t *l2ad_spa; /* spa */
|
|
|
|
uint64_t l2ad_hand; /* next write location */
|
|
|
|
uint64_t l2ad_start; /* first addr on device */
|
|
|
|
uint64_t l2ad_end; /* last addr on device */
|
|
|
|
boolean_t l2ad_first; /* first sweep through */
|
|
|
|
boolean_t l2ad_writing; /* currently writing */
|
|
|
|
kmutex_t l2ad_mtx; /* lock for buffer list */
|
|
|
|
list_t l2ad_buflist; /* buffer list */
|
|
|
|
list_node_t l2ad_node; /* device list node */
|
|
|
|
zfs_refcount_t l2ad_alloc; /* allocated bytes */
|
|
|
|
/*
|
|
|
|
* Persistence-related stuff
|
|
|
|
*/
|
|
|
|
l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
|
|
|
|
uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
|
|
|
|
l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
|
|
|
|
int l2ad_log_ent_idx; /* index into cur log blk */
|
|
|
|
/* Number of bytes in current log block's payload */
|
|
|
|
uint64_t l2ad_log_blk_payload_asize;
|
|
|
|
/*
|
|
|
|
* Offset (in bytes) of the first buffer in current log block's
|
|
|
|
* payload.
|
|
|
|
*/
|
|
|
|
uint64_t l2ad_log_blk_payload_start;
|
|
|
|
/* Flag indicating whether a rebuild is scheduled or is going on */
|
|
|
|
boolean_t l2ad_rebuild;
|
|
|
|
boolean_t l2ad_rebuild_cancel;
|
|
|
|
boolean_t l2ad_rebuild_began;
|
|
|
|
uint64_t l2ad_log_entries; /* entries per log blk */
|
|
|
|
uint64_t l2ad_evict; /* evicted offset in bytes */
|
|
|
|
/* List of pointers to log blocks present in the L2ARC device */
|
|
|
|
list_t l2ad_lbptr_list;
|
2020-05-07 23:34:03 +00:00
|
|
|
/*
|
|
|
|
* Aligned size of all log blocks as accounted by vdev_space_update().
|
|
|
|
*/
|
|
|
|
zfs_refcount_t l2ad_lb_asize;
|
|
|
|
/*
|
|
|
|
* Number of log blocks present on the device.
|
|
|
|
*/
|
|
|
|
zfs_refcount_t l2ad_lb_count;
|
2020-06-09 17:15:08 +00:00
|
|
|
boolean_t l2ad_trim_all; /* TRIM whole device */
|
2020-04-10 17:33:35 +00:00
|
|
|
} l2arc_dev_t;
|
|
|
|
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
/*
|
|
|
|
* Encrypted blocks will need to be stored encrypted on the L2ARC
|
|
|
|
* disk as they appear in the main pool. In order for this to work we
|
|
|
|
* need to pass around the encryption parameters so they can be used
|
|
|
|
* to write data to the L2ARC. This struct is only defined in the
|
|
|
|
* arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
|
|
|
|
* flag set.
|
|
|
|
*/
|
|
|
|
typedef struct arc_buf_hdr_crypt {
|
|
|
|
abd_t *b_rabd; /* raw encrypted data */
|
|
|
|
dmu_object_type_t b_ot; /* object type */
|
|
|
|
uint32_t b_ebufcnt; /* count of encrypted buffers */
|
|
|
|
|
|
|
|
/* dsobj for looking up encryption key for l2arc encryption */
|
|
|
|
uint64_t b_dsobj;
|
|
|
|
|
|
|
|
/* encryption parameters */
|
|
|
|
uint8_t b_salt[ZIO_DATA_SALT_LEN];
|
|
|
|
uint8_t b_iv[ZIO_DATA_IV_LEN];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Technically this could be removed since we will always be able to
|
|
|
|
* get the mac from the bp when we need it. However, it is inconvenient
|
|
|
|
* for callers of arc code to have to pass a bp in all the time. This
|
|
|
|
* also allows us to assert that L2ARC data is properly encrypted to
|
|
|
|
* match the data in the main storage pool.
|
|
|
|
*/
|
|
|
|
uint8_t b_mac[ZIO_DATA_MAC_LEN];
|
|
|
|
} arc_buf_hdr_crypt_t;
|
|
|
|
|
2014-12-30 03:12:23 +00:00
|
|
|
typedef struct l2arc_buf_hdr {
|
|
|
|
/* protected by arc_buf_hdr mutex */
|
|
|
|
l2arc_dev_t *b_dev; /* L2ARC device */
|
|
|
|
uint64_t b_daddr; /* disk address, offset byte */
|
|
|
|
uint32_t b_hits;
|
Add L2ARC arcstats for MFU/MRU buffers and buffer content type
Currently the ARC state (MFU/MRU) of cached L2ARC buffer and their
content type is unknown. Knowing this information may prove beneficial
in adjusting the L2ARC caching policy.
This commit adds L2ARC arcstats that display the aligned size
(in bytes) of L2ARC buffers according to their content type
(data/metadata) and according to their ARC state (MRU/MFU or
prefetch). It also expands the existing evict_l2_eligible arcstat to
differentiate between MFU and MRU buffers.
L2ARC caches buffers from the MRU and MFU lists of ARC. Upon caching a
buffer, its ARC state (MRU/MFU) is stored in the L2 header
(b_arcs_state). The l2_m{f,r}u_asize arcstats reflect the aligned size
(in bytes) of L2ARC buffers according to their ARC state (based on
b_arcs_state). We also account for the case where an L2ARC and ARC
cached MRU or MRU_ghost buffer transitions to MFU. The l2_prefetch_asize
reflects the aligned size (in bytes) of L2ARC buffers that were cached
while they had the prefetch flag set in ARC. This is dynamically updated
as the prefetch flag of L2ARC buffers changes.
When buffers are evicted from ARC, if they are determined to be L2ARC
eligible then their logical size is recorded in
evict_l2_eligible_m{r,f}u arcstats according to their ARC state upon
eviction.
Persistent L2ARC:
When committing an L2ARC buffer to a log block (L2ARC metadata) its
b_arcs_state and prefetch flag is also stored. If the buffer changes
its arcstate or prefetch flag this is reflected in the above arcstats.
However, the L2ARC metadata cannot currently be updated to reflect this
change.
Example: L2ARC caches an MRU buffer. L2ARC metadata and arcstats count
this as an MRU buffer. The buffer transitions to MFU. The arcstats are
updated to reflect this. Upon pool re-import or on/offlining the L2ARC
device the arcstats are cleared and the buffer will now be counted as an
MRU buffer, as the L2ARC metadata were not updated.
Bug fix:
- If l2arc_noprefetch is set, arc_read_done clears the L2CACHE flag of
an ARC buffer. However, prefetches may be issued in a way that
arc_read_done() is bypassed. Instead, move the related code in
l2arc_write_eligible() to account for those cases too.
Also add a test and update manpages for l2arc_mfuonly module parameter,
and update the manpages and code comments for l2arc_noprefetch.
Move persist_l2arc tests to l2arc.
Reviewed-by: Ryan Moeller <freqlabs@FreeBSD.org>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #10743
2020-09-14 17:10:44 +00:00
|
|
|
arc_state_type_t b_arcs_state;
|
2014-12-30 03:12:23 +00:00
|
|
|
list_node_t b_l2node;
|
|
|
|
} l2arc_buf_hdr_t;
|
|
|
|
|
2014-12-13 02:07:39 +00:00
|
|
|
typedef struct l2arc_write_callback {
|
|
|
|
l2arc_dev_t *l2wcb_dev; /* device info */
|
|
|
|
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
|
2020-04-10 17:33:35 +00:00
|
|
|
/* in-flight list of log blocks */
|
|
|
|
list_t l2wcb_abd_list;
|
2014-12-13 02:07:39 +00:00
|
|
|
} l2arc_write_callback_t;
|
|
|
|
|
2014-12-30 03:12:23 +00:00
|
|
|
struct arc_buf_hdr {
|
|
|
|
/* protected by hash lock */
|
|
|
|
dva_t b_dva;
|
|
|
|
uint64_t b_birth;
|
|
|
|
|
2016-06-02 04:04:53 +00:00
|
|
|
arc_buf_contents_t b_type;
|
Add zstd support to zfs
This PR adds two new compression types, based on ZStandard:
- zstd: A basic ZStandard compression algorithm Available compression.
Levels for zstd are zstd-1 through zstd-19, where the compression
increases with every level, but speed decreases.
- zstd-fast: A faster version of the ZStandard compression algorithm
zstd-fast is basically a "negative" level of zstd. The compression
decreases with every level, but speed increases.
Available compression levels for zstd-fast:
- zstd-fast-1 through zstd-fast-10
- zstd-fast-20 through zstd-fast-100 (in increments of 10)
- zstd-fast-500 and zstd-fast-1000
For more information check the man page.
Implementation details:
Rather than treat each level of zstd as a different algorithm (as was
done historically with gzip), the block pointer `enum zio_compress`
value is simply zstd for all levels, including zstd-fast, since they all
use the same decompression function.
The compress= property (a 64bit unsigned integer) uses the lower 7 bits
to store the compression algorithm (matching the number of bits used in
a block pointer, as the 8th bit was borrowed for embedded block
pointers). The upper bits are used to store the compression level.
It is necessary to be able to determine what compression level was used
when later reading a block back, so the concept used in LZ4, where the
first 32bits of the on-disk value are the size of the compressed data
(since the allocation is rounded up to the nearest ashift), was
extended, and we store the version of ZSTD and the level as well as the
compressed size. This value is returned when decompressing a block, so
that if the block needs to be recompressed (L2ARC, nop-write, etc), that
the same parameters will be used to result in the matching checksum.
All of the internal ZFS code ( `arc_buf_hdr_t`, `objset_t`,
`zio_prop_t`, etc.) uses the separated _compress and _complevel
variables. Only the properties ZAP contains the combined/bit-shifted
value. The combined value is split when the compression_changed_cb()
callback is called, and sets both objset members (os_compress and
os_complevel).
The userspace tools all use the combined/bit-shifted value.
Additional notes:
zdb can now also decode the ZSTD compression header (flag -Z) and
inspect the size, version and compression level saved in that header.
For each record, if it is ZSTD compressed, the parameters of the decoded
compression header get printed.
ZSTD is included with all current tests and new tests are added
as-needed.
Per-dataset feature flags now get activated when the property is set.
If a compression algorithm requires a feature flag, zfs activates the
feature when the property is set, rather than waiting for the first
block to be born. This is currently only used by zstd but can be
extended as needed.
Portions-Sponsored-By: The FreeBSD Foundation
Co-authored-by: Allan Jude <allanjude@freebsd.org>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Co-authored-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Co-authored-by: Michael Niewöhner <foss@mniewoehner.de>
Signed-off-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Allan Jude <allanjude@freebsd.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl>
Signed-off-by: Michael Niewöhner <foss@mniewoehner.de>
Closes #6247
Closes #9024
Closes #10277
Closes #10278
2020-08-18 17:10:17 +00:00
|
|
|
uint8_t b_complevel;
|
|
|
|
uint8_t b_reserved1; /* used for 4 byte alignment */
|
|
|
|
uint16_t b_reserved2; /* used for 4 byte alignment */
|
2014-12-30 03:12:23 +00:00
|
|
|
arc_buf_hdr_t *b_hash_next;
|
|
|
|
arc_flags_t b_flags;
|
|
|
|
|
2016-06-02 04:04:53 +00:00
|
|
|
/*
|
|
|
|
* This field stores the size of the data buffer after
|
|
|
|
* compression, and is set in the arc's zio completion handlers.
|
|
|
|
* It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
|
|
|
|
*
|
|
|
|
* While the block pointers can store up to 32MB in their psize
|
|
|
|
* field, we can only store up to 32MB minus 512B. This is due
|
|
|
|
* to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
|
|
|
|
* a field of zeros represents 512B in the bp). We can't use a
|
|
|
|
* bias of 1 since we need to reserve a psize of zero, here, to
|
|
|
|
* represent holes and embedded blocks.
|
|
|
|
*
|
|
|
|
* This isn't a problem in practice, since the maximum size of a
|
|
|
|
* buffer is limited to 16MB, so we never need to store 32MB in
|
|
|
|
* this field. Even in the upstream illumos code base, the
|
|
|
|
* maximum size of a buffer is limited to 16MB.
|
|
|
|
*/
|
|
|
|
uint16_t b_psize;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This field stores the size of the data buffer before
|
|
|
|
* compression, and cannot change once set. It is in units
|
|
|
|
* of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
|
|
|
|
*/
|
|
|
|
uint16_t b_lsize; /* immutable */
|
|
|
|
uint64_t b_spa; /* immutable */
|
2014-12-30 03:12:23 +00:00
|
|
|
|
|
|
|
/* L2ARC fields. Undefined when not in L2ARC. */
|
|
|
|
l2arc_buf_hdr_t b_l2hdr;
|
|
|
|
/* L1ARC fields. Undefined when in l2arc_only state */
|
|
|
|
l1arc_buf_hdr_t b_l1hdr;
|
Native Encryption for ZFS on Linux
This change incorporates three major pieces:
The first change is a keystore that manages wrapping
and encryption keys for encrypted datasets. These
commands mostly involve manipulating the new
DSL Crypto Key ZAP Objects that live in the MOS. Each
encrypted dataset has its own DSL Crypto Key that is
protected with a user's key. This level of indirection
allows users to change their keys without re-encrypting
their entire datasets. The change implements the new
subcommands "zfs load-key", "zfs unload-key" and
"zfs change-key" which allow the user to manage their
encryption keys and settings. In addition, several new
flags and properties have been added to allow dataset
creation and to make mounting and unmounting more
convenient.
The second piece of this patch provides the ability to
encrypt, decrypt, and authenticate protected datasets.
Each object set maintains a Merkle tree of Message
Authentication Codes that protect the lower layers,
similarly to how checksums are maintained. This part
impacts the zio layer, which handles the actual
encryption and generation of MACs, as well as the ARC
and DMU, which need to be able to handle encrypted
buffers and protected data.
The last addition is the ability to do raw, encrypted
sends and receives. The idea here is to send raw
encrypted and compressed data and receive it exactly
as is on a backup system. This means that the dataset
on the receiving system is protected using the same
user key that is in use on the sending side. By doing
so, datasets can be efficiently backed up to an
untrusted system without fear of data being
compromised.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #494
Closes #5769
2017-08-14 17:36:48 +00:00
|
|
|
/*
|
|
|
|
* Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
|
|
|
|
* is set and the L1 header exists.
|
|
|
|
*/
|
|
|
|
arc_buf_hdr_crypt_t b_crypt_hdr;
|
2014-12-30 03:12:23 +00:00
|
|
|
};
|
2019-10-01 23:35:05 +00:00
|
|
|
|
|
|
|
typedef struct arc_stats {
|
|
|
|
kstat_named_t arcstat_hits;
|
|
|
|
kstat_named_t arcstat_misses;
|
|
|
|
kstat_named_t arcstat_demand_data_hits;
|
|
|
|
kstat_named_t arcstat_demand_data_misses;
|
|
|
|
kstat_named_t arcstat_demand_metadata_hits;
|
|
|
|
kstat_named_t arcstat_demand_metadata_misses;
|
|
|
|
kstat_named_t arcstat_prefetch_data_hits;
|
|
|
|
kstat_named_t arcstat_prefetch_data_misses;
|
|
|
|
kstat_named_t arcstat_prefetch_metadata_hits;
|
|
|
|
kstat_named_t arcstat_prefetch_metadata_misses;
|
|
|
|
kstat_named_t arcstat_mru_hits;
|
|
|
|
kstat_named_t arcstat_mru_ghost_hits;
|
|
|
|
kstat_named_t arcstat_mfu_hits;
|
|
|
|
kstat_named_t arcstat_mfu_ghost_hits;
|
|
|
|
kstat_named_t arcstat_deleted;
|
|
|
|
/*
|
|
|
|
* Number of buffers that could not be evicted because the hash lock
|
|
|
|
* was held by another thread. The lock may not necessarily be held
|
|
|
|
* by something using the same buffer, since hash locks are shared
|
|
|
|
* by multiple buffers.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mutex_miss;
|
|
|
|
/*
|
|
|
|
* Number of buffers skipped when updating the access state due to the
|
|
|
|
* header having already been released after acquiring the hash lock.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_access_skip;
|
|
|
|
/*
|
|
|
|
* Number of buffers skipped because they have I/O in progress, are
|
|
|
|
* indirect prefetch buffers that have not lived long enough, or are
|
|
|
|
* not from the spa we're trying to evict from.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_evict_skip;
|
|
|
|
/*
|
|
|
|
* Number of times arc_evict_state() was unable to evict enough
|
|
|
|
* buffers to reach its target amount.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_evict_not_enough;
|
|
|
|
kstat_named_t arcstat_evict_l2_cached;
|
|
|
|
kstat_named_t arcstat_evict_l2_eligible;
|
Add L2ARC arcstats for MFU/MRU buffers and buffer content type
Currently the ARC state (MFU/MRU) of cached L2ARC buffer and their
content type is unknown. Knowing this information may prove beneficial
in adjusting the L2ARC caching policy.
This commit adds L2ARC arcstats that display the aligned size
(in bytes) of L2ARC buffers according to their content type
(data/metadata) and according to their ARC state (MRU/MFU or
prefetch). It also expands the existing evict_l2_eligible arcstat to
differentiate between MFU and MRU buffers.
L2ARC caches buffers from the MRU and MFU lists of ARC. Upon caching a
buffer, its ARC state (MRU/MFU) is stored in the L2 header
(b_arcs_state). The l2_m{f,r}u_asize arcstats reflect the aligned size
(in bytes) of L2ARC buffers according to their ARC state (based on
b_arcs_state). We also account for the case where an L2ARC and ARC
cached MRU or MRU_ghost buffer transitions to MFU. The l2_prefetch_asize
reflects the aligned size (in bytes) of L2ARC buffers that were cached
while they had the prefetch flag set in ARC. This is dynamically updated
as the prefetch flag of L2ARC buffers changes.
When buffers are evicted from ARC, if they are determined to be L2ARC
eligible then their logical size is recorded in
evict_l2_eligible_m{r,f}u arcstats according to their ARC state upon
eviction.
Persistent L2ARC:
When committing an L2ARC buffer to a log block (L2ARC metadata) its
b_arcs_state and prefetch flag is also stored. If the buffer changes
its arcstate or prefetch flag this is reflected in the above arcstats.
However, the L2ARC metadata cannot currently be updated to reflect this
change.
Example: L2ARC caches an MRU buffer. L2ARC metadata and arcstats count
this as an MRU buffer. The buffer transitions to MFU. The arcstats are
updated to reflect this. Upon pool re-import or on/offlining the L2ARC
device the arcstats are cleared and the buffer will now be counted as an
MRU buffer, as the L2ARC metadata were not updated.
Bug fix:
- If l2arc_noprefetch is set, arc_read_done clears the L2CACHE flag of
an ARC buffer. However, prefetches may be issued in a way that
arc_read_done() is bypassed. Instead, move the related code in
l2arc_write_eligible() to account for those cases too.
Also add a test and update manpages for l2arc_mfuonly module parameter,
and update the manpages and code comments for l2arc_noprefetch.
Move persist_l2arc tests to l2arc.
Reviewed-by: Ryan Moeller <freqlabs@FreeBSD.org>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #10743
2020-09-14 17:10:44 +00:00
|
|
|
kstat_named_t arcstat_evict_l2_eligible_mfu;
|
|
|
|
kstat_named_t arcstat_evict_l2_eligible_mru;
|
2019-10-01 23:35:05 +00:00
|
|
|
kstat_named_t arcstat_evict_l2_ineligible;
|
|
|
|
kstat_named_t arcstat_evict_l2_skip;
|
|
|
|
kstat_named_t arcstat_hash_elements;
|
|
|
|
kstat_named_t arcstat_hash_elements_max;
|
|
|
|
kstat_named_t arcstat_hash_collisions;
|
|
|
|
kstat_named_t arcstat_hash_chains;
|
|
|
|
kstat_named_t arcstat_hash_chain_max;
|
|
|
|
kstat_named_t arcstat_p;
|
|
|
|
kstat_named_t arcstat_c;
|
|
|
|
kstat_named_t arcstat_c_min;
|
|
|
|
kstat_named_t arcstat_c_max;
|
|
|
|
kstat_named_t arcstat_size;
|
|
|
|
/*
|
|
|
|
* Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
|
|
|
|
* Note that the compressed bytes may match the uncompressed bytes
|
|
|
|
* if the block is either not compressed or compressed arc is disabled.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_compressed_size;
|
|
|
|
/*
|
|
|
|
* Uncompressed size of the data stored in b_pabd. If compressed
|
|
|
|
* arc is disabled then this value will be identical to the stat
|
|
|
|
* above.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_uncompressed_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes stored in all the arc_buf_t's. This is classified
|
|
|
|
* as "overhead" since this data is typically short-lived and will
|
|
|
|
* be evicted from the arc when it becomes unreferenced unless the
|
|
|
|
* zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
|
|
|
|
* values have been set (see comment in dbuf.c for more information).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_overhead_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by internal ARC structures necessary
|
|
|
|
* for tracking purposes; these structures are not actually
|
|
|
|
* backed by ARC buffers. This includes arc_buf_hdr_t structures
|
|
|
|
* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
|
|
|
|
* caches), and arc_buf_t structures (allocated via arc_buf_t
|
|
|
|
* cache).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_hdr_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers of type equal to
|
|
|
|
* ARC_BUFC_DATA. This is generally consumed by buffers backing
|
|
|
|
* on disk user data (e.g. plain file contents).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_data_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers of type equal to
|
|
|
|
* ARC_BUFC_METADATA. This is generally consumed by buffers
|
|
|
|
* backing on disk data that is used for internal ZFS
|
|
|
|
* structures (e.g. ZAP, dnode, indirect blocks, etc).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_metadata_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by dmu_buf_impl_t objects.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_dbuf_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by dnode_t objects.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_dnode_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by bonus buffers.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_bonus_size;
|
2020-08-20 17:55:02 +00:00
|
|
|
#if defined(COMPAT_FREEBSD11)
|
|
|
|
/*
|
|
|
|
* Sum of the previous three counters, provided for compatibility.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_other_size;
|
|
|
|
#endif
|
|
|
|
|
2019-10-01 23:35:05 +00:00
|
|
|
/*
|
|
|
|
* Total number of bytes consumed by ARC buffers residing in the
|
|
|
|
* arc_anon state. This includes *all* buffers in the arc_anon
|
|
|
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
|
|
|
* are all included in this value.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_anon_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
|
|
|
* residing in the arc_anon state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_anon_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
|
|
|
* residing in the arc_anon state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_anon_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes consumed by ARC buffers residing in the
|
|
|
|
* arc_mru state. This includes *all* buffers in the arc_mru
|
|
|
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
|
|
|
* are all included in this value.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_DATA,
|
|
|
|
* residing in the arc_mru state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that meet the
|
|
|
|
* following criteria: backing buffers of type ARC_BUFC_METADATA,
|
|
|
|
* residing in the arc_mru state, and are eligible for eviction
|
|
|
|
* (e.g. have no outstanding holds on the buffer).
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers in the arc_mru_ghost state. The key thing to note
|
|
|
|
* here, is the fact that this size doesn't actually indicate
|
|
|
|
* RAM consumption. The ghost lists only consist of headers and
|
|
|
|
* don't actually have ARC buffers linked off of these headers.
|
|
|
|
* Thus, *if* the headers had associated ARC buffers, these
|
|
|
|
* buffers *would have* consumed this number of bytes.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_ghost_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_ghost_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mru_ghost_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes consumed by ARC buffers residing in the
|
|
|
|
* arc_mfu state. This includes *all* buffers in the arc_mfu
|
|
|
|
* state; e.g. data, metadata, evictable, and unevictable buffers
|
|
|
|
* are all included in this value.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that are eligible for
|
|
|
|
* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
|
|
|
|
* state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes consumed by ARC buffers that are eligible for
|
|
|
|
* eviction, of type ARC_BUFC_METADATA, and reside in the
|
|
|
|
* arc_mfu state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_evictable_metadata;
|
|
|
|
/*
|
|
|
|
* Total number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers in the arc_mfu_ghost state. See the comment above
|
|
|
|
* arcstat_mru_ghost_size for more details.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_ghost_size;
|
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_ghost_evictable_data;
|
|
|
|
/*
|
|
|
|
* Number of bytes that *would have been* consumed by ARC
|
|
|
|
* buffers that are eligible for eviction, of type
|
|
|
|
* ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
|
|
|
|
kstat_named_t arcstat_l2_hits;
|
|
|
|
kstat_named_t arcstat_l2_misses;
|
Add L2ARC arcstats for MFU/MRU buffers and buffer content type
Currently the ARC state (MFU/MRU) of cached L2ARC buffer and their
content type is unknown. Knowing this information may prove beneficial
in adjusting the L2ARC caching policy.
This commit adds L2ARC arcstats that display the aligned size
(in bytes) of L2ARC buffers according to their content type
(data/metadata) and according to their ARC state (MRU/MFU or
prefetch). It also expands the existing evict_l2_eligible arcstat to
differentiate between MFU and MRU buffers.
L2ARC caches buffers from the MRU and MFU lists of ARC. Upon caching a
buffer, its ARC state (MRU/MFU) is stored in the L2 header
(b_arcs_state). The l2_m{f,r}u_asize arcstats reflect the aligned size
(in bytes) of L2ARC buffers according to their ARC state (based on
b_arcs_state). We also account for the case where an L2ARC and ARC
cached MRU or MRU_ghost buffer transitions to MFU. The l2_prefetch_asize
reflects the aligned size (in bytes) of L2ARC buffers that were cached
while they had the prefetch flag set in ARC. This is dynamically updated
as the prefetch flag of L2ARC buffers changes.
When buffers are evicted from ARC, if they are determined to be L2ARC
eligible then their logical size is recorded in
evict_l2_eligible_m{r,f}u arcstats according to their ARC state upon
eviction.
Persistent L2ARC:
When committing an L2ARC buffer to a log block (L2ARC metadata) its
b_arcs_state and prefetch flag is also stored. If the buffer changes
its arcstate or prefetch flag this is reflected in the above arcstats.
However, the L2ARC metadata cannot currently be updated to reflect this
change.
Example: L2ARC caches an MRU buffer. L2ARC metadata and arcstats count
this as an MRU buffer. The buffer transitions to MFU. The arcstats are
updated to reflect this. Upon pool re-import or on/offlining the L2ARC
device the arcstats are cleared and the buffer will now be counted as an
MRU buffer, as the L2ARC metadata were not updated.
Bug fix:
- If l2arc_noprefetch is set, arc_read_done clears the L2CACHE flag of
an ARC buffer. However, prefetches may be issued in a way that
arc_read_done() is bypassed. Instead, move the related code in
l2arc_write_eligible() to account for those cases too.
Also add a test and update manpages for l2arc_mfuonly module parameter,
and update the manpages and code comments for l2arc_noprefetch.
Move persist_l2arc tests to l2arc.
Reviewed-by: Ryan Moeller <freqlabs@FreeBSD.org>
Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #10743
2020-09-14 17:10:44 +00:00
|
|
|
/*
|
|
|
|
* Allocated size (in bytes) of L2ARC cached buffers by ARC state.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_prefetch_asize;
|
|
|
|
kstat_named_t arcstat_l2_mru_asize;
|
|
|
|
kstat_named_t arcstat_l2_mfu_asize;
|
|
|
|
/*
|
|
|
|
* Allocated size (in bytes) of L2ARC cached buffers by buffer content
|
|
|
|
* type.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_bufc_data_asize;
|
|
|
|
kstat_named_t arcstat_l2_bufc_metadata_asize;
|
2019-10-01 23:35:05 +00:00
|
|
|
kstat_named_t arcstat_l2_feeds;
|
|
|
|
kstat_named_t arcstat_l2_rw_clash;
|
|
|
|
kstat_named_t arcstat_l2_read_bytes;
|
|
|
|
kstat_named_t arcstat_l2_write_bytes;
|
|
|
|
kstat_named_t arcstat_l2_writes_sent;
|
|
|
|
kstat_named_t arcstat_l2_writes_done;
|
|
|
|
kstat_named_t arcstat_l2_writes_error;
|
|
|
|
kstat_named_t arcstat_l2_writes_lock_retry;
|
|
|
|
kstat_named_t arcstat_l2_evict_lock_retry;
|
|
|
|
kstat_named_t arcstat_l2_evict_reading;
|
|
|
|
kstat_named_t arcstat_l2_evict_l1cached;
|
|
|
|
kstat_named_t arcstat_l2_free_on_write;
|
|
|
|
kstat_named_t arcstat_l2_abort_lowmem;
|
|
|
|
kstat_named_t arcstat_l2_cksum_bad;
|
|
|
|
kstat_named_t arcstat_l2_io_error;
|
|
|
|
kstat_named_t arcstat_l2_lsize;
|
|
|
|
kstat_named_t arcstat_l2_psize;
|
|
|
|
kstat_named_t arcstat_l2_hdr_size;
|
2020-04-10 17:33:35 +00:00
|
|
|
/*
|
|
|
|
* Number of L2ARC log blocks written. These are used for restoring the
|
|
|
|
* L2ARC. Updated during writing of L2ARC log blocks.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_log_blk_writes;
|
|
|
|
/*
|
2020-05-07 23:34:03 +00:00
|
|
|
* Moving average of the aligned size of the L2ARC log blocks, in
|
2020-04-10 17:33:35 +00:00
|
|
|
* bytes. Updated during L2ARC rebuild and during writing of L2ARC
|
|
|
|
* log blocks.
|
|
|
|
*/
|
2020-05-07 23:34:03 +00:00
|
|
|
kstat_named_t arcstat_l2_log_blk_avg_asize;
|
|
|
|
/* Aligned size of L2ARC log blocks on L2ARC devices. */
|
|
|
|
kstat_named_t arcstat_l2_log_blk_asize;
|
|
|
|
/* Number of L2ARC log blocks present on L2ARC devices. */
|
|
|
|
kstat_named_t arcstat_l2_log_blk_count;
|
2020-04-10 17:33:35 +00:00
|
|
|
/*
|
2020-05-07 23:34:03 +00:00
|
|
|
* Moving average of the aligned size of L2ARC restored data, in bytes,
|
|
|
|
* to the aligned size of their metadata in L2ARC, in bytes.
|
2020-04-10 17:33:35 +00:00
|
|
|
* Updated during L2ARC rebuild and during writing of L2ARC log blocks.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_data_to_meta_ratio;
|
|
|
|
/*
|
|
|
|
* Number of times the L2ARC rebuild was successful for an L2ARC device.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_success;
|
|
|
|
/*
|
|
|
|
* Number of times the L2ARC rebuild failed because the device header
|
|
|
|
* was in an unsupported format or corrupted.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_abort_unsupported;
|
|
|
|
/*
|
|
|
|
* Number of times the L2ARC rebuild failed because of IO errors
|
|
|
|
* while reading a log block.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_abort_io_errors;
|
|
|
|
/*
|
|
|
|
* Number of times the L2ARC rebuild failed because of IO errors when
|
|
|
|
* reading the device header.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
|
|
|
|
/*
|
|
|
|
* Number of L2ARC log blocks which failed to be restored due to
|
|
|
|
* checksum errors.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
|
|
|
|
/*
|
|
|
|
* Number of times the L2ARC rebuild was aborted due to low system
|
|
|
|
* memory.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_abort_lowmem;
|
|
|
|
/* Logical size of L2ARC restored data, in bytes. */
|
|
|
|
kstat_named_t arcstat_l2_rebuild_size;
|
2020-05-07 23:34:03 +00:00
|
|
|
/* Aligned size of L2ARC restored data, in bytes. */
|
|
|
|
kstat_named_t arcstat_l2_rebuild_asize;
|
2020-04-10 17:33:35 +00:00
|
|
|
/*
|
|
|
|
* Number of L2ARC log entries (buffers) that were successfully
|
|
|
|
* restored in ARC.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_bufs;
|
|
|
|
/*
|
|
|
|
* Number of L2ARC log entries (buffers) already cached in ARC. These
|
|
|
|
* were not restored again.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_bufs_precached;
|
|
|
|
/*
|
|
|
|
* Number of L2ARC log blocks that were restored successfully. Each
|
|
|
|
* log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
|
|
|
|
*/
|
|
|
|
kstat_named_t arcstat_l2_rebuild_log_blks;
|
2019-10-01 23:35:05 +00:00
|
|
|
kstat_named_t arcstat_memory_throttle_count;
|
|
|
|
kstat_named_t arcstat_memory_direct_count;
|
|
|
|
kstat_named_t arcstat_memory_indirect_count;
|
|
|
|
kstat_named_t arcstat_memory_all_bytes;
|
|
|
|
kstat_named_t arcstat_memory_free_bytes;
|
|
|
|
kstat_named_t arcstat_memory_available_bytes;
|
|
|
|
kstat_named_t arcstat_no_grow;
|
|
|
|
kstat_named_t arcstat_tempreserve;
|
|
|
|
kstat_named_t arcstat_loaned_bytes;
|
|
|
|
kstat_named_t arcstat_prune;
|
|
|
|
kstat_named_t arcstat_meta_used;
|
|
|
|
kstat_named_t arcstat_meta_limit;
|
|
|
|
kstat_named_t arcstat_dnode_limit;
|
|
|
|
kstat_named_t arcstat_meta_max;
|
|
|
|
kstat_named_t arcstat_meta_min;
|
|
|
|
kstat_named_t arcstat_async_upgrade_sync;
|
|
|
|
kstat_named_t arcstat_demand_hit_predictive_prefetch;
|
|
|
|
kstat_named_t arcstat_demand_hit_prescient_prefetch;
|
|
|
|
kstat_named_t arcstat_need_free;
|
|
|
|
kstat_named_t arcstat_sys_free;
|
|
|
|
kstat_named_t arcstat_raw_size;
|
Improve zfs send performance by bypassing the ARC
When doing a zfs send on a dataset with small recordsize (e.g. 8K),
performance is dominated by the per-block overheads. This is especially
true with `zfs send --compressed`, which further reduces the amount of
data sent, for the same number of blocks. Several threads are involved,
but the limiting factor is the `send_prefetch` thread, which is 100% on
CPU.
The main job of the `send_prefetch` thread is to issue zio's for the
data that will be needed by the main thread. It does this by calling
`arc_read(ARC_FLAG_PREFETCH)`. This has an immediate cost of creating
an arc_hdr, which takes around 14% of one CPU. It also induces later
costs by other threads:
* Since the data was only prefetched, dmu_send()->dmu_dump_write() will
need to call arc_read() again to get the data. This will have to
look up the arc_hdr in the hash table and copy the data from the
scatter ABD in the arc_hdr to a linear ABD in arc_buf. This takes
27% of one CPU.
* dmu_dump_write() needs to arc_buf_destroy() This takes 11% of one
CPU.
* arc_adjust() will need to evict this arc_hdr, taking about 50% of one
CPU.
All of these costs can be avoided by bypassing the ARC if the data is
not already cached. This commit changes `zfs send` to check for the
data in the ARC, and if it is not found then we directly call
`zio_read()`, reading the data into a linear ABD which is used by
dmu_dump_write() directly.
The performance improvement is best expressed in terms of how many
blocks can be processed by `zfs send` in one second. This change
increases the metric by 50%, from ~100,000 to ~150,000. When the amount
of data per block is small (e.g. 2KB), there is a corresponding
reduction in the elapsed time of `zfs send >/dev/null` (from 86 minutes
to 58 minutes in this test case).
In addition to improving the performance of `zfs send`, this change
makes `zfs send` not pollute the ARC cache. In most cases the data will
not be reused, so this allows us to keep caching useful data in the MRU
(hit-once) part of the ARC.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10067
2020-03-10 17:51:04 +00:00
|
|
|
kstat_named_t arcstat_cached_only_in_progress;
|
Include scatter_chunk_waste in arc_size
The ARC caches data in scatter ABD's, which are collections of pages,
which are typically 4K. Therefore, the space used to cache each block
is rounded up to a multiple of 4K. The ABD subsystem tracks this wasted
memory in the `scatter_chunk_waste` kstat. However, the ARC's `size` is
not aware of the memory used by this round-up, it only accounts for the
size that it requested from the ABD subsystem.
Therefore, the ARC is effectively using more memory than it is aware of,
due to the `scatter_chunk_waste`. This impacts observability, e.g.
`arcstat` will show that the ARC is using less memory than it
effectively is. It also impacts how the ARC responds to memory
pressure. As the amount of `scatter_chunk_waste` changes, it appears to
the ARC as memory pressure, so it needs to resize `arc_c`.
If the sector size (`1<<ashift`) is the same as the page size (or
larger), there won't be any waste. If the (compressed) block size is
relatively large compared to the page size, the amount of
`scatter_chunk_waste` will be small, so the problematic effects are
minimal.
However, if using 512B sectors (`ashift=9`), and the (compressed) block
size is small (e.g. `compression=on` with the default `volblocksize=8k`
or a decreased `recordsize`), the amount of `scatter_chunk_waste` can be
very large. On a production system, with `arc_size` at a constant 50%
of memory, `scatter_chunk_waste` has been observed to be 10-30% of
memory.
This commit adds `scatter_chunk_waste` to `arc_size`, and adds a new
`waste` field to `arcstat`. As a result, the ARC's memory usage is more
observable, and `arc_c` does not need to be adjusted as frequently.
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10701
2020-08-18 03:04:04 +00:00
|
|
|
kstat_named_t arcstat_abd_chunk_waste_size;
|
2019-10-01 23:35:05 +00:00
|
|
|
} arc_stats_t;
|
|
|
|
|
2021-06-17 00:19:34 +00:00
|
|
|
typedef struct arc_sums {
|
|
|
|
wmsum_t arcstat_hits;
|
|
|
|
wmsum_t arcstat_misses;
|
|
|
|
wmsum_t arcstat_demand_data_hits;
|
|
|
|
wmsum_t arcstat_demand_data_misses;
|
|
|
|
wmsum_t arcstat_demand_metadata_hits;
|
|
|
|
wmsum_t arcstat_demand_metadata_misses;
|
|
|
|
wmsum_t arcstat_prefetch_data_hits;
|
|
|
|
wmsum_t arcstat_prefetch_data_misses;
|
|
|
|
wmsum_t arcstat_prefetch_metadata_hits;
|
|
|
|
wmsum_t arcstat_prefetch_metadata_misses;
|
|
|
|
wmsum_t arcstat_mru_hits;
|
|
|
|
wmsum_t arcstat_mru_ghost_hits;
|
|
|
|
wmsum_t arcstat_mfu_hits;
|
|
|
|
wmsum_t arcstat_mfu_ghost_hits;
|
|
|
|
wmsum_t arcstat_deleted;
|
|
|
|
wmsum_t arcstat_mutex_miss;
|
|
|
|
wmsum_t arcstat_access_skip;
|
|
|
|
wmsum_t arcstat_evict_skip;
|
|
|
|
wmsum_t arcstat_evict_not_enough;
|
|
|
|
wmsum_t arcstat_evict_l2_cached;
|
|
|
|
wmsum_t arcstat_evict_l2_eligible;
|
|
|
|
wmsum_t arcstat_evict_l2_eligible_mfu;
|
|
|
|
wmsum_t arcstat_evict_l2_eligible_mru;
|
|
|
|
wmsum_t arcstat_evict_l2_ineligible;
|
|
|
|
wmsum_t arcstat_evict_l2_skip;
|
|
|
|
wmsum_t arcstat_hash_collisions;
|
|
|
|
wmsum_t arcstat_hash_chains;
|
|
|
|
aggsum_t arcstat_size;
|
|
|
|
wmsum_t arcstat_compressed_size;
|
|
|
|
wmsum_t arcstat_uncompressed_size;
|
|
|
|
wmsum_t arcstat_overhead_size;
|
|
|
|
wmsum_t arcstat_hdr_size;
|
|
|
|
wmsum_t arcstat_data_size;
|
|
|
|
wmsum_t arcstat_metadata_size;
|
|
|
|
wmsum_t arcstat_dbuf_size;
|
|
|
|
aggsum_t arcstat_dnode_size;
|
|
|
|
wmsum_t arcstat_bonus_size;
|
|
|
|
wmsum_t arcstat_l2_hits;
|
|
|
|
wmsum_t arcstat_l2_misses;
|
|
|
|
wmsum_t arcstat_l2_prefetch_asize;
|
|
|
|
wmsum_t arcstat_l2_mru_asize;
|
|
|
|
wmsum_t arcstat_l2_mfu_asize;
|
|
|
|
wmsum_t arcstat_l2_bufc_data_asize;
|
|
|
|
wmsum_t arcstat_l2_bufc_metadata_asize;
|
|
|
|
wmsum_t arcstat_l2_feeds;
|
|
|
|
wmsum_t arcstat_l2_rw_clash;
|
|
|
|
wmsum_t arcstat_l2_read_bytes;
|
|
|
|
wmsum_t arcstat_l2_write_bytes;
|
|
|
|
wmsum_t arcstat_l2_writes_sent;
|
|
|
|
wmsum_t arcstat_l2_writes_done;
|
|
|
|
wmsum_t arcstat_l2_writes_error;
|
|
|
|
wmsum_t arcstat_l2_writes_lock_retry;
|
|
|
|
wmsum_t arcstat_l2_evict_lock_retry;
|
|
|
|
wmsum_t arcstat_l2_evict_reading;
|
|
|
|
wmsum_t arcstat_l2_evict_l1cached;
|
|
|
|
wmsum_t arcstat_l2_free_on_write;
|
|
|
|
wmsum_t arcstat_l2_abort_lowmem;
|
|
|
|
wmsum_t arcstat_l2_cksum_bad;
|
|
|
|
wmsum_t arcstat_l2_io_error;
|
|
|
|
wmsum_t arcstat_l2_lsize;
|
|
|
|
wmsum_t arcstat_l2_psize;
|
|
|
|
aggsum_t arcstat_l2_hdr_size;
|
|
|
|
wmsum_t arcstat_l2_log_blk_writes;
|
|
|
|
wmsum_t arcstat_l2_log_blk_asize;
|
|
|
|
wmsum_t arcstat_l2_log_blk_count;
|
|
|
|
wmsum_t arcstat_l2_rebuild_success;
|
|
|
|
wmsum_t arcstat_l2_rebuild_abort_unsupported;
|
|
|
|
wmsum_t arcstat_l2_rebuild_abort_io_errors;
|
|
|
|
wmsum_t arcstat_l2_rebuild_abort_dh_errors;
|
|
|
|
wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors;
|
|
|
|
wmsum_t arcstat_l2_rebuild_abort_lowmem;
|
|
|
|
wmsum_t arcstat_l2_rebuild_size;
|
|
|
|
wmsum_t arcstat_l2_rebuild_asize;
|
|
|
|
wmsum_t arcstat_l2_rebuild_bufs;
|
|
|
|
wmsum_t arcstat_l2_rebuild_bufs_precached;
|
|
|
|
wmsum_t arcstat_l2_rebuild_log_blks;
|
|
|
|
wmsum_t arcstat_memory_throttle_count;
|
|
|
|
wmsum_t arcstat_memory_direct_count;
|
|
|
|
wmsum_t arcstat_memory_indirect_count;
|
|
|
|
wmsum_t arcstat_prune;
|
|
|
|
aggsum_t arcstat_meta_used;
|
|
|
|
wmsum_t arcstat_async_upgrade_sync;
|
|
|
|
wmsum_t arcstat_demand_hit_predictive_prefetch;
|
|
|
|
wmsum_t arcstat_demand_hit_prescient_prefetch;
|
|
|
|
wmsum_t arcstat_raw_size;
|
|
|
|
wmsum_t arcstat_cached_only_in_progress;
|
|
|
|
wmsum_t arcstat_abd_chunk_waste_size;
|
|
|
|
} arc_sums_t;
|
|
|
|
|
Revise ARC shrinker algorithm
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages. This happens via 2 code paths:
1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory. The ARC shrinker callback is invoked from the
page-allocation code path.
2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.
In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released. However, its measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`). Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).
Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`. This has several
negative impacts:
1. ZFS doesn't use RAM to cache data effectively.
2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.
3. Even with the improvements made in 67c0f0dedc5 ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.
To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.
With this commit:
1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker. This limits the amount of time it takes
to allocate a single page.
2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system. Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).
3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing. This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.
4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free` is increased.
5. Now that the shrinker callback is able to provide feedback to the
kernel's shrinker code about our progress, we can safely enable
the kswapd hook. This will allow the arc to receive notifications
when memory pressure is first detected by the kernel. We also
re-enable the appropriate kstats to track these callbacks.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10600
2020-08-01 04:10:52 +00:00
|
|
|
typedef struct arc_evict_waiter {
|
|
|
|
list_node_t aew_node;
|
|
|
|
kcondvar_t aew_cv;
|
|
|
|
uint64_t aew_count;
|
|
|
|
} arc_evict_waiter_t;
|
2019-10-18 17:23:19 +00:00
|
|
|
|
|
|
|
/*
 * Accessors for ARC statistics.
 *
 * ARCSTAT(stat) expands to the 64-bit value of the kstat named `stat`
 * inside the global arc_stats; it is usable as an lvalue.
 *
 * ARCSTAT_INCR(), ARCSTAT_BUMP() and ARCSTAT_BUMPDOWN() do not touch
 * arc_stats.  They add `val`, 1 or -1 respectively to the member of the
 * global arc_sums with the same name, by calling wmsum_add().
 * NOTE(review): wmsum_add() presumably requires a wmsum_t, so these
 * update macros would not fit the aggsum_t members of arc_sums_t --
 * confirm against the callers.
 */
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)

#define ARCSTAT_INCR(stat, val) \
	wmsum_add(&arc_sums.stat, (val))

#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)

/*
 * Short names for tunables/targets that are stored directly in arc_stats.
 */
#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */
#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */

/*
 * Pointer-valued aliases for the ARC_* state objects.
 */
#define arc_anon (&ARC_anon)
#define arc_mru (&ARC_mru)
#define arc_mru_ghost (&ARC_mru_ghost)
#define arc_mfu (&ARC_mfu)
#define arc_mfu_ghost (&ARC_mfu_ghost)
#define arc_l2c_only (&ARC_l2c_only)
|
|
|
|
|
2019-10-18 17:23:19 +00:00
|
|
|
extern taskq_t *arc_prune_taskq;
|
|
|
|
extern arc_stats_t arc_stats;
|
2021-06-17 00:19:34 +00:00
|
|
|
extern arc_sums_t arc_sums;
|
2019-10-18 17:23:19 +00:00
|
|
|
extern hrtime_t arc_growtime;
|
|
|
|
extern boolean_t arc_warm;
|
|
|
|
extern int arc_grow_retry;
|
2020-07-24 00:35:34 +00:00
|
|
|
extern int arc_no_grow_shift;
|
2019-10-18 17:23:19 +00:00
|
|
|
extern int arc_shrink_shift;
|
|
|
|
extern kmutex_t arc_prune_mtx;
|
|
|
|
extern list_t arc_prune_list;
|
2021-07-20 14:13:21 +00:00
|
|
|
extern arc_state_t ARC_mfu;
|
|
|
|
extern arc_state_t ARC_mru;
|
2019-10-18 17:23:19 +00:00
|
|
|
extern uint_t zfs_arc_pc_percent;
|
|
|
|
extern int arc_lotsfree_percent;
|
2020-07-19 17:15:34 +00:00
|
|
|
extern unsigned long zfs_arc_min;
|
|
|
|
extern unsigned long zfs_arc_max;
|
2019-10-18 17:23:19 +00:00
|
|
|
|
|
|
|
extern void arc_reduce_target_size(int64_t to_free);
|
|
|
|
extern boolean_t arc_reclaim_needed(void);
|
|
|
|
extern void arc_kmem_reap_soon(void);
|
Revise ARC shrinker algorithm
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages. This happens via 2 code paths:
1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory. The ARC shrinker callback is invoked from the
page-allocation code path.
2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.
In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released. However, its measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`). Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).
Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`. This has several
negative impacts:
1. ZFS doesn't use RAM to cache data effectively.
2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.
3. Even with the improvements made in 67c0f0dedc5 ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.
To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.
With this commit:
1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker. This limits the amount of time it takes
to allocate a single page.
2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system. Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).
3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing. This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.
4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free` is increased.
5. Now that the shrinker callback is able to provide feedback to the
kernel's shrinker code about our progress, we can safely enable
the kswapd hook. This will allow the arc to receive notifications
when memory pressure is first detected by the kernel. We also
re-enable the appropriate kstats to track these callbacks.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: George Wilson <george.wilson@delphix.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10600
2020-08-01 04:10:52 +00:00
|
|
|
extern void arc_wait_for_eviction(uint64_t);
|
2019-10-18 17:23:19 +00:00
|
|
|
|
|
|
|
extern void arc_lowmem_init(void);
|
|
|
|
extern void arc_lowmem_fini(void);
|
|
|
|
extern void arc_prune_async(int64_t);
|
|
|
|
extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
|
|
|
|
extern uint64_t arc_free_memory(void);
|
|
|
|
extern int64_t arc_available_memory(void);
|
2020-04-09 22:39:48 +00:00
|
|
|
extern void arc_tuning_update(boolean_t);
|
2020-12-10 22:09:23 +00:00
|
|
|
extern void arc_register_hotplug(void);
|
|
|
|
extern void arc_unregister_hotplug(void);
|
2019-10-26 22:22:19 +00:00
|
|
|
|
2020-04-07 17:06:22 +00:00
|
|
|
extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
|
|
|
|
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);
|
2021-08-16 15:35:19 +00:00
|
|
|
extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS);
|
|
|
|
extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS);
|
2019-10-18 17:23:19 +00:00
|
|
|
|
2020-04-10 17:33:35 +00:00
|
|
|
/* used in zdb.c */
|
|
|
|
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
|
|
|
|
const l2arc_log_blkptr_t *lbp);
|
|
|
|
|
2020-06-09 17:15:08 +00:00
|
|
|
/* used in vdev_trim.c */
|
|
|
|
void l2arc_dev_hdr_update(l2arc_dev_t *dev);
|
|
|
|
l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
|
|
|
|
|
2014-10-22 00:59:33 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _SYS_ARC_IMPL_H */
|