Implement SA based xattrs

The current ZFS implementation stores xattrs on disk using a hidden
directory.  In this directory a file name represents the xattr name
and the file contents are the xattr binary data.  This approach is
very flexible and allows for arbitrarily large xattrs.  However,
it also suffers from a significant performance penalty.  Accessing
a single xattr can require up to three disk seeks.

  1) Lookup the dnode object.
  2) Lookup the dnode's xattr directory object.
  3) Lookup the xattr object in the directory.

To avoid this performance penalty Linux filesystems such as ext3
and xfs try to store the xattr as part of the inode on disk.  When
the xattr is too large to store in the inode then a single external
block is allocated for them.  In practice most xattrs are small
and this approach works well.

The addition of System Attributes (SA) to zfs provides us a clean
way to make this optimization.  When the dataset property 'xattr=sa'
is set then xattrs will be preferentially stored as System Attributes.
This allows tiny xattrs (~100 bytes) to be stored with the dnode and
up to 64k of xattrs to be stored in the spill block.  If additional
xattr space is required, which is unlikely under Linux, they will be
stored using the traditional directory approach.

This optimization results in roughly a 3x performance improvement
when accessing xattrs which brings zfs roughly to parity with ext4
and xfs (see table below).  When multiple xattrs are stored per-file
the performance improvements are even greater because all of the
xattrs stored in the spill block will be cached.

However, by default SA based xattrs are disabled in the Linux port
to maximize compatibility with other implementations.  If you do
enable SA based xattrs then they will not be visible on platforms
which do not support this feature.

----------------------------------------------------------------------
   Time in seconds to get/set one xattr of N bytes on 100,000 files
------+--------------------------------+------------------------------
      |            setxattr            |            getxattr
bytes |  ext4     xfs zfs-dir  zfs-sa  |  ext4     xfs zfs-dir  zfs-sa
------+--------------------------------+------------------------------
1     |  2.33   31.88   21.50    4.57  |  2.35    2.64    6.29    2.43
32    |  2.79   30.68   21.98    4.60  |  2.44    2.59    6.78    2.48
256   |  3.25   31.99   21.36    5.92  |  2.32    2.71    6.22    3.14
1024  |  3.30   32.61   22.83    8.45  |  2.40    2.79    6.24    3.27
4096  |  3.57  317.46   22.52   10.73  |  2.78   28.62    6.90    3.94
16384 |   n/a 2342.39   34.30   19.20  |   n/a   45.44  145.90    7.55
65536 |   n/a 2941.39  128.15  131.32* |   n/a  141.92  256.85  262.12*

Legend:
* ext4      - Stock RHEL6.1 ext4 mounted with '-o user_xattr'.
* xfs       - Stock RHEL6.1 xfs mounted with default options.
* zfs-dir   - Directory based xattrs only.
* zfs-sa    - Prefer SAs but spill into directories as needed, a
              trailing * indicates overflow into directories occurred.

NOTE: Ext4 supports 4096 bytes of xattr name/value pairs per file.
NOTE: XFS and ZFS have no limit on xattr name/value pairs per file.
NOTE: Linux limits individual name/value pairs to 65536 bytes.
NOTE: All setxattr/getxattr calls were done after dropping the cache.
NOTE: All tests were run against a single hard drive.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #443
This commit is contained in:
Brian Behlendorf 2011-10-24 16:55:20 -07:00
parent e89236fd28
commit 82a37189aa
12 changed files with 425 additions and 76 deletions

View File

@ -309,6 +309,11 @@ typedef enum {
ZFS_SYNC_DISABLED = 2 ZFS_SYNC_DISABLED = 2
} zfs_sync_type_t; } zfs_sync_type_t;
/*
 * Values of the 'xattr' dataset property.  The property table maps the
 * strings "off" -> ZFS_XATTR_OFF, "on" and "dir" -> ZFS_XATTR_DIR, and
 * "sa" -> ZFS_XATTR_SA.
 */
typedef enum {
ZFS_XATTR_OFF = 0,
ZFS_XATTR_DIR = 1,
ZFS_XATTR_SA = 2
} zfs_xattr_type_t;
/* /*
* On-disk version number. * On-disk version number.

View File

@ -149,6 +149,8 @@ int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
boolean_t sa_enabled(objset_t *); boolean_t sa_enabled(objset_t *);
void sa_cache_init(void); void sa_cache_init(void);
void sa_cache_fini(void); void sa_cache_fini(void);
void *sa_spill_alloc(int);
void sa_spill_free(void *);
int sa_set_sa_object(objset_t *, uint64_t); int sa_set_sa_object(objset_t *, uint64_t);
int sa_hdrsize(void *); int sa_hdrsize(void *);
void sa_handle_lock(sa_handle_t *); void sa_handle_lock(sa_handle_t *);

View File

@ -73,6 +73,7 @@ typedef enum zpl_attr {
ZPL_SYMLINK, ZPL_SYMLINK,
ZPL_SCANSTAMP, ZPL_SCANSTAMP,
ZPL_DACL_ACES, ZPL_DACL_ACES,
ZPL_DXATTR,
ZPL_END ZPL_END
} zpl_attr_t; } zpl_attr_t;
@ -126,12 +127,20 @@ typedef struct znode_phys {
} znode_phys_t; } znode_phys_t;
#ifdef _KERNEL #ifdef _KERNEL
#define DXATTR_MAX_ENTRY_SIZE (32768)
#define DXATTR_MAX_SA_SIZE (SPA_MAXBLOCKSIZE >> 1)
int zfs_sa_readlink(struct znode *, uio_t *); int zfs_sa_readlink(struct znode *, uio_t *);
void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
int zfs_sa_get_xattr(struct znode *);
int zfs_sa_set_xattr(struct znode *);
void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *);
void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
void zfs_sa_init(void);
void zfs_sa_fini(void);
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -76,6 +76,7 @@ typedef struct zfs_sb {
boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */ boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_use_sa; /* version allow system attributes */
boolean_t z_xattr_sa; /* allow xattrs to be stored as SA */
uint64_t z_version; /* ZPL version */ uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */ uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock; kmutex_t z_lock;

View File

@ -105,6 +105,7 @@ extern "C" {
#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] #define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS]
#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] #define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE]
#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] #define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL]
#define SA_ZPL_DXATTR(z) z->z_attr_table[ZPL_DXATTR]
#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] #define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD]
/* /*
@ -206,6 +207,8 @@ typedef struct znode {
uint32_t z_sync_cnt; /* synchronous open count */ uint32_t z_sync_cnt; /* synchronous open count */
kmutex_t z_acl_lock; /* acl data lock */ kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */ zfs_acl_t *z_acl_cached; /* cached acl */
krwlock_t z_xattr_lock; /* xattr data lock */
nvlist_t *z_xattr_cached;/* cached xattrs */
list_node_t z_link_node; /* all znodes in fs link */ list_node_t z_link_node; /* all znodes in fs link */
sa_handle_t *z_sa_hdl; /* handle to sa data */ sa_handle_t *z_sa_hdl; /* handle to sa data */
boolean_t z_is_sa; /* are we native sa? */ boolean_t z_is_sa; /* are we native sa? */

View File

@ -30,7 +30,7 @@
static void * static void *
nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size) nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size)
{ {
return (kmem_alloc(size, KM_SLEEP)); return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
} }
static void * static void *

View File

@ -186,6 +186,14 @@ zfs_prop_init(void)
{ NULL } { NULL }
}; };
static zprop_index_t xattr_table[] = {
{ "off", ZFS_XATTR_OFF },
{ "on", ZFS_XATTR_DIR },
{ "sa", ZFS_XATTR_SA },
{ "dir", ZFS_XATTR_DIR },
{ NULL }
};
/* inherit index properties */ /* inherit index properties */
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
@ -226,6 +234,9 @@ zfs_prop_init(void)
zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"latency | throughput", "LOGBIAS", logbias_table); "latency | throughput", "LOGBIAS", logbias_table);
zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
"on | off | dir | sa", "XATTR", xattr_table);
/* inherit index (boolean) properties */ /* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
@ -244,12 +255,8 @@ zfs_prop_init(void)
boolean_table); boolean_table);
zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
boolean_table);
zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table);
boolean_table);
zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
boolean_table); boolean_table);

View File

@ -201,6 +201,7 @@ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
static int sa_legacy_attr_count = 16; static int sa_legacy_attr_count = 16;
static kmem_cache_t *sa_cache = NULL; static kmem_cache_t *sa_cache = NULL;
static kmem_cache_t *spill_cache = NULL;
/*ARGSUSED*/ /*ARGSUSED*/
static int static int
@ -232,6 +233,8 @@ sa_cache_init(void)
sa_cache = kmem_cache_create("sa_cache", sa_cache = kmem_cache_create("sa_cache",
sizeof (sa_handle_t), 0, sa_cache_constructor, sizeof (sa_handle_t), 0, sa_cache_constructor,
sa_cache_destructor, NULL, NULL, NULL, 0); sa_cache_destructor, NULL, NULL, NULL, 0);
spill_cache = kmem_cache_create("spill_cache",
SPA_MAXBLOCKSIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
} }
void void
@ -239,6 +242,21 @@ sa_cache_fini(void)
{ {
if (sa_cache) if (sa_cache)
kmem_cache_destroy(sa_cache); kmem_cache_destroy(sa_cache);
if (spill_cache)
kmem_cache_destroy(spill_cache);
}
/*
 * Allocate one buffer from spill_cache, whose objects are created with a
 * size of SPA_MAXBLOCKSIZE bytes.  'flags' is passed straight through to
 * kmem_cache_alloc().  Release the buffer with sa_spill_free().
 */
void *
sa_spill_alloc(int flags)
{
return kmem_cache_alloc(spill_cache, flags);
}
void
sa_spill_free(void *obj)
{
kmem_cache_free(spill_cache, obj);
} }
static int static int
@ -1618,7 +1636,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
sa_bulk_attr_t *attr_desc; sa_bulk_attr_t *attr_desc;
void *old_data[2]; void *old_data[2];
int bonus_attr_count = 0; int bonus_attr_count = 0;
int bonus_data_size = 0, spill_data_size = 0; int bonus_data_size = 0;
int spill_attr_count = 0; int spill_attr_count = 0;
int error; int error;
uint16_t length; uint16_t length;
@ -1648,8 +1666,8 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
/* Bring spill buffer online if it isn't currently */ /* Bring spill buffer online if it isn't currently */
if ((error = sa_get_spill(hdl)) == 0) { if ((error = sa_get_spill(hdl)) == 0) {
spill_data_size = hdl->sa_spill->db_size; ASSERT3U(hdl->sa_spill->db_size, <=, SPA_MAXBLOCKSIZE);
old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); old_data[1] = sa_spill_alloc(KM_SLEEP);
bcopy(hdl->sa_spill->db_data, old_data[1], bcopy(hdl->sa_spill->db_data, old_data[1],
hdl->sa_spill->db_size); hdl->sa_spill->db_size);
spill_attr_count = spill_attr_count =
@ -1729,7 +1747,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
if (old_data[0]) if (old_data[0])
kmem_free(old_data[0], bonus_data_size); kmem_free(old_data[0], bonus_data_size);
if (old_data[1]) if (old_data[1])
kmem_free(old_data[1], spill_data_size); sa_spill_free(old_data[1]);
kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
return (error); return (error);
@ -1998,6 +2016,8 @@ EXPORT_SYMBOL(sa_replace_all_by_template_locked);
EXPORT_SYMBOL(sa_enabled); EXPORT_SYMBOL(sa_enabled);
EXPORT_SYMBOL(sa_cache_init); EXPORT_SYMBOL(sa_cache_init);
EXPORT_SYMBOL(sa_cache_fini); EXPORT_SYMBOL(sa_cache_fini);
EXPORT_SYMBOL(sa_spill_alloc);
EXPORT_SYMBOL(sa_spill_free);
EXPORT_SYMBOL(sa_set_sa_object); EXPORT_SYMBOL(sa_set_sa_object);
EXPORT_SYMBOL(sa_hdrsize); EXPORT_SYMBOL(sa_hdrsize);
EXPORT_SYMBOL(sa_handle_lock); EXPORT_SYMBOL(sa_handle_lock);

View File

@ -63,6 +63,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
{"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
{"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
{"ZPL_DACL_ACES", 0, SA_ACL, 0}, {"ZPL_DACL_ACES", 0, SA_ACL, 0},
{"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0},
{NULL, 0, 0, 0} {NULL, 0, 0, 0}
}; };
@ -183,6 +184,83 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
} }
} }
/*
 * Fill zp->z_xattr_cached from the znode's ZPL_DXATTR system attribute.
 *
 * Preconditions (asserted): zp->z_xattr_lock is held, the cache is still
 * empty, and the znode is SA based.
 *
 * If the attribute does not exist (sa_size() reports ENOENT) an empty
 * NV_UNIQUE_NAME nvlist is allocated so the cache is still populated.
 *
 * Returns 0 on success, otherwise the error code handed back by
 * sa_size(), sa_lookup(), nvlist_alloc() or nvlist_unpack().
 */
int
zfs_sa_get_xattr(znode_t *zp)
{
	zfs_sb_t *zsb = ZTOZSB(zp);
	char *packed;
	int packed_len;
	int rc;

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
	ASSERT(!zp->z_xattr_cached);
	ASSERT(zp->z_is_sa);

	rc = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &packed_len);
	if (rc == ENOENT) {
		/* No SA xattrs stored yet: start from an empty list. */
		return (nvlist_alloc(&zp->z_xattr_cached,
		    NV_UNIQUE_NAME, KM_SLEEP));
	}
	if (rc != 0)
		return (rc);

	/* Stage the packed nvlist in a spill-sized scratch buffer. */
	packed = sa_spill_alloc(KM_SLEEP);

	rc = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), packed, packed_len);
	if (rc == 0) {
		rc = nvlist_unpack(packed, packed_len,
		    &zp->z_xattr_cached, KM_SLEEP);
	}

	sa_spill_free(packed);

	return (rc);
}
/*
 * Write the cached xattr nvlist (zp->z_xattr_cached) back to the znode's
 * ZPL_DXATTR system attribute.
 *
 * Preconditions (asserted): zp->z_xattr_lock is held for writing, the
 * cache is populated, and the znode is SA based.
 *
 * The nvlist is XDR-packed into a scratch buffer from sa_spill_alloc() and
 * stored with sa_update() inside its own DMU transaction.
 *
 * NOTE(review): the packed size is not compared against the capacity of
 * the scratch buffer here; confirm that every caller bounds the nvlist
 * size before calling.
 *
 * Returns 0 on success, otherwise the error code from nvlist_size(),
 * nvlist_pack(), dmu_tx_assign() or sa_update().
 */
int
zfs_sa_set_xattr(znode_t *zp)
{
	zfs_sb_t *zsb = ZTOZSB(zp);
	dmu_tx_t *tx;
	char *obj;
	size_t size;
	int error;

	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
	ASSERT(zp->z_xattr_cached);
	ASSERT(zp->z_is_sa);

	error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
	if (error)
		goto out;

	obj = sa_spill_alloc(KM_SLEEP);

	error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
	    NV_ENCODE_XDR, KM_SLEEP);
	if (error)
		goto out_free;

	tx = dmu_tx_create(zsb->z_os);
	dmu_tx_hold_sa_create(tx, size);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/* Never assigned: abort is the only valid way to release. */
		dmu_tx_abort(tx);
	} else {
		/*
		 * Once a transaction has been assigned it must be committed,
		 * even when the update fails; aborting an assigned tx is a
		 * DMU contract violation.  The sa_update() error is still
		 * reported to the caller.
		 */
		error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb),
		    obj, size, tx);
		dmu_tx_commit(tx);
	}

out_free:
	sa_spill_free(obj);
out:
	return (error);
}
/* /*
* I'm not convinced we should do any of this upgrade. * I'm not convinced we should do any of this upgrade.
* since the SA code can read both old/new znode formats * since the SA code can read both old/new znode formats
@ -338,6 +416,8 @@ EXPORT_SYMBOL(zfs_sa_readlink);
EXPORT_SYMBOL(zfs_sa_symlink); EXPORT_SYMBOL(zfs_sa_symlink);
EXPORT_SYMBOL(zfs_sa_get_scanstamp); EXPORT_SYMBOL(zfs_sa_get_scanstamp);
EXPORT_SYMBOL(zfs_sa_set_scanstamp); EXPORT_SYMBOL(zfs_sa_set_scanstamp);
EXPORT_SYMBOL(zfs_sa_get_xattr);
EXPORT_SYMBOL(zfs_sa_set_xattr);
EXPORT_SYMBOL(zfs_sa_upgrade); EXPORT_SYMBOL(zfs_sa_upgrade);
EXPORT_SYMBOL(zfs_sa_upgrade_txholds); EXPORT_SYMBOL(zfs_sa_upgrade_txholds);

View File

@ -140,10 +140,16 @@ xattr_changed_cb(void *arg, uint64_t newval)
{ {
zfs_sb_t *zsb = arg; zfs_sb_t *zsb = arg;
if (newval == TRUE) if (newval == ZFS_XATTR_OFF) {
zsb->z_flags |= ZSB_XATTR;
else
zsb->z_flags &= ~ZSB_XATTR; zsb->z_flags &= ~ZSB_XATTR;
} else {
zsb->z_flags |= ZSB_XATTR;
if (newval == ZFS_XATTR_SA)
zsb->z_xattr_sa = B_TRUE;
else
zsb->z_xattr_sa = B_FALSE;
}
} }
static void static void
@ -641,6 +647,10 @@ zfs_sb_create(const char *osname, zfs_sb_t **zsbp)
&sa_obj); &sa_obj);
if (error) if (error)
goto out; goto out;
error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &zval);
if ((error == 0) && (zval == ZFS_XATTR_SA))
zsb->z_xattr_sa = B_TRUE;
} else { } else {
/* /*
* Pre SA versions file systems should never touch * Pre SA versions file systems should never touch

View File

@ -106,6 +106,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&zp->z_range_avl, zfs_range_compare, avl_create(&zp->z_range_avl, zfs_range_compare,
@ -113,6 +114,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_dirlocks = NULL; zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL; zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_moved = 0; zp->z_moved = 0;
return (0); return (0);
} }
@ -128,11 +130,13 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_parent_lock);
rw_destroy(&zp->z_name_lock); rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock); mutex_destroy(&zp->z_acl_lock);
rw_destroy(&zp->z_xattr_lock);
avl_destroy(&zp->z_range_avl); avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock); mutex_destroy(&zp->z_range_lock);
ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL); ASSERT(zp->z_acl_cached == NULL);
ASSERT(zp->z_xattr_cached == NULL);
} }
void void
@ -272,6 +276,11 @@ zfs_inode_destroy(struct inode *ip)
zp->z_acl_cached = NULL; zp->z_acl_cached = NULL;
} }
if (zp->z_xattr_cached) {
nvlist_free(zp->z_xattr_cached);
zp->z_xattr_cached = NULL;
}
kmem_cache_free(znode_cache, zp); kmem_cache_free(znode_cache, zp);
} }

View File

@ -29,40 +29,54 @@
* as practically no size limit on the file, and the extended * as practically no size limit on the file, and the extended
* attributes permissions may differ from those of the parent file. * attributes permissions may differ from those of the parent file.
* This interface is really quite clever, but it's also completely * This interface is really quite clever, but it's also completely
* different than what is supported on Linux. * different than what is supported on Linux. It also comes with a
* steep performance penalty when accessing small xattrs because they
* are not stored with the parent file.
* *
* Under Linux extended attributes are manipulated by the system * Under Linux extended attributes are manipulated by the system
* calls getxattr(2), setxattr(2), and listxattr(2). They consider * calls getxattr(2), setxattr(2), and listxattr(2). They consider
* extended attributes to be name/value pairs where the name is a * extended attributes to be name/value pairs where the name is a
* NULL terminated string. The name must also include one of the * NULL terminated string. The name must also include one of the
* following name space prefixes: * following namespace prefixes:
* *
* user - No restrictions and is available to user applications. * user - No restrictions and is available to user applications.
* trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use.
* system - Used for access control lists (system.nfs4_acl, etc). * system - Used for access control lists (system.nfs4_acl, etc).
* security - Used by SELinux to store a files security context. * security - Used by SELinux to store a files security context.
* *
* This Linux interface is implemented internally using the more * The value under Linux is limited to 65536 bytes of binary data.
* flexible Solaris style extended attributes. Every extended * In practice, individual xattrs tend to be much smaller than this
* attribute is store as a file in a hidden directory associated * and are typically less than 100 bytes. A good example of this
* with the parent file. This ensures on disk compatibility with * are the security.selinux xattrs which are less than 100 bytes and
* zfs implementations on other platforms (Solaris, FreeBSD, MacOS). * exist for every file when xattr labeling is enabled.
* *
* One consequence of this implementation is that when an extended * The Linux xattr implementation has been written to take advantage of
* attribute is manipulated an inode is created. This inode will * this typical usage. When the dataset property 'xattr=sa' is set,
* exist in the Linux inode cache but there will be no associated * then xattrs will be preferentially stored as System Attributes (SA).
* entry in the dentry cache which references it. This is safe * This allows tiny xattrs (~100 bytes) to be stored with the dnode and
* but it may result in some confusion. * up to 64k of xattrs to be stored in the spill block. If additional
* xattr space is required, which is unlikely under Linux, they will
* be stored using the traditional directory approach.
* *
* Longer term I would like to see the 'security.selinux' extended * This optimization results in roughly a 3x performance improvement
* attribute moved to a SA. This should significantly improve * when accessing xattrs because it avoids the need to perform a seek
* performance on a SELinux enabled system by minimizing the * for every xattr value. When multiple xattrs are stored per-file
* number of seeks required to access a file. However, for now * the performance improvements are even greater because all of the
* this xattr is still stored in a file because I'm pretty sure * xattrs stored in the spill block will be cached.
* adding a new SA will break on-disk compatibility. *
* However, by default SA based xattrs are disabled in the Linux port
* to maximize compatibility with other implementations. If you do
* enable SA based xattrs then they will not be visible on platforms
* which do not support this feature.
*
* NOTE: One additional consequence of the xattr directory implementation
* is that when an extended attribute is manipulated an inode is created.
* This inode will exist in the Linux inode cache but there will be no
* associated entry in the dentry cache which references it. This is
* safe but it may result in some confusion. Enabling SA based xattrs
* largely avoids the issue except in the overflow case.
*/ */
#include <sys/zfs_vfsops.h> #include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h> #include <sys/zfs_vnops.h>
#include <sys/zfs_znode.h> #include <sys/zfs_znode.h>
@ -104,17 +118,13 @@ zpl_xattr_filldir(void *arg, const char *name, int name_len,
return (0); return (0);
} }
ssize_t static ssize_t
zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr)
{ {
struct inode *ip = dentry->d_inode; struct inode *ip = xf->inode;
struct inode *dxip = NULL; struct inode *dxip = NULL;
loff_t pos = 3; /* skip '.', '..', and '.zfs' entries. */ loff_t pos = 3; /* skip '.', '..', and '.zfs' entries. */
cred_t *cr = CRED();
int error; int error;
xattr_filldir_t xf = { buffer_size, 0, buffer, ip };
crhold(cr);
/* Lookup the xattr directory */ /* Lookup the xattr directory */
error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL);
@ -122,34 +132,84 @@ zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error == -ENOENT) if (error == -ENOENT)
error = 0; error = 0;
goto out; return (error);
} }
/* Fill provided buffer via zpl_zattr_filldir helper */ /* Fill provided buffer via zpl_zattr_filldir helper */
error = -zfs_readdir(dxip, (void *)&xf, zpl_xattr_filldir, &pos, cr); error = -zfs_readdir(dxip, (void *)xf, zpl_xattr_filldir, &pos, cr);
iput(dxip);
return (error);
}
/*
 * Feed the name of every SA-stored xattr of xf->inode to
 * zpl_xattr_filldir().
 *
 * The per-znode nvlist cache is loaded on first use, with zp->z_lock held
 * around the check-and-load.  Iteration stops at the first non-zero value
 * returned by zpl_xattr_filldir(), and that value is returned; a failed
 * cache load returns the negated zfs_sa_get_xattr() error.  Returns 0 when
 * every name was passed on.
 */
static ssize_t
zpl_xattr_list_sa(xattr_filldir_t *xf)
{
	znode_t *zp = ITOZ(xf->inode);
	nvpair_t *pair;
	int rc = 0;

	mutex_enter(&zp->z_lock);
	if (zp->z_xattr_cached == NULL)
		rc = -zfs_sa_get_xattr(zp);
	mutex_exit(&zp->z_lock);

	if (rc != 0)
		return (rc);

	ASSERT(zp->z_xattr_cached);

	for (pair = nvlist_next_nvpair(zp->z_xattr_cached, NULL);
	    pair != NULL;
	    pair = nvlist_next_nvpair(zp->z_xattr_cached, pair)) {
		const char *entry_name = nvpair_name(pair);

		ASSERT3U(nvpair_type(pair), ==, DATA_TYPE_BYTE_ARRAY);

		rc = zpl_xattr_filldir((void *)xf, entry_name,
		    strlen(entry_name), 0, 0, 0);
		if (rc != 0)
			return (rc);
	}

	return (0);
}
ssize_t
zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
znode_t *zp = ITOZ(dentry->d_inode);
zfs_sb_t *zsb = ZTOZSB(zp);
xattr_filldir_t xf = { buffer_size, 0, buffer, dentry->d_inode };
cred_t *cr = CRED();
int error = 0;
crhold(cr);
rw_enter(&zp->z_xattr_lock, RW_READER);
if (zsb->z_use_sa && zp->z_is_sa) {
error = zpl_xattr_list_sa(&xf);
if (error)
goto out;
}
error = zpl_xattr_list_dir(&xf, cr);
if (error) if (error)
goto out; goto out;
error = xf.offset; error = xf.offset;
out: out:
if (dxip)
iput(dxip);
rw_exit(&zp->z_xattr_lock);
crfree(cr); crfree(cr);
return (error); return (error);
} }
static int static int
zpl_xattr_get(struct inode *ip, const char *name, void *buf, size_t size) zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
size_t size, cred_t *cr)
{ {
struct inode *dxip = NULL; struct inode *dxip = NULL;
struct inode *xip = NULL; struct inode *xip = NULL;
cred_t *cr = CRED();
int error; int error;
crhold(cr);
/* Lookup the xattr directory */ /* Lookup the xattr directory */
error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL);
if (error) if (error)
@ -165,7 +225,7 @@ zpl_xattr_get(struct inode *ip, const char *name, void *buf, size_t size)
goto out; goto out;
} }
error = zpl_read_common(xip, buf, size, 0, UIO_SYSSPACE, 0, cr); error = zpl_read_common(xip, value, size, 0, UIO_SYSSPACE, 0, cr);
out: out:
if (xip) if (xip)
iput(xip); iput(xip);
@ -173,8 +233,59 @@ out:
if (dxip) if (dxip)
iput(dxip); iput(dxip);
crfree(cr); return (error);
}
/*
 * Look up xattr 'name' in the znode's SA-backed nvlist cache.
 *
 * The caller must hold zp->z_xattr_lock (asserted).  The cache is loaded
 * on first use, with zp->z_lock held around the check-and-load.
 *
 * When 'size' is 0 only the stored length is returned and 'value' is not
 * touched.  Otherwise at most 'size' bytes are copied into 'value' and the
 * number of bytes copied is returned.  Failures are returned as negated
 * error codes from zfs_sa_get_xattr() or nvlist_lookup_byte_array().
 *
 * NOTE(review): a non-zero 'size' smaller than the stored value yields a
 * silently truncated copy rather than an error such as -ERANGE.
 */
static int
zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size)
{
	znode_t *zp = ITOZ(ip);
	uchar_t *data;
	uint_t data_len;
	size_t copy_len;
	int rc = 0;

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));

	mutex_enter(&zp->z_lock);
	if (zp->z_xattr_cached == NULL)
		rc = -zfs_sa_get_xattr(zp);
	mutex_exit(&zp->z_lock);

	if (rc != 0)
		return (rc);

	ASSERT(zp->z_xattr_cached);

	rc = -nvlist_lookup_byte_array(zp->z_xattr_cached, name,
	    &data, &data_len);
	if (rc != 0)
		return (rc);

	/* Size probe: report the length without copying anything. */
	if (size == 0)
		return (data_len);

	copy_len = MIN(size, data_len);
	memcpy(value, data, copy_len);

	return (copy_len);
}
static int
__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size,
cred_t *cr)
{
znode_t *zp = ITOZ(ip);
zfs_sb_t *zsb = ZTOZSB(zp);
int error;
ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
if (zsb->z_use_sa && zp->z_is_sa) {
error = zpl_xattr_get_sa(ip, name, value, size);
if (error >= 0)
goto out;
}
error = zpl_xattr_get_dir(ip, name, value, size, cr);
out:
if (error == -ENOENT) if (error == -ENOENT)
error = -ENODATA; error = -ENODATA;
@ -182,42 +293,43 @@ out:
} }
static int static int
zpl_xattr_set(struct inode *ip, const char *name, const void *value, zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
size_t size, int flags) {
znode_t *zp = ITOZ(ip);
cred_t *cr = CRED();
int error;
crhold(cr);
rw_enter(&zp->z_xattr_lock, RW_READER);
error = __zpl_xattr_get(ip, name, value, size, cr);
rw_exit(&zp->z_xattr_lock);
crfree(cr);
return (error);
}
static int
zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
size_t size, int flags, cred_t *cr)
{ {
struct inode *dxip = NULL; struct inode *dxip = NULL;
struct inode *xip = NULL; struct inode *xip = NULL;
vattr_t *vap = NULL; vattr_t *vap = NULL;
cred_t *cr = CRED();
ssize_t wrote; ssize_t wrote;
int error; int error;
const int xattr_mode = S_IFREG | 0644; const int xattr_mode = S_IFREG | 0644;
crhold(cr);
/* Lookup the xattr directory and create it if required. */ /* Lookup the xattr directory and create it if required. */
error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR | CREATE_XATTR_DIR, error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR | CREATE_XATTR_DIR,
cr, NULL, NULL); cr, NULL, NULL);
if (error) if (error)
goto out; goto out;
/* /* Lookup a specific xattr name in the directory */
* Lookup a specific xattr name in the directory, two failure modes:
* XATTR_CREATE: fail if xattr already exists
* XATTR_REMOVE: fail if xattr does not exist
*/
error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL);
if (error) { if (error && (error != -ENOENT))
if (error != -ENOENT) goto out;
goto out;
if ((error == -ENOENT) && (flags & XATTR_REPLACE))
goto out;
} else {
error = -EEXIST;
if (flags & XATTR_CREATE)
goto out;
}
error = 0; error = 0;
/* Remove a specific name xattr when value is set to NULL. */ /* Remove a specific name xattr when value is set to NULL. */
@ -262,7 +374,6 @@ out:
if (dxip) if (dxip)
iput(dxip); iput(dxip);
crfree(cr);
if (error == -ENOENT) if (error == -ENOENT)
error = -ENODATA; error = -ENODATA;
@ -271,9 +382,101 @@ out:
return (error); return (error);
} }
/*
 * Add, replace or remove xattr 'name' in the znode's SA-backed nvlist and
 * write the result back with zfs_sa_set_xattr().
 *
 * value == NULL requests removal.  If the name is not present in the SA
 * nvlist the removal is forwarded to zpl_xattr_set_dir().
 *
 * Otherwise the value is stored as a byte array, subject to two limits
 * that both return -EFBIG:
 *   - a single value may not exceed DXATTR_MAX_ENTRY_SIZE, and
 *   - nothing is added once the packed nvlist already exceeds
 *     DXATTR_MAX_SA_SIZE.
 *
 * zp->z_xattr_cached must already be populated (asserted).  If writing the
 * SA fails after the cached nvlist was modified, the cache is discarded so
 * that it cannot advertise contents that never reached disk; it is rebuilt
 * from the on-disk SA on the next access.
 *
 * Returns 0 on success or a negative error code.
 */
static int
zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
    size_t size, int flags, cred_t *cr)
{
	znode_t *zp = ITOZ(ip);
	nvlist_t *nvl;
	size_t sa_size;
	int error;

	ASSERT(zp->z_xattr_cached);
	nvl = zp->z_xattr_cached;

	if (value == NULL) {
		error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
		if (error == -ENOENT)
			error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr);
	} else {
		/* Limited to 32k to keep nvpair memory allocations small */
		if (size > DXATTR_MAX_ENTRY_SIZE)
			return (-EFBIG);

		/* Prevent the DXATTR SA from consuming the entire SA region */
		error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
		if (error)
			return (error);

		if (sa_size > DXATTR_MAX_SA_SIZE)
			return (-EFBIG);

		error = -nvlist_add_byte_array(nvl, name,
		    (uchar_t *)value, size);
		if (error)
			return (error);
	}

	/* Update the SA for additions, modifications, and removals. */
	if (error == 0) {
		error = -zfs_sa_set_xattr(zp);
		if (error != 0) {
			/*
			 * The in-memory nvlist no longer matches what is on
			 * disk.  Drop it rather than leave a phantom entry
			 * (or a phantom removal) visible to later get/list
			 * calls and to later SA write-backs.
			 */
			nvlist_free(nvl);
			zp->z_xattr_cached = NULL;
		}
	}

	ASSERT3S(error, <=, 0);

	return (error);
}
static int
zpl_xattr_set(struct inode *ip, const char *name, const void *value,
size_t size, int flags)
{
znode_t *zp = ITOZ(ip);
zfs_sb_t *zsb = ZTOZSB(zp);
cred_t *cr = CRED();
int error;
crhold(cr);
rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER);
/*
* Before setting the xattr check to see if it already exists.
* This is done to ensure the following optional flags are honored.
*
* XATTR_CREATE: fail if xattr already exists
* XATTR_REPLACE: fail if xattr does not exist
*/
error = __zpl_xattr_get(ip, name, NULL, 0, cr);
if (error < 0) {
if (error != -ENODATA)
goto out;
if ((error == -ENODATA) && (flags & XATTR_REPLACE))
goto out;
} else {
error = -EEXIST;
if (flags & XATTR_CREATE)
goto out;
}
/* Preferentially store the xattr as a SA for better performance */
if (zsb->z_use_sa && zsb->z_xattr_sa && zp->z_is_sa) {
error = zpl_xattr_set_sa(ip, name, value, size, flags, cr);
if (error == 0)
goto out;
}
error = zpl_xattr_set_dir(ip, name, value, size, flags, cr);
out:
rw_exit(&ITOZ(ip)->z_xattr_lock);
crfree(cr);
ASSERT3S(error, <=, 0);
return (error);
}
static int static int
__zpl_xattr_user_get(struct inode *ip, const char *name, __zpl_xattr_user_get(struct inode *ip, const char *name,
void *buffer, size_t size) void *value, size_t size)
{ {
char *xattr_name; char *xattr_name;
int error; int error;
@ -285,7 +488,7 @@ __zpl_xattr_user_get(struct inode *ip, const char *name,
return -EOPNOTSUPP; return -EOPNOTSUPP;
xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
error = zpl_xattr_get(ip, xattr_name, buffer, size); error = zpl_xattr_get(ip, xattr_name, value, size);
strfree(xattr_name); strfree(xattr_name);
return (error); return (error);
@ -321,7 +524,7 @@ xattr_handler_t zpl_xattr_user_handler = {
static int static int
__zpl_xattr_trusted_get(struct inode *ip, const char *name, __zpl_xattr_trusted_get(struct inode *ip, const char *name,
void *buffer, size_t size) void *value, size_t size)
{ {
char *xattr_name; char *xattr_name;
int error; int error;
@ -333,7 +536,7 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name,
return -EINVAL; return -EINVAL;
xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
error = zpl_xattr_get(ip, xattr_name, buffer, size); error = zpl_xattr_get(ip, xattr_name, value, size);
strfree(xattr_name); strfree(xattr_name);
return (error); return (error);
@ -369,7 +572,7 @@ xattr_handler_t zpl_xattr_trusted_handler = {
static int static int
__zpl_xattr_security_get(struct inode *ip, const char *name, __zpl_xattr_security_get(struct inode *ip, const char *name,
void *buffer, size_t size) void *value, size_t size)
{ {
char *xattr_name; char *xattr_name;
int error; int error;
@ -378,7 +581,7 @@ __zpl_xattr_security_get(struct inode *ip, const char *name,
return -EINVAL; return -EINVAL;
xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
error = zpl_xattr_get(ip, xattr_name, buffer, size); error = zpl_xattr_get(ip, xattr_name, value, size);
strfree(xattr_name); strfree(xattr_name);
return (error); return (error);