More aggsum optimizations
- Avoid atomic_add() when updating as_lower_bound/as_upper_bound. The previous code was excessively strong on 64-bit systems while not strong enough on 32-bit ones. Instead, introduce and use real atomic_load() and atomic_store() operations: plain assignments on 64-bit machines, but proper atomics on 32-bit ones to avoid torn reads/writes.

- Reduce the number of buckets on large systems. Extra buckets do not improve add speed as much as they hurt reads. Unlike wmsum, for aggsum reads are still important.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12145
commit ea400129c3
parent e5e76bd643
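Background for the first point: on a 32-bit machine a plain 64-bit load or store may be compiled into two 32-bit memory accesses, so a concurrent update can be observed half old and half new (a torn read/write). The sketch below is illustrative only, not part of the commit, and the variable name is hypothetical; it contrasts a plain read with the relaxed atomic read that the 32-bit userland fallback further down relies on:

/* Hypothetical illustration of a torn 64-bit read on a 32-bit target. */
#include <stdint.h>

volatile uint64_t bound;	/* e.g. a bound updated concurrently */

uint64_t
load_plain(void)
{
	/*
	 * On a 32-bit target this may compile to two 32-bit loads, so a
	 * concurrent writer can be observed half old, half new.
	 */
	return (bound);
}

uint64_t
load_atomic(void)
{
	/* Single atomic access; only atomicity is needed, not ordering. */
	return (__atomic_load_n(&bound, __ATOMIC_RELAXED));
}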
@@ -48,6 +48,8 @@
 #define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v))
 #define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y)
 #define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x)
+#define atomic_load_32(v) atomic_read((atomic_t *)(v))
+#define atomic_store_32(v, x) atomic_set((atomic_t *)(v), x)
 #define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v))
 #define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v))
 #define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v))
@@ -58,6 +60,8 @@
 #define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v))
 #define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y)
 #define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x)
+#define atomic_load_64(v) atomic64_read((atomic64_t *)(v))
+#define atomic_store_64(v, x) atomic64_set((atomic64_t *)(v), x)
 
 #ifdef _LP64
 static __inline__ void *

@@ -39,15 +39,16 @@ struct aggsum_bucket {
 typedef struct aggsum {
 	kmutex_t as_lock;
 	int64_t as_lower_bound;
-	int64_t as_upper_bound;
+	uint64_t as_upper_bound;
+	aggsum_bucket_t *as_buckets ____cacheline_aligned;
 	uint_t as_numbuckets;
-	aggsum_bucket_t *as_buckets;
+	uint_t as_bucketshift;
 } aggsum_t;
 
 void aggsum_init(aggsum_t *, uint64_t);
 void aggsum_fini(aggsum_t *);
 int64_t aggsum_lower_bound(aggsum_t *);
-int64_t aggsum_upper_bound(aggsum_t *);
+uint64_t aggsum_upper_bound(aggsum_t *);
 int aggsum_compare(aggsum_t *, uint64_t);
 uint64_t aggsum_value(aggsum_t *);
 void aggsum_add(aggsum_t *, int64_t);

@@ -313,6 +313,19 @@ atomic_swap_ptr(volatile void *target, void *bits)
 	return (__atomic_exchange_n((void **)target, bits, __ATOMIC_SEQ_CST));
 }
 
+#ifndef _LP64
+uint64_t
+atomic_load_64(volatile uint64_t *target)
+{
+	return (__atomic_load_n(target, __ATOMIC_RELAXED));
+}
+
+void
+atomic_store_64(volatile uint64_t *target, uint64_t bits)
+{
+	return (__atomic_store_n(target, bits, __ATOMIC_RELAXED));
+}
+#endif
 
 int
 atomic_set_long_excl(volatile ulong_t *target, uint_t value)

@@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
 extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
 #endif
 
+/*
+ * Atomically read variable.
+ */
+#define atomic_load_char(p) (*(volatile uchar_t *)(p))
+#define atomic_load_short(p) (*(volatile ushort_t *)(p))
+#define atomic_load_int(p) (*(volatile uint_t *)(p))
+#define atomic_load_long(p) (*(volatile ulong_t *)(p))
+#define atomic_load_ptr(p) (*(volatile __typeof(*p) *)(p))
+#define atomic_load_8(p) (*(volatile uint8_t *)(p))
+#define atomic_load_16(p) (*(volatile uint16_t *)(p))
+#define atomic_load_32(p) (*(volatile uint32_t *)(p))
+#ifdef _LP64
+#define atomic_load_64(p) (*(volatile uint64_t *)(p))
+#elif defined(_INT64_TYPE)
+extern uint64_t atomic_load_64(volatile uint64_t *);
+#endif
+
+/*
+ * Atomically write variable.
+ */
+#define atomic_store_char(p, v) \
+	(*(volatile uchar_t *)(p) = (uchar_t)(v))
+#define atomic_store_short(p, v) \
+	(*(volatile ushort_t *)(p) = (ushort_t)(v))
+#define atomic_store_int(p, v) \
+	(*(volatile uint_t *)(p) = (uint_t)(v))
+#define atomic_store_long(p, v) \
+	(*(volatile ulong_t *)(p) = (ulong_t)(v))
+#define atomic_store_ptr(p, v) \
+	(*(volatile __typeof(*p) *)(p) = (v))
+#define atomic_store_8(p, v) \
+	(*(volatile uint8_t *)(p) = (uint8_t)(v))
+#define atomic_store_16(p, v) \
+	(*(volatile uint16_t *)(p) = (uint16_t)(v))
+#define atomic_store_32(p, v) \
+	(*(volatile uint32_t *)(p) = (uint32_t)(v))
+#ifdef _LP64
+#define atomic_store_64(p, v) \
+	(*(volatile uint64_t *)(p) = (uint64_t)(v))
+#elif defined(_INT64_TYPE)
+extern void atomic_store_64(volatile uint64_t *, uint64_t);
+#endif
+
 /*
  * Perform an exclusive atomic bit set/clear on a target.
  * Returns 0 if bit was successfully set/cleared, or -1

@@ -78,11 +78,11 @@
  */
 
 /*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
+ * We will borrow 2^aggsum_borrow_shift times the current request, so we will
+ * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
+ * aggsum_add().
  */
-static uint_t aggsum_borrow_multiplier = 10;
+static uint_t aggsum_borrow_shift = 4;
 
 void
 aggsum_init(aggsum_t *as, uint64_t value)

@@ -90,9 +90,14 @@ aggsum_init(aggsum_t *as, uint64_t value)
 	bzero(as, sizeof (*as));
 	as->as_lower_bound = as->as_upper_bound = value;
 	mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
-	as->as_numbuckets = boot_ncpus;
-	as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
-	    KM_SLEEP);
+	/*
+	 * Too many buckets may hurt read performance without improving
+	 * write.  From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc.
+	 */
+	as->as_bucketshift = highbit64(boot_ncpus / 6) / 2;
+	as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1;
+	as->as_buckets = kmem_zalloc(as->as_numbuckets *
+	    sizeof (aggsum_bucket_t), KM_SLEEP);
 	for (int i = 0; i < as->as_numbuckets; i++) {
 		mutex_init(&as->as_buckets[i].asc_lock,
 		    NULL, MUTEX_DEFAULT, NULL);

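To make the new bucket sizing concrete, here is a hypothetical standalone sketch (not in-tree code) that reproduces the arithmetic above, assuming highbit64() returns the 1-based position of the highest set bit (so highbit64(0) == 0, highbit64(8) == 4):

/*
 * Hypothetical standalone sketch of the new aggsum bucket sizing:
 * bucket count as a function of CPU count.
 */
#include <stdio.h>
#include <stdint.h>

/* 1-based position of the highest set bit; highbit64(0) == 0. */
static unsigned
highbit64(uint64_t i)
{
	unsigned h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	for (unsigned ncpus = 1; ncpus <= 256; ncpus *= 2) {
		unsigned shift = highbit64(ncpus / 6) / 2;
		unsigned buckets = ((ncpus - 1) >> shift) + 1;

		/* e.g. 8 CPUs -> 8 buckets, 16 -> 8, 64 -> 16, 256 -> 32 */
		printf("%3u CPUs: shift %u, %u buckets\n",
		    ncpus, shift, buckets);
	}
	return (0);
}

Below 12 CPUs the shift works out to 0 (one bucket per CPU), which matches the comment in the hunk above.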
@@ -111,59 +116,49 @@ aggsum_fini(aggsum_t *as)
 int64_t
 aggsum_lower_bound(aggsum_t *as)
 {
-	return (as->as_lower_bound);
+	return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound));
 }
 
-int64_t
+uint64_t
 aggsum_upper_bound(aggsum_t *as)
 {
-	return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
-	ASSERT(MUTEX_HELD(&as->as_lock));
-	ASSERT(MUTEX_HELD(&asb->asc_lock));
-
-	/*
-	 * We use atomic instructions for this because we read the upper and
-	 * lower bounds without the lock, so we need stores to be atomic.
-	 */
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-	    asb->asc_delta + asb->asc_borrowed);
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-	    asb->asc_delta - asb->asc_borrowed);
-	asb->asc_delta = 0;
-	asb->asc_borrowed = 0;
+	return (atomic_load_64(&as->as_upper_bound));
 }
 
 uint64_t
 aggsum_value(aggsum_t *as)
 {
-	int64_t rv;
+	int64_t lb;
+	uint64_t ub;
 
 	mutex_enter(&as->as_lock);
-	if (as->as_lower_bound == as->as_upper_bound) {
-		rv = as->as_lower_bound;
+	lb = as->as_lower_bound;
+	ub = as->as_upper_bound;
+	if (lb == ub) {
 		for (int i = 0; i < as->as_numbuckets; i++) {
 			ASSERT0(as->as_buckets[i].asc_delta);
 			ASSERT0(as->as_buckets[i].asc_borrowed);
 		}
 		mutex_exit(&as->as_lock);
-		return (rv);
+		return (lb);
 	}
 	for (int i = 0; i < as->as_numbuckets; i++) {
 		struct aggsum_bucket *asb = &as->as_buckets[i];
 
+		if (asb->asc_borrowed == 0)
+			continue;
 		mutex_enter(&asb->asc_lock);
-		aggsum_flush_bucket(as, asb);
+		lb += asb->asc_delta + asb->asc_borrowed;
+		ub += asb->asc_delta - asb->asc_borrowed;
+		asb->asc_delta = 0;
+		asb->asc_borrowed = 0;
 		mutex_exit(&asb->asc_lock);
 	}
-	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-	rv = as->as_lower_bound;
+	ASSERT3U(lb, ==, ub);
+	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+	atomic_store_64(&as->as_upper_bound, lb);
 	mutex_exit(&as->as_lock);
 
-	return (rv);
+	return (lb);
 }
 
 void

@@ -172,7 +167,8 @@ aggsum_add(aggsum_t *as, int64_t delta)
 	struct aggsum_bucket *asb;
 	int64_t borrow;
 
-	asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+	asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
+	    as->as_numbuckets];
 
 	/* Try fast path if we already borrowed enough before. */
 	mutex_enter(&asb->asc_lock);

|
||||||
* We haven't borrowed enough. Take the global lock and borrow
|
* We haven't borrowed enough. Take the global lock and borrow
|
||||||
* considering what is requested now and what we borrowed before.
|
* considering what is requested now and what we borrowed before.
|
||||||
*/
|
*/
|
||||||
borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
|
borrow = (delta < 0 ? -delta : delta);
|
||||||
|
borrow <<= aggsum_borrow_shift + as->as_bucketshift;
|
||||||
mutex_enter(&as->as_lock);
|
mutex_enter(&as->as_lock);
|
||||||
mutex_enter(&asb->asc_lock);
|
|
||||||
delta += asb->asc_delta;
|
|
||||||
asb->asc_delta = 0;
|
|
||||||
if (borrow >= asb->asc_borrowed)
|
if (borrow >= asb->asc_borrowed)
|
||||||
borrow -= asb->asc_borrowed;
|
borrow -= asb->asc_borrowed;
|
||||||
else
|
else
|
||||||
borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
|
borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
|
||||||
|
mutex_enter(&asb->asc_lock);
|
||||||
|
delta += asb->asc_delta;
|
||||||
|
asb->asc_delta = 0;
|
||||||
asb->asc_borrowed += borrow;
|
asb->asc_borrowed += borrow;
|
||||||
atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
|
|
||||||
delta - borrow);
|
|
||||||
atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
|
|
||||||
delta + borrow);
|
|
||||||
mutex_exit(&asb->asc_lock);
|
mutex_exit(&asb->asc_lock);
|
||||||
|
atomic_store_64((volatile uint64_t *)&as->as_lower_bound,
|
||||||
|
as->as_lower_bound + delta - borrow);
|
||||||
|
atomic_store_64(&as->as_upper_bound,
|
||||||
|
as->as_upper_bound + delta + borrow);
|
||||||
mutex_exit(&as->as_lock);
|
mutex_exit(&as->as_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
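A worked example of the bookkeeping above (illustrative numbers): with aggsum_borrow_shift = 4 and as_bucketshift = 0, an add of +1 against an empty bucket borrows 16, so the slow path moves the global lower bound by 1 - 16 = -15 and the upper bound by 1 + 16 = +17 while the bucket records asc_borrowed = 16. If later fast-path adds accumulate asc_delta = +5, a flush in aggsum_value() or aggsum_compare() applies lb += 5 + 16 and ub += 5 - 16, so each bound ends up exactly +6 from where it started (-15 + 21 and +17 - 11), which is the true total that was added.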
@@ -214,27 +211,35 @@ aggsum_add(aggsum_t *as, int64_t delta)
 int
 aggsum_compare(aggsum_t *as, uint64_t target)
 {
-	if (as->as_upper_bound < target)
+	int64_t lb;
+	uint64_t ub;
+	int i;
+
+	if (atomic_load_64(&as->as_upper_bound) < target)
 		return (-1);
-	if (as->as_lower_bound > target)
+	lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound);
+	if (lb > 0 && (uint64_t)lb > target)
 		return (1);
 	mutex_enter(&as->as_lock);
-	for (int i = 0; i < as->as_numbuckets; i++) {
+	lb = as->as_lower_bound;
+	ub = as->as_upper_bound;
+	for (i = 0; i < as->as_numbuckets; i++) {
 		struct aggsum_bucket *asb = &as->as_buckets[i];
 
+		if (asb->asc_borrowed == 0)
+			continue;
 		mutex_enter(&asb->asc_lock);
-		aggsum_flush_bucket(as, asb);
+		lb += asb->asc_delta + asb->asc_borrowed;
+		ub += asb->asc_delta - asb->asc_borrowed;
+		asb->asc_delta = 0;
+		asb->asc_borrowed = 0;
 		mutex_exit(&asb->asc_lock);
-		if (as->as_upper_bound < target) {
-			mutex_exit(&as->as_lock);
-			return (-1);
-		}
-		if (as->as_lower_bound > target) {
-			mutex_exit(&as->as_lock);
-			return (1);
-		}
+		if (ub < target || (lb > 0 && (uint64_t)lb > target))
+			break;
 	}
-	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-	ASSERT3U(as->as_lower_bound, ==, target);
+	if (i >= as->as_numbuckets)
+		ASSERT3U(lb, ==, ub);
+	atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+	atomic_store_64(&as->as_upper_bound, ub);
 	mutex_exit(&as->as_lock);
-	return (0);
+	return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0);
 }