Commit cd5437a

More aggsum optimizations.
- Avoid atomic_add() when updating as_lower_bound/as_upper_bound. The previous
  code was excessively strong on 64-bit systems while not strong enough on
  32-bit ones. Instead, introduce and use real atomic_load() and atomic_store()
  operations: plain assignments on 64-bit machines, but proper atomics on
  32-bit ones to avoid torn reads/writes.

- Reduce the number of buckets on large systems. Extra buckets do not improve
  add speed as much as they hurt reads, and unlike wmsum, reads still matter
  for aggsum.

Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
1 parent 8670644 commit cd5437a
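For context on the torn read/write concern above, here is a minimal userspace sketch (not part of the commit) using the same GCC/Clang __atomic builtins the libspl fallback below relies on. On a 32-bit target a plain 64-bit assignment may be split into two 32-bit accesses, so a concurrent reader can observe a half-updated value; a relaxed atomic load/store stays indivisible without paying for a locked read-modify-write the way atomic_add_64() does. The variable and function names are illustrative only.

#include <stdint.h>

static volatile uint64_t bound;	/* stand-in for a shared 64-bit bound */

uint64_t
read_bound(void)
{
	/* One indivisible 64-bit read, even on a 32-bit target. */
	return (__atomic_load_n(&bound, __ATOMIC_RELAXED));
}

void
write_bound(uint64_t v)
{
	/* One indivisible 64-bit write; atomicity only, no extra ordering. */
	__atomic_store_n(&bound, v, __ATOMIC_RELAXED);
}

__ATOMIC_RELAXED matches what the commit uses in libspl, presumably because only single-copy atomicity is needed for these opportunistic reads; any required ordering comes from as_lock.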

File tree

5 files changed (+130 lines added, -62 lines removed)

include/os/linux/spl/sys/atomic.h

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,8 @@
 #define	atomic_sub_32_nv(v, i)	atomic_sub_return((i), (atomic_t *)(v))
 #define	atomic_cas_32(v, x, y)	atomic_cmpxchg((atomic_t *)(v), x, y)
 #define	atomic_swap_32(v, x)	atomic_xchg((atomic_t *)(v), x)
+#define	atomic_load_32(v)	atomic_read((atomic_t *)(v))
+#define	atomic_store_32(v, x)	atomic_set((atomic_t *)(v), x)
 #define	atomic_inc_64(v)	atomic64_inc((atomic64_t *)(v))
 #define	atomic_dec_64(v)	atomic64_dec((atomic64_t *)(v))
 #define	atomic_add_64(v, i)	atomic64_add((i), (atomic64_t *)(v))
@@ -58,6 +60,8 @@
 #define	atomic_sub_64_nv(v, i)	atomic64_sub_return((i), (atomic64_t *)(v))
 #define	atomic_cas_64(v, x, y)	atomic64_cmpxchg((atomic64_t *)(v), x, y)
 #define	atomic_swap_64(v, x)	atomic64_xchg((atomic64_t *)(v), x)
+#define	atomic_load_64(v)	atomic64_read((atomic64_t *)(v))
+#define	atomic_store_64(v, x)	atomic64_set((atomic64_t *)(v), x)
 
 #ifdef _LP64
 static __inline__ void *

include/sys/aggsum.h

Lines changed: 4 additions & 3 deletions
@@ -39,15 +39,16 @@ struct aggsum_bucket {
 typedef struct aggsum {
 	kmutex_t as_lock;
 	int64_t as_lower_bound;
-	int64_t as_upper_bound;
+	uint64_t as_upper_bound;
+	aggsum_bucket_t *as_buckets ____cacheline_aligned;
 	uint_t as_numbuckets;
-	aggsum_bucket_t *as_buckets;
+	uint_t as_bucketshift;
 } aggsum_t;
 
 void aggsum_init(aggsum_t *, uint64_t);
 void aggsum_fini(aggsum_t *);
 int64_t aggsum_lower_bound(aggsum_t *);
-int64_t aggsum_upper_bound(aggsum_t *);
+uint64_t aggsum_upper_bound(aggsum_t *);
 int aggsum_compare(aggsum_t *, uint64_t);
 uint64_t aggsum_value(aggsum_t *);
 void aggsum_add(aggsum_t *, int64_t);

lib/libspl/atomic.c

Lines changed: 13 additions & 0 deletions
@@ -313,6 +313,19 @@ atomic_swap_ptr(volatile void *target, void *bits)
 	return (__atomic_exchange_n((void **)target, bits, __ATOMIC_SEQ_CST));
 }
 
+#ifndef _LP64
+uint64_t
+atomic_load_64(volatile uint64_t *target)
+{
+	return (__atomic_load_n(target, __ATOMIC_RELAXED));
+}
+
+void
+atomic_store_64(volatile uint64_t *target, uint64_t bits)
+{
+	return (__atomic_store_n(target, bits, __ATOMIC_RELAXED));
+}
+#endif
 
 int
 atomic_set_long_excl(volatile ulong_t *target, uint_t value)

lib/libspl/include/atomic.h

Lines changed: 43 additions & 0 deletions
@@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
 extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
 #endif
 
+/*
+ * Atomically read variable.
+ */
+#define	atomic_load_char(p)	(*(volatile u_char *)(p))
+#define	atomic_load_short(p)	(*(volatile u_short *)(p))
+#define	atomic_load_int(p)	(*(volatile u_int *)(p))
+#define	atomic_load_long(p)	(*(volatile u_long *)(p))
+#define	atomic_load_ptr(p)	(*(volatile __typeof(*p) *)(p))
+#define	atomic_load_8(p)	(*(volatile uint8_t *)(p))
+#define	atomic_load_16(p)	(*(volatile uint16_t *)(p))
+#define	atomic_load_32(p)	(*(volatile uint32_t *)(p))
+#ifdef _LP64
+#define	atomic_load_64(p)	(*(volatile uint64_t *)(p))
+#elif defined(_INT64_TYPE)
+extern uint64_t atomic_load_64(volatile uint64_t *);
+#endif
+
+/*
+ * Atomically write variable.
+ */
+#define	atomic_store_char(p, v)		\
+	(*(volatile u_char *)(p) = (u_char)(v))
+#define	atomic_store_short(p, v)	\
+	(*(volatile u_short *)(p) = (u_short)(v))
+#define	atomic_store_int(p, v)		\
+	(*(volatile u_int *)(p) = (u_int)(v))
+#define	atomic_store_long(p, v)		\
+	(*(volatile u_long *)(p) = (u_long)(v))
+#define	atomic_store_ptr(p, v)		\
+	(*(volatile __typeof(*p) *)(p) = (v))
+#define	atomic_store_8(p, v)		\
+	(*(volatile uint8_t *)(p) = (uint8_t)(v))
+#define	atomic_store_16(p, v)		\
+	(*(volatile uint16_t *)(p) = (uint16_t)(v))
+#define	atomic_store_32(p, v)		\
+	(*(volatile uint32_t *)(p) = (uint32_t)(v))
+#ifdef _LP64
+#define	atomic_store_64(p, v)		\
+	(*(volatile uint64_t *)(p) = (uint64_t)(v))
+#elif defined(_INT64_TYPE)
+extern void atomic_store_64(volatile uint64_t *, uint64_t);
+#endif
+
 /*
  * Perform an exclusive atomic bit set/clear on a target.
  * Returns 0 if bit was successfully set/cleared, or -1
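A hypothetical usage sketch (not from this commit) of the definitions above: on LP64 builds the macros expand to plain volatile dereferences, while 32-bit builds transparently fall back to the out-of-line atomic_load_64()/atomic_store_64() added in lib/libspl/atomic.c, so callers look the same either way. It assumes a build against the libspl include path; the counter and function names are made up for illustration.

#include <atomic.h>	/* the libspl header shown above */
#include <stdint.h>

static volatile uint64_t progress;	/* shared 64-bit counter (example) */

void
publisher_update(uint64_t done)
{
	/* Plain 64-bit store on LP64; atomic_store_64() call on 32-bit. */
	atomic_store_64(&progress, done);
}

uint64_t
reader_snapshot(void)
{
	/* Never observes a torn value, even on 32-bit builds. */
	return (atomic_load_64(&progress));
}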

module/zfs/aggsum.c

Lines changed: 66 additions & 59 deletions
@@ -78,21 +78,26 @@
  */
 
 /*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
+ * We will borrow 2^aggsum_borrow_shift times the current request, so we will
+ * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
+ * aggsum_add().
  */
-static uint_t aggsum_borrow_multiplier = 10;
+static uint_t aggsum_borrow_shift = 4;
 
 void
 aggsum_init(aggsum_t *as, uint64_t value)
 {
 	bzero(as, sizeof (*as));
 	as->as_lower_bound = as->as_upper_bound = value;
 	mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
-	as->as_numbuckets = boot_ncpus;
-	as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
-	    KM_SLEEP);
+	/*
+	 * Too many buckets may hurt read performance without improving
+	 * write.  From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc.
+	 */
+	as->as_bucketshift = highbit64(boot_ncpus / 6) / 2;
+	as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1;
+	as->as_buckets = kmem_zalloc(as->as_numbuckets *
+	    sizeof (aggsum_bucket_t), KM_SLEEP);
 	for (int i = 0; i < as->as_numbuckets; i++) {
 		mutex_init(&as->as_buckets[i].asc_lock,
 		    NULL, MUTEX_DEFAULT, NULL);
@@ -111,59 +116,49 @@ aggsum_fini(aggsum_t *as)
 int64_t
 aggsum_lower_bound(aggsum_t *as)
 {
-	return (as->as_lower_bound);
+	return (atomic_load_64(&as->as_lower_bound));
 }
 
-int64_t
+uint64_t
 aggsum_upper_bound(aggsum_t *as)
 {
-	return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
-	ASSERT(MUTEX_HELD(&as->as_lock));
-	ASSERT(MUTEX_HELD(&asb->asc_lock));
-
-	/*
-	 * We use atomic instructions for this because we read the upper and
-	 * lower bounds without the lock, so we need stores to be atomic.
-	 */
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-	    asb->asc_delta + asb->asc_borrowed);
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-	    asb->asc_delta - asb->asc_borrowed);
-	asb->asc_delta = 0;
-	asb->asc_borrowed = 0;
+	return (atomic_load_64(&as->as_upper_bound));
 }
 
 uint64_t
 aggsum_value(aggsum_t *as)
 {
-	int64_t rv;
+	int64_t lb;
+	uint64_t ub;
 
 	mutex_enter(&as->as_lock);
-	if (as->as_lower_bound == as->as_upper_bound) {
-		rv = as->as_lower_bound;
+	lb = as->as_lower_bound;
+	ub = as->as_upper_bound;
+	if (lb == ub) {
 		for (int i = 0; i < as->as_numbuckets; i++) {
 			ASSERT0(as->as_buckets[i].asc_delta);
 			ASSERT0(as->as_buckets[i].asc_borrowed);
 		}
 		mutex_exit(&as->as_lock);
-		return (rv);
+		return (lb);
 	}
 	for (int i = 0; i < as->as_numbuckets; i++) {
 		struct aggsum_bucket *asb = &as->as_buckets[i];
+		if (asb->asc_borrowed == 0)
+			continue;
 		mutex_enter(&asb->asc_lock);
-		aggsum_flush_bucket(as, asb);
+		lb += asb->asc_delta + asb->asc_borrowed;
+		ub += asb->asc_delta - asb->asc_borrowed;
+		asb->asc_delta = 0;
+		asb->asc_borrowed = 0;
 		mutex_exit(&asb->asc_lock);
 	}
-	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-	rv = as->as_lower_bound;
+	ASSERT3U(lb, ==, ub);
+	atomic_store_64(&as->as_lower_bound, lb);
+	atomic_store_64(&as->as_upper_bound, lb);
 	mutex_exit(&as->as_lock);
 
-	return (rv);
+	return (lb);
 }
 
 void
@@ -172,7 +167,8 @@ aggsum_add(aggsum_t *as, int64_t delta)
 	struct aggsum_bucket *asb;
 	int64_t borrow;
 
-	asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+	asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
+	    as->as_numbuckets];
 
 	/* Try fast path if we already borrowed enough before. */
 	mutex_enter(&asb->asc_lock);
@@ -188,21 +184,22 @@ aggsum_add(aggsum_t *as, int64_t delta)
 	 * We haven't borrowed enough.  Take the global lock and borrow
 	 * considering what is requested now and what we borrowed before.
 	 */
-	borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+	borrow = (delta < 0 ? -delta : delta);
+	borrow <<= aggsum_borrow_shift + as->as_bucketshift;
 	mutex_enter(&as->as_lock);
-	mutex_enter(&asb->asc_lock);
-	delta += asb->asc_delta;
-	asb->asc_delta = 0;
 	if (borrow >= asb->asc_borrowed)
 		borrow -= asb->asc_borrowed;
 	else
 		borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+	mutex_enter(&asb->asc_lock);
+	delta += asb->asc_delta;
+	asb->asc_delta = 0;
 	asb->asc_borrowed += borrow;
-	atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+	mutex_exit(&asb->asc_lock);
+	atomic_store_64(&as->as_lower_bound, as->as_lower_bound +
 	    delta - borrow);
-	atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+	atomic_store_64(&as->as_upper_bound, as->as_upper_bound +
 	    delta + borrow);
-	mutex_exit(&asb->asc_lock);
 	mutex_exit(&as->as_lock);
 }
 
@@ -214,27 +211,37 @@ aggsum_add(aggsum_t *as, int64_t delta)
 int
 aggsum_compare(aggsum_t *as, uint64_t target)
 {
-	if (as->as_upper_bound < target)
+	int64_t lb;
+	uint64_t ub;
+	int i;
+
+	if (atomic_load_64(&as->as_upper_bound) < target)
 		return (-1);
-	if (as->as_lower_bound > target)
+	lb = atomic_load_64(&as->as_lower_bound);
+	if (lb > 0 && (uint64_t)lb > target)
 		return (1);
 	mutex_enter(&as->as_lock);
-	for (int i = 0; i < as->as_numbuckets; i++) {
+	lb = as->as_lower_bound;
+	ub = as->as_upper_bound;
+	for (i = 0; i < as->as_numbuckets; i++) {
 		struct aggsum_bucket *asb = &as->as_buckets[i];
+		if (asb->asc_borrowed == 0)
+			continue;
 		mutex_enter(&asb->asc_lock);
-		aggsum_flush_bucket(as, asb);
+		lb += asb->asc_delta + asb->asc_borrowed;
+		ub += asb->asc_delta - asb->asc_borrowed;
+		asb->asc_delta = 0;
+		asb->asc_borrowed = 0;
 		mutex_exit(&asb->asc_lock);
-		if (as->as_upper_bound < target) {
-			mutex_exit(&as->as_lock);
-			return (-1);
-		}
-		if (as->as_lower_bound > target) {
-			mutex_exit(&as->as_lock);
-			return (1);
-		}
+		if (ub < target || (lb > 0 && (uint64_t)lb > target))
+			break;
+	}
+	if (i >= as->as_numbuckets) {
+		ASSERT3U(lb, ==, ub);
+		ASSERT3U(lb, ==, target);
 	}
-	VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-	ASSERT3U(as->as_lower_bound, ==, target);
+	atomic_store_64(&as->as_lower_bound, lb);
+	atomic_store_64(&as->as_upper_bound, ub);
 	mutex_exit(&as->as_lock);
-	return (0);
+	return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0);
 }
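To see what the new sizing logic in aggsum_init() works out to, here is a small standalone sketch (not part of the commit) that evaluates the formula for a few CPU counts. highbit64() is reimplemented as a plain loop standing in for the kernel helper, which returns the 1-based index of the highest set bit (0 for 0).

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the kernel's highbit64(): 1-based index of the highest set bit. */
static int
highbit64(uint64_t x)
{
	int h = 0;
	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	int cpus[] = { 1, 4, 8, 12, 24, 48, 96, 128, 256 };

	for (size_t i = 0; i < sizeof (cpus) / sizeof (cpus[0]); i++) {
		int ncpus = cpus[i];
		int shift = highbit64(ncpus / 6) / 2;
		int nbuckets = ((ncpus - 1) >> shift) + 1;

		printf("%3d CPUs -> bucketshift %d, %3d buckets\n",
		    ncpus, shift, nbuckets);
	}
	return (0);
}

On a 128-CPU system, for example, this yields bucketshift 2 and 32 buckets instead of the previous 128, shortening the bucket walk that aggsum_value() and aggsum_compare() still have to perform.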
