From e5ab9eff46b04c5a04778e40d7092fed3fda52ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Mar 2023 21:55:30 +0100 Subject: atomics: Provide atomic_add_negative() variants atomic_add_negative() does not provide the relaxed/acquire/release variants. Provide them in preparation for a new scalable reference count algorithm. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de --- include/linux/atomic/atomic-arch-fallback.h | 208 ++++++++++++++++++++++++++-- include/linux/atomic/atomic-instrumented.h | 68 ++++++++- include/linux/atomic/atomic-long.h | 38 ++++- 3 files changed, 303 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 77bc5522e61c..4226379a232d 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v) #define arch_atomic_inc_and_test arch_atomic_inc_and_test #endif +#ifndef arch_atomic_add_negative_relaxed +#ifdef arch_atomic_add_negative +#define arch_atomic_add_negative_acquire arch_atomic_add_negative +#define arch_atomic_add_negative_release arch_atomic_add_negative +#define arch_atomic_add_negative_relaxed arch_atomic_add_negative +#endif /* arch_atomic_add_negative */ + #ifndef arch_atomic_add_negative /** - * arch_atomic_add_negative - add and test if negative + * arch_atomic_add_negative - Add and test if negative * @i: integer value to add * @v: pointer of type atomic_t * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) @@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v) #define arch_atomic_add_negative arch_atomic_add_negative #endif +#ifndef arch_atomic_add_negative_acquire +/** + * arch_atomic_add_negative_acquire - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic_add_negative_acquire(int i, atomic_t *v) +{ + return arch_atomic_add_return_acquire(i, v) < 0; +} +#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire +#endif + +#ifndef arch_atomic_add_negative_release +/** + * arch_atomic_add_negative_release - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic_add_negative_release(int i, atomic_t *v) +{ + return arch_atomic_add_return_release(i, v) < 0; +} +#define arch_atomic_add_negative_release arch_atomic_add_negative_release +#endif + +#ifndef arch_atomic_add_negative_relaxed +/** + * arch_atomic_add_negative_relaxed - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic_add_negative_relaxed(int i, atomic_t *v) +{ + return arch_atomic_add_return_relaxed(i, v) < 0; +} +#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed +#endif + +#else /* arch_atomic_add_negative_relaxed */ + +#ifndef arch_atomic_add_negative_acquire +static __always_inline bool +arch_atomic_add_negative_acquire(int i, atomic_t *v) +{ + bool ret = arch_atomic_add_negative_relaxed(i, v); + __atomic_acquire_fence(); + return ret; +} +#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire +#endif + +#ifndef arch_atomic_add_negative_release +static __always_inline bool +arch_atomic_add_negative_release(int i, atomic_t *v) +{ + __atomic_release_fence(); + return arch_atomic_add_negative_relaxed(i, v); +} +#define arch_atomic_add_negative_release arch_atomic_add_negative_release +#endif + +#ifndef arch_atomic_add_negative +static __always_inline bool +arch_atomic_add_negative(int i, atomic_t *v) +{ + bool ret; + __atomic_pre_full_fence(); + ret = arch_atomic_add_negative_relaxed(i, v); + __atomic_post_full_fence(); + return ret; +} +#define arch_atomic_add_negative arch_atomic_add_negative +#endif + +#endif /* arch_atomic_add_negative_relaxed */ + #ifndef arch_atomic_fetch_add_unless /** * arch_atomic_fetch_add_unless - add unless the number is already a given value @@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v) #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test #endif +#ifndef arch_atomic64_add_negative_relaxed +#ifdef arch_atomic64_add_negative +#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative +#define arch_atomic64_add_negative_release arch_atomic64_add_negative +#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative +#endif /* arch_atomic64_add_negative */ + #ifndef arch_atomic64_add_negative /** - * arch_atomic64_add_negative - add and test if negative + * arch_atomic64_add_negative - Add and test if negative * @i: integer value to add * @v: pointer of type atomic64_t * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. */ static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) @@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v) #define arch_atomic64_add_negative arch_atomic64_add_negative #endif +#ifndef arch_atomic64_add_negative_acquire +/** + * arch_atomic64_add_negative_acquire - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic64_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return_acquire(i, v) < 0; +} +#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire +#endif + +#ifndef arch_atomic64_add_negative_release +/** + * arch_atomic64_add_negative_release - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic64_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return_release(i, v) < 0; +} +#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release +#endif + +#ifndef arch_atomic64_add_negative_relaxed +/** + * arch_atomic64_add_negative_relaxed - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic64_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return_relaxed(i, v) < 0; +} +#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed +#endif + +#else /* arch_atomic64_add_negative_relaxed */ + +#ifndef arch_atomic64_add_negative_acquire +static __always_inline bool +arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + bool ret = arch_atomic64_add_negative_relaxed(i, v); + __atomic_acquire_fence(); + return ret; +} +#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire +#endif + +#ifndef arch_atomic64_add_negative_release +static __always_inline bool +arch_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + __atomic_release_fence(); + return arch_atomic64_add_negative_relaxed(i, v); +} +#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release +#endif + +#ifndef arch_atomic64_add_negative +static __always_inline bool +arch_atomic64_add_negative(s64 i, atomic64_t *v) +{ + bool ret; + __atomic_pre_full_fence(); + ret = arch_atomic64_add_negative_relaxed(i, v); + __atomic_post_full_fence(); + return ret; +} +#define arch_atomic64_add_negative arch_atomic64_add_negative +#endif + +#endif /* arch_atomic64_add_negative_relaxed */ + #ifndef arch_atomic64_fetch_add_unless /** * arch_atomic64_fetch_add_unless - add unless the number is already a given value @@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// b5e87bdd5ede61470c29f7a7e4de781af3770f09 +// 00071fffa021cec66f6290d706d69c91df87bade diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 7a139ec030b0..0496816738ca 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v) return arch_atomic_add_negative(i, v); } +static __always_inline bool +atomic_add_negative_acquire(int i, atomic_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_add_negative_acquire(i, v); +} + +static __always_inline bool +atomic_add_negative_release(int i, atomic_t *v) +{ + kcsan_release(); + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_add_negative_release(i, v); +} + +static __always_inline bool +atomic_add_negative_relaxed(int i, atomic_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_add_negative_relaxed(i, v); +} + static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { @@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v) return arch_atomic64_add_negative(i, v); } +static __always_inline bool +atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic64_add_negative_acquire(i, v); +} + +static __always_inline bool +atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + kcsan_release(); + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic64_add_negative_release(i, v); +} + +static __always_inline bool +atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic64_add_negative_relaxed(i, v); +} + static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { @@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v) return arch_atomic_long_add_negative(i, v); } +static __always_inline bool +atomic_long_add_negative_acquire(long i, atomic_long_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_long_add_negative_acquire(i, v); +} + +static __always_inline bool +atomic_long_add_negative_release(long i, atomic_long_t *v) +{ + kcsan_release(); + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_long_add_negative_release(i, v); +} + +static __always_inline bool +atomic_long_add_negative_relaxed(long i, atomic_long_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_long_add_negative_relaxed(i, v); +} + static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { @@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 764f741eb77a7ad565dc8d99ce2837d5542e8aee +// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h index 800b8c35992d..2fc51ba66beb 100644 --- a/include/linux/atomic/atomic-long.h +++ b/include/linux/atomic/atomic-long.h @@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v) return arch_atomic64_add_negative(i, v); } +static __always_inline bool +arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v) +{ + return arch_atomic64_add_negative_acquire(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_release(long i, atomic_long_t *v) +{ + return arch_atomic64_add_negative_release(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) +{ + return arch_atomic64_add_negative_relaxed(i, v); +} + static __always_inline long arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { @@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v) return arch_atomic_add_negative(i, v); } +static __always_inline bool +arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v) +{ + return arch_atomic_add_negative_acquire(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_release(long i, atomic_long_t *v) +{ + return arch_atomic_add_negative_release(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) +{ + return arch_atomic_add_negative_relaxed(i, v); +} + static __always_inline long arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { @@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v) #endif /* CONFIG_64BIT */ #endif /* _LINUX_ATOMIC_LONG_H */ -// e8f0e08ff072b74d180eabe2ad001282b38c2c88 +// a194c07d7d2f4b0e178d3c118c919775d5d65f50 -- cgit v1.2.3 From ee1ee6db07795d9637bc5e8993a8ddcf886541ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Mar 2023 21:55:31 +0100 Subject: atomics: Provide rcuref - scalable reference counting atomic_t based reference counting, including refcount_t, uses atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is implemented with a atomic_try_cmpxchg() loop. High contention of the reference count leads to retry loops and scales badly. There is nothing to improve on this implementation as the semantics have to be preserved. Provide rcuref as a scalable alternative solution which is suitable for RCU managed objects. Similar to refcount_t it comes with overflow and underflow detection and mitigation. rcuref treats the underlying atomic_t as an unsigned integer and partitions this space into zones: 0x00000000 - 0x7FFFFFFF valid zone (1 .. (INT_MAX + 1) references) 0x80000000 - 0xBFFFFFFF saturation zone 0xC0000000 - 0xFFFFFFFE dead zone 0xFFFFFFFF no reference rcuref_get() unconditionally increments the reference count with atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the reference count with atomic_add_negative_release(). This unconditional increment avoids the inc_not_zero() problem, but requires a more complex implementation on the put() side when the count drops from 0 to -1. When this transition is detected then it is attempted to mark the reference count dead, by setting it to the midpoint of the dead zone with a single atomic_cmpxchg_release() operation. This operation can fail due to a concurrent rcuref_get() elevating the reference count from -1 to 0 again. If the unconditional increment in rcuref_get() hits a reference count which is marked dead (or saturated) it will detect it after the fact and bring back the reference count to the midpoint of the respective zone. The zones provide enough tolerance which makes it practically impossible to escape from a zone. The racy implementation of rcuref_put() requires to protect rcuref_put() against a grace period ending in order to prevent a subtle use after free. As RCU is the only mechanism which allows to protect against that, it is not possible to fully replace the atomic_inc_not_zero() based implementation of refcount_t with this scheme. The final drop is slightly more expensive than the atomic_dec_return() counterpart, but that's not the case which this is optimized for. The optimization is on the high frequeunt get()/put() pairs and their scalability. The performance of an uncontended rcuref_get()/put() pair where the put() is not dropping the last reference is still on par with the plain atomic operations, while at the same time providing overflow and underflow detection and mitigation. The performance of rcuref compared to plain atomic_inc_not_zero() and atomic_dec_return() based reference counting under contention: - Micro benchmark: All CPUs running a increment/decrement loop on an elevated reference count, which means the 0 to -1 transition never happens. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.3X to 4.7X - Conversion of dst_entry::__refcnt to rcuref and testing with the localhost memtier/memcached benchmark. That benchmark shows the reference count contention prominently. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.1X to 2.6X over the previous fix for the false sharing issue vs. struct dst_entry::__refcnt. When memtier is run over a real 1Gb network connection, there is a small gain on top of the false sharing fix. The two changes combined result in a 2%-5% total gain for that networked test. Reported-by: Wangyang Guo Reported-by: Arjan Van De Ven Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de --- include/linux/rcuref.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/types.h | 6 ++ 2 files changed, 161 insertions(+) create mode 100644 include/linux/rcuref.h (limited to 'include') diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h new file mode 100644 index 000000000000..2c8bfd0f1b6b --- /dev/null +++ b/include/linux/rcuref.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_RCUREF_H +#define _LINUX_RCUREF_H + +#include +#include +#include +#include +#include +#include + +#define RCUREF_ONEREF 0x00000000U +#define RCUREF_MAXREF 0x7FFFFFFFU +#define RCUREF_SATURATED 0xA0000000U +#define RCUREF_RELEASED 0xC0000000U +#define RCUREF_DEAD 0xE0000000U +#define RCUREF_NOREF 0xFFFFFFFFU + +/** + * rcuref_init - Initialize a rcuref reference count with the given reference count + * @ref: Pointer to the reference count + * @cnt: The initial reference count typically '1' + */ +static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) +{ + atomic_set(&ref->refcnt, cnt - 1); +} + +/** + * rcuref_read - Read the number of held reference counts of a rcuref + * @ref: Pointer to the reference count + * + * Return: The number of held references (0 ... N) + */ +static inline unsigned int rcuref_read(rcuref_t *ref) +{ + unsigned int c = atomic_read(&ref->refcnt); + + /* Return 0 if within the DEAD zone. */ + return c >= RCUREF_RELEASED ? 0 : c + 1; +} + +extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); + +/** + * rcuref_get - Acquire one reference on a rcuref reference count + * @ref: Pointer to the reference count + * + * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See documentation in lib/rcuref.c + * + * Return: + * False if the attempt to acquire a reference failed. This happens + * when the last reference has been put already + * + * True if a reference was successfully acquired + */ +static inline __must_check bool rcuref_get(rcuref_t *ref) +{ + /* + * Unconditionally increase the reference count. The saturation and + * dead zones provide enough tolerance for this. + */ + if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) + return true; + + /* Handle the cases inside the saturation and dead zones */ + return rcuref_get_slowpath(ref); +} + +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); + +/* + * Internal helper. Do not invoke directly. + */ +static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), + "suspicious rcuref_put_rcusafe() usage"); + /* + * Unconditionally decrease the reference count. The saturation and + * dead zones provide enough tolerance for this. + */ + if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + return false; + + /* + * Handle the last reference drop and cases inside the saturation + * and dead zones. + */ + return rcuref_put_slowpath(ref); +} + +/** + * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe + * @ref: Pointer to the reference count + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Can be invoked from contexts, which guarantee that no grace period can + * happen which would free the object concurrently if the decrement drops + * the last reference and the slowpath races against a concurrent get() and + * put() pair. rcu_read_lock()'ed and atomic contexts qualify. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely release the + * object which is protected by the reference counter. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * release the protected object. + */ +static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) +{ + return __rcuref_put(ref); +} + +/** + * rcuref_put -- Release one reference for a rcuref reference count + * @ref: Pointer to the reference count + * + * Can be invoked from any context. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Return: + * + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +static inline __must_check bool rcuref_put(rcuref_t *ref) +{ + bool released; + + preempt_disable(); + released = __rcuref_put(ref); + preempt_enable(); + return released; +} + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index ea8cf60a8a79..688fb943556a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -175,6 +175,12 @@ typedef struct { } atomic64_t; #endif +typedef struct { + atomic_t refcnt; +} rcuref_t; + +#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i - 1) } + struct list_head { struct list_head *next, *prev; }; -- cgit v1.2.3