author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2026-01-23 11:21:37 +0100
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2026-01-23 11:21:37 +0100
commit    3282f1c427e18feb61774c37a034fbb3622177b4 (patch)
tree      109d5873614af7741ae06b968bd5f253ef39c613 /mm/page_alloc.c
parent    0d579622a3b0bb76742ffa8becc103fe35934dca (diff)
parent    5dfbc5357c34bdf81c84aa78bc8e3d6d9ba10aad (diff)
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  74
1 file changed, 58 insertions(+), 16 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e644f2744c2..623f6e5b583a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -166,6 +166,33 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
#define pcp_spin_unlock(ptr) \
pcpu_spin_unlock(lock, ptr)
+/*
+ * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (e.g.
+ * for a potentially remote cpu drain) and get interrupted by an operation
+ * that attempts pcp_spin_trylock(), we can't rely on the trylock failing,
+ * because UP spinlock assumptions make the trylock an unconditional success.
+ * So we have to turn that spin_lock() into a spin_lock_irqsave(). This works
+ * because on UP there are no remote CPUs, so only the local one can be locked.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+static inline void __flags_noop(unsigned long *flags) { }
+#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
+({ \
+ __flags_noop(&(flags)); \
+ spin_lock(&(ptr)->lock); \
+})
+#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
+({ \
+ spin_unlock(&(ptr)->lock); \
+ __flags_noop(&(flags)); \
+})
+#else
+#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
+ spin_lock_irqsave(&(ptr)->lock, flags)
+#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
+ spin_unlock_irqrestore(&(ptr)->lock, flags)
+#endif
+
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
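
On UP builds spin_lock() compiles down to little more than preempt_disable() and spin_trylock() succeeds unconditionally, which is why the #else branch above must disable interrupts. Below is a minimal standalone userspace sketch of the same flag-save plumbing, not kernel code: pthread_mutex_t stands in for the pcp spinlock, UP_BUILD for the !CONFIG_SMP && !CONFIG_PREEMPT_RT case, and the stubbed save_flags()/restore_flags() for local_irq_save()/local_irq_restore(), which have no userspace equivalent.

	/*
	 * Userspace model of the maybe_irqsave macro pair; NOT kernel code.
	 * Build with -DUP_BUILD to take the irqsave path.
	 */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	unsigned long save_flags(void)      { return 0; }  /* stub */
	void restore_flags(unsigned long f) { (void)f; }   /* stub */

	#ifndef UP_BUILD
	/* SMP/PREEMPT_RT case: a plain lock suffices; flags stay untouched. */
	#define lock_maybe_irqsave(flags) \
		do { (void)(flags); pthread_mutex_lock(&lock); } while (0)
	#define unlock_maybe_irqrestore(flags) \
		do { pthread_mutex_unlock(&lock); (void)(flags); } while (0)
	#else
	/* UP case: also "disable interrupts" around the critical section. */
	#define lock_maybe_irqsave(flags) \
		do { (flags) = save_flags(); pthread_mutex_lock(&lock); } while (0)
	#define unlock_maybe_irqrestore(flags) \
		do { pthread_mutex_unlock(&lock); restore_flags(flags); } while (0)
	#endif

	int main(void)
	{
		unsigned long flags = 0;  /* init only to keep the SMP path warning-free */

		lock_maybe_irqsave(flags);
		puts("critical section");
		unlock_maybe_irqrestore(flags);
		return 0;
	}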
@@ -2552,10 +2579,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to decay the PCP high.
* Return whether there is additional work to do.
*/
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
- int high_min, to_drain, batch;
- int todo = 0;
+ int high_min, to_drain, to_drain_batched, batch;
+ unsigned long UP_flags;
+ bool todo = false;
high_min = READ_ONCE(pcp->high_min);
batch = READ_ONCE(pcp->batch);
@@ -2568,15 +2596,18 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
pcp->high - (pcp->high >> 3), high_min);
if (pcp->high > high_min)
- todo++;
+ todo = true;
}
to_drain = pcp->count - pcp->high;
- if (to_drain > 0) {
- spin_lock(&pcp->lock);
- free_pcppages_bulk(zone, to_drain, pcp, 0);
- spin_unlock(&pcp->lock);
- todo++;
+ while (to_drain > 0) {
+ to_drain_batched = min(to_drain, batch);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ todo = true;
+
+ to_drain -= to_drain_batched;
}
return todo;
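
The behavioral change in this hunk is that the drain is split into min(to_drain, batch) chunks, so neither the pcp lock nor (on UP) interrupts are held across one unbounded free. A standalone userspace model of that loop, with hypothetical lock_pcp()/unlock_pcp() helpers in place of pcp_spin_lock_maybe_irqsave()/pcp_spin_unlock_maybe_irqrestore() and a plain counter update in place of free_pcppages_bulk():

	/* Userspace model of the batched drain in decay_pcp_high(); NOT kernel code. */
	#include <stdbool.h>
	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	static int pcp_count = 100;   /* pages currently on the pcp list */

	static void lock_pcp(void)   { }  /* stand-in: take pcp->lock, maybe irqsave */
	static void unlock_pcp(void) { }  /* stand-in: drop pcp->lock, maybe irqrestore */

	static bool drain_to_high(int high, int batch)
	{
		int to_drain = pcp_count - high;
		bool todo = false;

		while (to_drain > 0) {
			/* Free at most one batch per lock hold. */
			int to_drain_batched = MIN(to_drain, batch);

			lock_pcp();
			pcp_count -= to_drain_batched;  /* free_pcppages_bulk() */
			unlock_pcp();
			todo = true;

			to_drain -= to_drain_batched;
		}
		return todo;
	}

	int main(void)
	{
		/* e.g. 100 cached pages, high watermark 20, batch 31: three lock holds */
		bool todo = drain_to_high(20, 31);

		printf("did work: %d, pages left: %d\n", todo, pcp_count);
		return 0;
	}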
@@ -2590,14 +2621,15 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
+ unsigned long UP_flags;
int to_drain, batch;
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
}
}
#endif
@@ -2608,10 +2640,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ unsigned long UP_flags;
int count;
do {
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
count = pcp->count;
if (count) {
int to_drain = min(count,
@@ -2620,7 +2653,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
free_pcppages_bulk(zone, to_drain, pcp, 0);
count -= to_drain;
}
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
} while (count);
}
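
Unlike the decay path, this loop re-reads pcp->count under the lock on each pass, so pages freed into the pcp concurrently are drained too, while each lock hold still frees at most one scaled batch. A standalone model of that shape, with BATCH_SCALE_MAX as an assumed stand-in for the CONFIG_PCP_BATCH_SCALE_MAX Kconfig value and the locking reduced to comments:

	/* Userspace model of the drain_pages_zone() loop; NOT kernel code. */
	#include <stdio.h>

	#define BATCH_SCALE_MAX 5  /* assumed; the real value is a Kconfig option */
	#define MIN(a, b)       ((a) < (b) ? (a) : (b))

	static int pcp_count = 5000;  /* pages currently on the pcp list */
	static const int batch = 63;

	static void drain_pages_zone_model(void)
	{
		int count;

		do {
			/* lock pcp (maybe irqsave on UP) */
			count = pcp_count;  /* re-read under the lock each pass */
			if (count) {
				int to_drain = MIN(count, batch << BATCH_SCALE_MAX);

				pcp_count -= to_drain;  /* free_pcppages_bulk() */
				count -= to_drain;
			}
			/* unlock pcp */
		} while (count);
	}

	int main(void)
	{
		drain_pages_zone_model();  /* 5000 pages, cap 2016: three lock holds */
		printf("pages left: %d\n", pcp_count);
		return 0;
	}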
@@ -6078,6 +6111,7 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
+ unsigned long UP_flags;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
cci = get_cpu_cacheinfo(cpu);
@@ -6088,12 +6122,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
* This can reduce zone lock contention without hurting
* cache-hot pages sharing.
*/
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
pcp->flags |= PCPF_FREE_HIGH_BATCH;
else
pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
}
void setup_pcp_cacheinfo(unsigned int cpu)
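
The heuristic above sets PCPF_FREE_HIGH_BATCH only when the CPU's private slice of the last-level cache exceeds three pcp batches worth of pages. A worked example of that threshold, assuming 4 KiB pages (PAGE_SHIFT == 12) and a batch of 63; both numbers are illustrative, not taken from the patch:

	/* Worked example of the PCPF_FREE_HIGH_BATCH threshold; NOT kernel code. */
	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12  /* assumed 4 KiB pages */

	static bool want_free_high_batch(unsigned long slice_bytes, int batch)
	{
		/* Same comparison as the hunk above, in pages. */
		return (slice_bytes >> PAGE_SHIFT) > 3UL * batch;
	}

	int main(void)
	{
		/* 1 MiB slice = 256 pages > 3 * 63 = 189: flag would be set */
		printf("1 MiB slice, batch 63   -> %d\n",
		       want_free_high_batch(1UL << 20, 63));
		/* 512 KiB slice = 128 pages <= 189: flag would be cleared */
		printf("512 KiB slice, batch 63 -> %d\n",
		       want_free_high_batch(1UL << 19, 63));
		return 0;
	}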
@@ -6611,11 +6645,19 @@ static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *
int old_percpu_pagelist_high_fraction;
int ret;
+ /*
+ * Avoid using pcp_batch_high_lock for reads as the value is read
+ * atomically and a race with offlining is harmless.
+ */
+
+ if (!write)
+ return proc_dointvec_minmax(table, write, buffer, length, ppos);
+
mutex_lock(&pcp_batch_high_lock);
old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || ret < 0)
+ if (ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
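
With the early return above, readers of the sysctl never touch pcp_batch_high_lock; only writers serialize on it before updating percpu_pagelist_high_fraction. A userspace sketch of that lock-only-for-writes shape, with proc_handler() as a hypothetical stand-in for proc_dointvec_minmax() and a plain int read modeling the "read atomically" assumption in the comment:

	/* Userspace sketch of the read fast path; NOT kernel code. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t high_fraction_lock = PTHREAD_MUTEX_INITIALIZER;
	static int high_fraction;

	/* Hypothetical stand-in for proc_dointvec_minmax(). */
	static int proc_handler(bool write, int *val)
	{
		if (write)
			high_fraction = *val;  /* range checking elided */
		else
			*val = high_fraction;
		return 0;
	}

	static int handle_sysctl(bool write, int *val)
	{
		int ret;

		/* Read path: a racy-but-harmless read, no lock needed. */
		if (!write)
			return proc_handler(write, val);

		/* Write path: serialize against concurrent writers. */
		pthread_mutex_lock(&high_fraction_lock);
		ret = proc_handler(write, val);
		pthread_mutex_unlock(&high_fraction_lock);
		return ret;
	}

	int main(void)
	{
		int v = 8;

		handle_sysctl(true, &v);   /* write: takes the mutex */
		handle_sysctl(false, &v);  /* read: skips the mutex */
		printf("high_fraction = %d\n", v);
		return 0;
	}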