7 files changed, 134 insertions, 57 deletions
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index c8f4f0a0b874..fc879ac4cc4f 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -89,6 +89,7 @@ int populate_cache_leaves(unsigned int cpu);
 int cache_setup_acpi(unsigned int cpu);
 bool last_level_cache_is_valid(unsigned int cpu);
 bool last_level_cache_is_shared(unsigned int cpu_x, unsigned int cpu_y);
+struct cacheinfo *get_cpu_cacheinfo_llc(unsigned int cpu);
 int fetch_cache_info(unsigned int cpu);
 int detect_cache_attributes(unsigned int cpu);
 #ifndef CONFIG_ACPI_PPTT
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2fc552b3924a..5cadb00d9352 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1223,6 +1223,8 @@ struct mm_struct {
 		/* MM CID related storage */
 		struct mm_mm_cid mm_cid;
 
+		/* sched_cache related statistics */
+		struct sched_cache_stat sc_stat;
 #ifdef CONFIG_MMU
 		atomic_long_t pgtables_bytes;	/* size of all page tables */
 #endif
@@ -1619,6 +1621,36 @@ static inline unsigned int mm_cid_size(void)
 # define MM_CID_STATIC_SIZE	0
 #endif /* CONFIG_SCHED_MM_CID */
 
+#ifdef CONFIG_SCHED_CACHE
+void mm_init_sched(struct mm_struct *mm,
+		   struct sched_cache_time __percpu *pcpu_sched);
+
+static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
+{
+	struct sched_cache_time __percpu *stats;
+
+	/* Only hand the per-CPU area to mm_init_sched() once it exists. */
+	stats = alloc_percpu_noprof(struct sched_cache_time);
+	if (!stats)
+		return -ENOMEM;
+	mm_init_sched(mm, stats);
+	return 0;
+}
+
+#define mm_alloc_sched(...)	alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
+
+static inline void mm_destroy_sched(struct mm_struct *mm)
+{
+	free_percpu(mm->sc_stat.pcpu_sched);
+	mm->sc_stat.pcpu_sched = NULL;	/* leave no dangling per-CPU pointer */
+}
+#else /* !CONFIG_SCHED_CACHE */
+
+static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_sched(struct mm_struct *mm) { }
+
+#endif /* CONFIG_SCHED_CACHE */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd9488751f51..b3204a15d512 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -161,7 +161,7 @@ struct user_event_mm;
  */
 #define is_special_task_state(state)					\
 	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED |	\
-		    TASK_DEAD | TASK_FROZEN))
+		    TASK_DEAD | TASK_WAKING | TASK_FROZEN))
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 # define debug_normal_state_change(state_value)				\
@@ -702,6 +702,11 @@ struct sched_dl_entity {
 	 * running, skipping the defer phase.
 	 *
 	 * @dl_defer_idle tracks idle state
+	 *
+	 * @dl_bw_attached tells if this server's bandwidth currently
+	 * contributes to the root domain's total_bw. Only meaningful for server
+	 * entities (@dl_server == 1). Allows toggling the reservation on/off
+	 * without losing the configured @dl_runtime/@dl_period.
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_yielded        : 1;
@@ -713,6 +718,7 @@ struct sched_dl_entity {
 	unsigned int			dl_defer_armed	  : 1;
 	unsigned int			dl_defer_running  : 1;
 	unsigned int			dl_defer_idle     : 1;
+	unsigned int			dl_bw_attached    : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -846,7 +852,11 @@ struct task_struct {
 	struct alloc_tag		*alloc_tag;
 #endif
 
-	int				on_cpu;
+	u8				on_cpu;
+	u8				on_rq;
+	u8				is_blocked;
+	u8				__pad;
+
 	struct __call_single_node	wake_entry;
 	unsigned int			wakee_flips;
 	unsigned long			wakee_flip_decay_ts;
@@ -861,7 +871,6 @@ struct task_struct {
 	 */
 	int				recent_used_cpu;
 	int				wake_cpu;
-	int				on_rq;
 
 	int				prio;
 	int				static_prio;
@@ -1243,6 +1252,13 @@ struct task_struct {
 	struct mutex			*blocked_on;	/* lock we're blocked on */
 	raw_spinlock_t			blocked_lock;
 
+	/*
+	 * The task that is boosting this task; a back link for the current
+	 * donor stack. Set in schedule() -> find_proxy_task() and only stable
+	 * under preempt_disable().
+	 */
+	struct task_struct		*blocked_donor;
+
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
 	 * Encoded lock address causing task block (lower 2 bits = type from
@@ -1403,6 +1419,13 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_SCHED_CACHE
+	struct callback_head		cache_work;
+	int				preferred_llc;
+	/* 1: task was enqueued to its preferred LLC, 0 otherwise */
+	int				pref_llc_queued;
+#endif
+
 	struct rseq_data		rseq;
 	struct sched_mm_cid		mm_cid;
 
@@ -2177,19 +2200,10 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
 
 #ifndef CONFIG_PREEMPT_RT
 
-/*
- * With proxy exec, if a task has been proxy-migrated, it may be a donor
- * on a cpu that it can't actually run on. Thus we need a special state
- * to denote that the task is being woken, but that it needs to be
- * evaluated for return-migration before it is run. So if the task is
- * blocked_on PROXY_WAKING, return migrate it before running it.
- */
-#define PROXY_WAKING ((struct mutex *)(-1L))
-
 static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
 {
 	lockdep_assert_held_once(&p->blocked_lock);
-	return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on;
+	return p->blocked_on;
 }
 
 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2217,7 +2231,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
 	 * blocked_on relationships, but make sure we are not
 	 * clearing the relationship with a different lock.
 	 */
-	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
 	p->blocked_on = NULL;
 }
 
@@ -2226,35 +2240,6 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 	guard(raw_spinlock_irqsave)(&p->blocked_lock);
 	__clear_task_blocked_on(p, m);
 }
-
-static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
-{
-	/* Currently we serialize blocked_on under the task::blocked_lock */
-	lockdep_assert_held_once(&p->blocked_lock);
-
-	if (!sched_proxy_exec()) {
-		__clear_task_blocked_on(p, m);
-		return;
-	}
-
-	/* Don't set PROXY_WAKING if blocked_on was already cleared */
-	if (!p->blocked_on)
-		return;
-	/*
-	 * There may be cases where we set PROXY_WAKING on tasks that were
-	 * already set to waking, but make sure we are not changing
-	 * the relationship with a different lock.
-	 */
-	WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
-	p->blocked_on = PROXY_WAKING;
-}
-
-static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
-{
-	guard(raw_spinlock_irqsave)(&p->blocked_lock);
-	__set_task_blocked_on_waking(p, m);
-}
-
 #else
 static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
@@ -2263,14 +2248,6 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute
 static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
 }
-
-static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
-{
-}
-
-static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
-{
-}
 #endif /* !CONFIG_PREEMPT_RT */
 
 static __always_inline bool need_resched(void)
@@ -2403,6 +2380,29 @@ static __always_inline int task_mm_cid(struct task_struct *t)
 }
 #endif
 
+#ifdef CONFIG_SCHED_CACHE
+
+struct sched_cache_time {
+	u64 runtime;
+	unsigned long epoch;
+};
+
+struct sched_cache_stat {
+	struct sched_cache_time __percpu *pcpu_sched;	/* freed by mm_destroy_sched() */
+	raw_spinlock_t lock;
+	unsigned long epoch;
+	u64 nr_running_avg;
+	unsigned long next_scan;
+	unsigned long footprint;
+	int cpu;
+} ____cacheline_aligned_in_smp;
+
+#else
+
+struct sched_cache_stat { };
+
+#endif
+
 #ifndef MODULE
 #ifndef COMPILE_OFFSETS
 
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 196f0ca351a2..39f0a7f94bfc 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -33,6 +33,11 @@ extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_init(void);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline int sched_clock_stable(void)
+{
+	return 1;	/* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: always stable */
+}
+
 static inline void sched_clock_tick(void)
 {
 }
diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
index 166b19af956f..cde6679c0278 100644
--- a/include/linux/sched/smt.h
+++ b/include/linux/sched/smt.h
@@ -4,16 +4,12 @@
 
 #include <linux/static_key.h>
 
-#ifdef CONFIG_SCHED_SMT
 extern struct static_key_false sched_smt_present;
 
 static __always_inline bool sched_smt_active(void)
 {
 	return static_branch_likely(&sched_smt_present);
 }
-#else
-static __always_inline bool sched_smt_active(void) { return false; }
-#endif
 
 void arch_smt_update(void);
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 36553e14866d..b5d9d7c2b8ad 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -67,7 +67,25 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
-	int		nr_idle_scan;
+	union {
+		int	nr_idle_scan;
+		/*
+		 * Used during allocation to claim the sched_domain_shared
+		 * object at multiple levels.
+		 *
+		 * Note: between build and the first periodic LB tick, which
+		 * rewrites the union via update_idle_cpu_scan(), readers of
+		 * nr_idle_scan may observe the transient SD_* flag value as
+		 * the scan bound. The flag bits are small positive integers,
+		 * so the effect is just a slightly relaxed scan bound for one
+		 * window and self-heals on the first tick.
+		 */
+		int	alloc_flags;
+	};
+#ifdef CONFIG_SCHED_CACHE
+	unsigned long	util_avg;
+	unsigned long	capacity;
+#endif
 };
 
 struct sched_domain {
@@ -99,6 +117,12 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
+#ifdef CONFIG_SCHED_CACHE
+	unsigned int llc_max;
+	unsigned int *llc_counts __counted_by_ptr(llc_max);
+	unsigned long llc_bytes;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* sched_balance_rq() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@@ -256,4 +280,10 @@ static inline int task_node(const struct task_struct *p)
 	return cpu_to_node(task_cpu(p));
 }
 
+#ifdef CONFIG_SCHED_CACHE
+extern void sched_update_llc_bytes(unsigned int cpu);
+#else
+static inline void sched_update_llc_bytes(unsigned int cpu) { }
+#endif
+
 #endif /* _LINUX_SCHED_TOPOLOGY_H */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 6575af39fd10..709a2dcf4c73 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -230,11 +230,24 @@ static inline int cpu_to_mem(int cpu)
 #define topology_drawer_cpumask(cpu)		cpumask_of(cpu)
 #endif
 
-#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
+/*
+ * Defining cpu_smt_mask() as cpumask_of() that CPU avoids a lot of
+ * ifdeffery all around the codebase when CONFIG_SCHED_SMT=n.
+ * It just means that the CPU has no other siblings, which is what
+ * is expected.
+ */
+#if defined(CONFIG_SCHED_SMT)
+# if !defined(cpu_smt_mask)
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
 	return topology_sibling_cpumask(cpu);
 }
+# endif
+#else	/* !CONFIG_SCHED_SMT */
+static inline const struct cpumask *cpu_smt_mask(int cpu)
+{
+	return cpumask_of(cpu);
+}
 #endif
 
 #ifndef topology_is_primary_thread