23 files changed, 1789 insertions, 1248 deletions
diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig
new file mode 100644
index 000000000000..d60a611b2853
--- /dev/null
+++ b/kernel/time/.kunitconfig
@@ -0,0 +1,2 @@
+CONFIG_KUNIT=y
+CONFIG_TIME_KUNIT_TEST=y
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 7c6a52f7836c..02aac7c5aa76 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -9,14 +9,13 @@
 config CLOCKSOURCE_WATCHDOG
 	bool
 
-# Architecture has extra clocksource data
-config ARCH_CLOCKSOURCE_DATA
-	bool
-
 # Architecture has extra clocksource init called from registration
 config ARCH_CLOCKSOURCE_INIT
 	bool
 
+config ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+	bool
+
 # Timekeeping vsyscall support
 config GENERIC_TIME_VSYSCALL
 	bool
@@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
 config GENERIC_CLOCKEVENTS_MIN_ADJUST
 	bool
 
+config GENERIC_CLOCKEVENTS_COUPLED
+	bool
+
+config GENERIC_CLOCKEVENTS_COUPLED_INLINE
+	select GENERIC_CLOCKEVENTS_COUPLED
+	bool
+
 # Generic update of CMOS clock
 config GENERIC_CMOS_UPDATE
 	bool
 
+# Deferred rearming of the hrtimer interrupt
+config HRTIMER_REARM_DEFERRED
+       def_bool y
+       depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
+       depends on HIGH_RES_TIMERS && SCHED_HRTICK
+
 # Select to handle posix CPU timers from task_work
 # and not from the timer interrupt context
 config HAVE_POSIX_CPU_TIMERS_TASK_WORK
@@ -196,18 +208,6 @@ config HIGH_RES_TIMERS
 	  hardware is not capable then this option only increases
 	  the size of the kernel image.
 
-config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-	int "Clocksource watchdog maximum allowable skew (in microseconds)"
-	depends on CLOCKSOURCE_WATCHDOG
-	range 50 1000
-	default 125
-	help
-	  Specify the maximum amount of allowable watchdog skew in
-	  microseconds before reporting the clocksource to be unstable.
-	  The default is based on a half-second clocksource watchdog
-	  interval and NTP's maximum frequency drift of 500 parts
-	  per million.	If the clocksource is good enough for NTP,
-	  it is good enough for the clocksource watchdog!
 endif
 
 config POSIX_AUX_CLOCKS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f7d52d9543cc..eaf290c972f9 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY)		+= vsyscall.o
 obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
 obj-$(CONFIG_TIME_NS)				+= namespace.o
+obj-$(CONFIG_TIME_NS_VDSO)			+= namespace_vdso.o
 obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG)		+= clocksource-wdtest.o
 obj-$(CONFIG_TIME_KUNIT_TEST)			+= time_test.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 069d93bfb0c7..6e173d70d825 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -234,19 +234,23 @@ static int alarmtimer_suspend(struct device *dev)
 	if (!rtc)
 		return 0;
 
-	/* Find the soonest timer to expire*/
+	/* Find the soonest timer to expire */
 	for (i = 0; i < ALARM_NUMTYPE; i++) {
 		struct alarm_base *base = &alarm_bases[i];
 		struct timerqueue_node *next;
+		ktime_t next_expires;
 		ktime_t delta;
 
-		scoped_guard(spinlock_irqsave, &base->lock)
+		scoped_guard(spinlock_irqsave, &base->lock) {
 			next = timerqueue_getnext(&base->timerqueue);
+			if (next)
+				next_expires = next->expires;
+		}
 		if (!next)
 			continue;
-		delta = ktime_sub(next->expires, base->get_ktime());
+		delta = ktime_sub(next_expires, base->get_ktime());
 		if (!min || (delta < min)) {
-			expires = next->expires;
+			expires = next_expires;
 			min = delta;
 			type = i;
 		}
@@ -540,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
 {
 	struct alarm *alarm = &timr->it.alarm.alarmtimer;
 
-	return alarm_forward(alarm, timr->it_interval, now);
+	return alarm_forward(alarm, now, timr->it_interval);
 }
 
 /**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index eaae1ce9f060..0014d163f989 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,6 +94,9 @@ static int __clockevents_switch_state(struct clock_event_device *dev,
 	if (dev->features & CLOCK_EVT_FEAT_DUMMY)
 		return 0;
 
+	/* On state transitions clear the forced flag unconditionally */
+	dev->next_event_forced = 0;
+
 	/* Transition with new state-specific callbacks */
 	switch (state) {
 	case CLOCK_EVT_STATE_DETACHED:
@@ -172,6 +175,7 @@ void clockevents_shutdown(struct clock_event_device *dev)
 {
 	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 	dev->next_event = KTIME_MAX;
+	dev->next_event_forced = 0;
 }
 
 /**
@@ -292,6 +296,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 
 #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE
+#include <asm/clock_inlined.h>
+#else
+static __always_inline void
+arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { }
+#endif
+
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+	u64 cycles;
+
+	if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED)))
+		return false;
+
+	if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles)))
+		return false;
+
+	if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE))
+		arch_inlined_clockevent_set_next_coupled(cycles, dev);
+	else
+		dev->set_next_coupled(cycles, dev);
+	return true;
+}
+
+#else
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+	return false;
+}
+#endif
+
 /**
  * clockevents_program_event - Reprogram the clock event device.
  * @dev:	device to program
@@ -300,12 +336,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
  *
  * Returns 0 on success, -ETIME when the event is in the past.
  */
-int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
-			      bool force)
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force)
 {
-	unsigned long long clc;
 	int64_t delta;
-	int rc;
+	u64 cycles;
 
 	if (WARN_ON_ONCE(expires < 0))
 		return -ETIME;
@@ -319,21 +353,37 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 	WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
 		  clockevent_get_state(dev));
 
-	/* Shortcut for clockevent devices that can deal with ktime. */
-	if (dev->features & CLOCK_EVT_FEAT_KTIME)
+	/* ktime_t based reprogramming for the broadcast hrtimer device */
+	if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER))
 		return dev->set_next_ktime(expires, dev);
 
+	if (likely(clockevent_set_next_coupled(dev, expires)))
+		return 0;
+
 	delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
-	if (delta <= 0)
-		return force ? clockevents_program_min_delta(dev) : -ETIME;
 
-	delta = min(delta, (int64_t) dev->max_delta_ns);
-	delta = max(delta, (int64_t) dev->min_delta_ns);
+	/* Required for tick_periodic() during early boot */
+	if (delta <= 0 && !force)
+		return -ETIME;
 
-	clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
-	rc = dev->set_next_event((unsigned long) clc, dev);
+	if (delta > (int64_t)dev->min_delta_ns) {
+		delta = min(delta, (int64_t) dev->max_delta_ns);
+		cycles = ((u64)delta * dev->mult) >> dev->shift;
+		if (!dev->set_next_event((unsigned long) cycles, dev)) {
+			dev->next_event_forced = 0;
+			return 0;
+		}
+	}
+
+	if (dev->next_event_forced)
+		return 0;
 
-	return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+	if (dev->set_next_event(dev->min_delta_ticks, dev)) {
+		if (!force || clockevents_program_min_delta(dev))
+			return -ETIME;
+	}
+	dev->next_event_forced = 1;
+	return 0;
 }
 
 /*
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index 38dae590b29f..b4cf17b4aeed 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -3,202 +3,196 @@
  * Unit test for the clocksource watchdog.
  *
  * Copyright (C) 2021 Facebook, Inc.
+ * Copyright (C) 2026 Intel Corp.
  *
  * Author: Paul E. McKenney <paulmck@kernel.org>
+ * Author: Thomas Gleixner <tglx@kernel.org>
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/device.h>
 #include <linux/clocksource.h>
-#include <linux/init.h>
+#include <linux/delay.h>
 #include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
 #include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/prandom.h>
-#include <linux/cpu.h>
 
 #include "tick-internal.h"
+#include "timekeeping_internal.h"
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Clocksource watchdog unit test");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>");
+
+enum wdtest_states {
+	WDTEST_INJECT_NONE,
+	WDTEST_INJECT_DELAY,
+	WDTEST_INJECT_POSITIVE,
+	WDTEST_INJECT_NEGATIVE,
+	WDTEST_INJECT_PERCPU	= 0x100,
+};
 
-static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
-module_param(holdoff, int, 0444);
-MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+static enum wdtest_states wdtest_state;
+static unsigned long wdtest_test_count;
+static ktime_t wdtest_last_ts, wdtest_offset;
 
-/* Watchdog kthread's task_struct pointer for debug purposes. */
-static struct task_struct *wdtest_task;
+#define SHIFT_4000PPM	8
 
-static u64 wdtest_jiffies_read(struct clocksource *cs)
+static ktime_t wdtest_get_offset(struct clocksource *cs)
 {
-	return (u64)jiffies;
-}
-
-static struct clocksource clocksource_wdtest_jiffies = {
-	.name			= "wdtest-jiffies",
-	.rating			= 1, /* lowest valid rating*/
-	.uncertainty_margin	= TICK_NSEC,
-	.read			= wdtest_jiffies_read,
-	.mask			= CLOCKSOURCE_MASK(32),
-	.flags			= CLOCK_SOURCE_MUST_VERIFY,
-	.mult			= TICK_NSEC << JIFFIES_SHIFT, /* details above */
-	.shift			= JIFFIES_SHIFT,
-	.max_cycles		= 10,
-};
+	if (wdtest_state < WDTEST_INJECT_PERCPU)
+		return wdtest_test_count & 0x1 ? 0 : wdtest_offset >> SHIFT_4000PPM;
 
-static int wdtest_ktime_read_ndelays;
-static bool wdtest_ktime_read_fuzz;
+	/* Only affect the readout of the "remote" CPU */
+	return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC;
+}
 
 static u64 wdtest_ktime_read(struct clocksource *cs)
 {
-	int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
-	static int sign = 1;
-	u64 ret;
+	ktime_t now = ktime_get_raw_fast_ns();
+	ktime_t intv = now - wdtest_last_ts;
 
-	if (wkrn) {
-		udelay(cs->uncertainty_margin / 250);
-		WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
-	}
-	ret = ktime_get_real_fast_ns();
-	if (READ_ONCE(wdtest_ktime_read_fuzz)) {
-		sign = -sign;
-		ret = ret + sign * 100 * NSEC_PER_MSEC;
+	/*
+	 * Only increment the test counter once per watchdog interval and
+	 * store the interval for the offset calculation of this step. This
+	 * guarantees a consistent behaviour even if the other side needs
+	 * to repeat due to a watchdog read timeout.
+	 */
+	if (intv > (NSEC_PER_SEC / 4)) {
+		WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1);
+		wdtest_last_ts = now;
+		wdtest_offset = intv;
 	}
-	return ret;
-}
 
-static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
-{
-	pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+	switch (wdtest_state & ~WDTEST_INJECT_PERCPU) {
+	case WDTEST_INJECT_POSITIVE:
+		return now + wdtest_get_offset(cs);
+	case WDTEST_INJECT_NEGATIVE:
+		return now - wdtest_get_offset(cs);
+	case WDTEST_INJECT_DELAY:
+		udelay(500);
+		return now;
+	default:
+		return now;
+	}
 }
 
-#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
-		     CLOCK_SOURCE_VALID_FOR_HRES | \
-		     CLOCK_SOURCE_MUST_VERIFY | \
-		     CLOCK_SOURCE_VERIFY_PERCPU)
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS |	\
+		     CLOCK_SOURCE_CALIBRATED |		\
+		     CLOCK_SOURCE_MUST_VERIFY |		\
+		     CLOCK_SOURCE_WDTEST)
 
 static struct clocksource clocksource_wdtest_ktime = {
 	.name			= "wdtest-ktime",
-	.rating			= 300,
+	.rating			= 10,
 	.read			= wdtest_ktime_read,
 	.mask			= CLOCKSOURCE_MASK(64),
 	.flags			= KTIME_FLAGS,
-	.mark_unstable		= wdtest_ktime_cs_mark_unstable,
 	.list			= LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
 };
 
-/* Reset the clocksource if needed. */
-static void wdtest_ktime_clocksource_reset(void)
+static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu)
+{
+	clocksource_unregister(&clocksource_wdtest_ktime);
+
+	pr_info("Test: State %d percpu %d\n", which, percpu);
+
+	wdtest_state = which;
+	if (percpu)
+		wdtest_state |= WDTEST_INJECT_PERCPU;
+	wdtest_test_count = 0;
+	wdtest_last_ts = 0;
+
+	clocksource_wdtest_ktime.rating = 10;
+	clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+	if (percpu)
+		clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU;
+	clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+}
+
+static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect,
+			   unsigned long calls)
 {
-	if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
-		clocksource_unregister(&clocksource_wdtest_ktime);
-		clocksource_wdtest_ktime.flags = KTIME_FLAGS;
-		schedule_timeout_uninterruptible(HZ / 10);
-		clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+	wdtest_clocksource_reset(which, percpu);
+
+	for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) {
+		unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags);
+
+		if (kthread_should_stop())
+			return false;
+
+		if (flags & CLOCK_SOURCE_UNSTABLE) {
+			if (expect & CLOCK_SOURCE_UNSTABLE)
+				return true;
+			pr_warn("Fail: Unexpected unstable\n");
+			return false;
+		}
+		if (flags & CLOCK_SOURCE_VALID_FOR_HRES) {
+			if (expect & CLOCK_SOURCE_VALID_FOR_HRES)
+				return true;
+			pr_warn("Fail: Unexpected valid for highres\n");
+			return false;
+		}
 	}
+
+	if (!expect)
+		return true;
+
+	pr_warn("Fail: Timed out\n");
+	return false;
 }
 
-/* Run the specified series of watchdog tests. */
-static int wdtest_func(void *arg)
+static bool wdtest_run(bool percpu)
 {
-	unsigned long j1, j2;
-	int i, max_retries;
-	char *s;
+	if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8))
+		return false;
 
-	schedule_timeout_uninterruptible(holdoff * HZ);
+	if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4))
+		return false;
 
-	/*
-	 * Verify that jiffies-like clocksources get the manually
-	 * specified uncertainty margin.
-	 */
-	pr_info("--- Verify jiffies-like uncertainty margin.\n");
-	__clocksource_register(&clocksource_wdtest_jiffies);
-	WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+	if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+		return false;
 
-	j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
-	schedule_timeout_uninterruptible(HZ);
-	j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
-	WARN_ON_ONCE(j1 == j2);
+	if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+		return false;
 
-	clocksource_unregister(&clocksource_wdtest_jiffies);
+	return true;
+}
 
-	/*
-	 * Verify that tsc-like clocksources are assigned a reasonable
-	 * uncertainty margin.
-	 */
-	pr_info("--- Verify tsc-like uncertainty margin.\n");
+static int wdtest_func(void *arg)
+{
 	clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
-	WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
-
-	j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
-	udelay(1);
-	j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
-	pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
-	WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC),
-		  "Expected at least 1000ns, got %lu.\n", j2 - j1);
-
-	/* Verify tsc-like stability with various numbers of errors injected. */
-	max_retries = clocksource_get_max_watchdog_retry();
-	for (i = 0; i <= max_retries + 1; i++) {
-		if (i <= 1 && i < max_retries)
-			s = "";
-		else if (i <= max_retries)
-			s = ", expect message";
-		else
-			s = ", expect clock skew";
-		pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
-		WRITE_ONCE(wdtest_ktime_read_ndelays, i);
-		schedule_timeout_uninterruptible(2 * HZ);
-		WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
-		WARN_ON_ONCE((i <= max_retries) !=
-			     !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
-		wdtest_ktime_clocksource_reset();
+	if (wdtest_run(false)) {
+		if (wdtest_run(true))
+			pr_info("Success: All tests passed\n");
 	}
-
-	/* Verify tsc-like stability with clock-value-fuzz error injection. */
-	pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
-	WRITE_ONCE(wdtest_ktime_read_fuzz, true);
-	schedule_timeout_uninterruptible(2 * HZ);
-	WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
-	clocksource_verify_percpu(&clocksource_wdtest_ktime);
-	WRITE_ONCE(wdtest_ktime_read_fuzz, false);
-
 	clocksource_unregister(&clocksource_wdtest_ktime);
 
-	pr_info("--- Done with test.\n");
-	return 0;
-}
+	if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG))
+		return 0;
 
-static void wdtest_print_module_parms(void)
-{
-	pr_alert("--- holdoff=%d\n", holdoff);
+	while (!kthread_should_stop())
+		schedule_timeout_interruptible(3600 * HZ);
+	return 0;
 }
 
-/* Cleanup function. */
-static void clocksource_wdtest_cleanup(void)
-{
-}
+static struct task_struct *wdtest_thread;
 
 static int __init clocksource_wdtest_init(void)
 {
-	int ret = 0;
-
-	wdtest_print_module_parms();
+	struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest");
 
-	/* Create watchdog-test task. */
-	wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
-	if (IS_ERR(wdtest_task)) {
-		ret = PTR_ERR(wdtest_task);
-		pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
-		wdtest_task = NULL;
-		return ret;
+	if (IS_ERR(t)) {
+		pr_warn("Failed to create wdtest kthread.\n");
+		return PTR_ERR(t);
 	}
-
+	wdtest_thread = t;
 	return 0;
 }
-
 module_init(clocksource_wdtest_init);
+
+static void clocksource_wdtest_cleanup(void)
+{
+	if (wdtest_thread)
+		kthread_stop(wdtest_thread);
+}
 module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index df7194961658..baee13a1f87f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -7,15 +7,17 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/device.h>
 #include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
 #include <linux/init.h>
-#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
 #include <linux/kthread.h>
+#include <linux/module.h>
 #include <linux/prandom.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
 
 #include "tick-internal.h"
 #include "timekeeping_internal.h"
@@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN];
 static int finished_booting;
 static u64 suspend_start;
 
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
-
-/*
- * Threshold: 0.0312s, when doubled: 0.0625s.
- */
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
-
-/*
- * Maximum permissible delay between two readouts of the watchdog
- * clocksource surrounding a read of the clocksource being validated.
- * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
- * a lower bound for cs->uncertainty_margin values when registering clocks.
- *
- * The default of 500 parts per million is based on NTP's limits.
- * If a clocksource is good enough for NTP, it is good enough for us!
- *
- * In other words, by default, even if a clocksource is extremely
- * precise (for example, with a sub-nanosecond period), the maximum
- * permissible skew between the clocksource watchdog and the clocksource
- * under test is not permitted to go below the 500ppm minimum defined
- * by MAX_SKEW_USEC.  This 500ppm minimum may be overridden using the
- * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
- */
-#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#define MAX_SKEW_USEC	CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#else
-#define MAX_SKEW_USEC	(125 * WATCHDOG_INTERVAL / HZ)
-#endif
-
-/*
- * Default for maximum permissible skew when cs->uncertainty_margin is
- * not specified, and the lower bound even when cs->uncertainty_margin
- * is specified.  This is also the default that is used when registering
- * clocks with unspecified cs->uncertainty_margin, so this macro is used
- * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
- */
-#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
-
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
 static void clocksource_select(void);
@@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
 static atomic_t watchdog_reset_pending;
-static int64_t watchdog_max_interval;
+
+/* Watchdog interval: 0.5sec. */
+#define WATCHDOG_INTERVAL		(HZ >> 1)
+#define WATCHDOG_INTERVAL_NS		(WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ))
+
+/* Maximum time between two reference watchdog readouts */
+#define WATCHDOG_READOUT_MAX_NS		(50U * NSEC_PER_USEC)
+
+/*
+ * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems
+ * the timeout is calculated from the numa distance.
+ */
+#define WATCHDOG_DEFAULT_TIMEOUT_NS	(50U * NSEC_PER_USEC)
+
+/*
+ * Remote timeout NUMA distance multiplier. The local distance is 10. The
+ * default remote distance is 20. ACPI tables provide more accurate numbers
+ * which are guaranteed to be greater than the local distance.
+ *
+ * This results in a 5us base value, which is equivalent to the above !NUMA
+ * default.
+ */
+#define WATCHDOG_NUMA_MULTIPLIER_NS	((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE))
+
+/* Limit the NUMA timeout in case the distance values are insanely big */
+#define WATCHDOG_NUMA_MAX_TIMEOUT_NS	((u64)(500U * NSEC_PER_USEC))
+
+/* Shift values to calculate the approximate $N ppm of a given delta. */
+#define SHIFT_500PPM			11
+#define SHIFT_4000PPM			8
+
+/* Number of attempts to read the watchdog */
+#define WATCHDOG_FREQ_RETRIES		3
+
+/* Five reads local and remote for inter CPU skew detection */
+#define WATCHDOG_REMOTE_MAX_SEQ		10
 
 static inline void clocksource_watchdog_lock(unsigned long *flags)
 {
@@ -241,204 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static int verify_n_cpus = 8;
-module_param(verify_n_cpus, int, 0644);
+static inline void clocksource_reset_watchdog(void)
+{
+	struct clocksource *cs;
 
-enum wd_read_status {
-	WD_READ_SUCCESS,
-	WD_READ_UNSTABLE,
-	WD_READ_SKIP
+	list_for_each_entry(cs, &watchdog_list, wd_list)
+		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+enum wd_result {
+	WD_SUCCESS,
+	WD_FREQ_NO_WATCHDOG,
+	WD_FREQ_TIMEOUT,
+	WD_FREQ_RESET,
+	WD_FREQ_SKEWED,
+	WD_CPU_TIMEOUT,
+	WD_CPU_SKEWED,
+};
+
+struct watchdog_cpu_data {
+	/* Keep first as it is 32 byte aligned */
+	call_single_data_t	csd;
+	atomic_t		remote_inprogress;
+	enum wd_result		result;
+	u64			cpu_ts[2];
+	struct clocksource	*cs;
+	/* Ensure that the sequence is in a separate cache line */
+	atomic_t		seq ____cacheline_aligned;
+	/* Set by the control CPU according to NUMA distance */
+	u64			timeout_ns;
 };
 
-static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
-{
-	int64_t md = watchdog->uncertainty_margin;
-	unsigned int nretries, max_retries;
-	int64_t wd_delay, wd_seq_delay;
-	u64 wd_end, wd_end2;
-
-	max_retries = clocksource_get_max_watchdog_retry();
-	for (nretries = 0; nretries <= max_retries; nretries++) {
-		local_irq_disable();
-		*wdnow = watchdog->read(watchdog);
-		*csnow = cs->read(cs);
-		wd_end = watchdog->read(watchdog);
-		wd_end2 = watchdog->read(watchdog);
-		local_irq_enable();
-
-		wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
-		if (wd_delay <= md + cs->uncertainty_margin) {
-			if (nretries > 1 && nretries >= max_retries) {
-				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
-					smp_processor_id(), watchdog->name, nretries);
+struct watchdog_data {
+	raw_spinlock_t	lock;
+	enum wd_result	result;
+
+	u64		wd_seq;
+	u64		wd_delta;
+	u64		cs_delta;
+	u64		cpu_ts[2];
+
+	unsigned int	curr_cpu;
+} ____cacheline_aligned_in_smp;
+
+static void watchdog_check_skew_remote(void *unused);
+
+static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = {
+	.csd	= CSD_INIT(watchdog_check_skew_remote, NULL),
+};
+
+static struct watchdog_data watchdog_data = {
+	.lock	= __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock),
+};
+
+static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result)
+{
+	guard(raw_spinlock)(&watchdog_data.lock);
+	if (!wd->result) {
+		atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ);
+		WRITE_ONCE(wd->result, result);
+	}
+}
+
+/* Wait for the sequence number to hand over control. */
+static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq)
+{
+	for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) {
+		/* Bail if the other side set an error result */
+		if (READ_ONCE(wd->result) != WD_SUCCESS)
+			return false;
+
+		/* Prevent endless loops if the other CPU does not react. */
+		if (cnt == 5000) {
+			u64 nsecs = ktime_get_raw_fast_ns();
+
+			if (nsecs - start >=wd->timeout_ns) {
+				watchdog_set_result(wd, WD_CPU_TIMEOUT);
+				return false;
 			}
-			return WD_READ_SUCCESS;
+			cnt = 0;
 		}
+		cpu_relax();
+	}
+	return seq < WATCHDOG_REMOTE_MAX_SEQ;
+}
 
-		/*
-		 * Now compute delay in consecutive watchdog read to see if
-		 * there is too much external interferences that cause
-		 * significant delay in reading both clocksource and watchdog.
-		 *
-		 * If consecutive WD read-back delay > md, report
-		 * system busy, reinit the watchdog and skip the current
-		 * watchdog test.
-		 */
-		wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
-		if (wd_seq_delay > md)
-			goto skip_test;
+static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index)
+{
+	u64 prev, now, delta, start = ktime_get_raw_fast_ns();
+	int local = index, remote = (index + 1) & 0x1;
+	struct clocksource *cs = wd->cs;
+
+	/* Set the local timestamp so that the first iteration works correctly */
+	wd->cpu_ts[local] = cs->read(cs);
+
+	/* Signal arrival */
+	atomic_inc(&wd->seq);
+
+	for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) {
+		if (!watchdog_wait_seq(wd, start, seq))
+			return;
+
+		/* Capture local timestamp before possible non-local coherency overhead */
+		now = cs->read(cs);
+
+		/* Store local timestamp before reading remote to limit coherency stalls */
+		wd->cpu_ts[local] = now;
+
+		prev = wd->cpu_ts[remote];
+		delta = (now - prev) & cs->mask;
+
+		if (delta > cs->max_raw_delta) {
+			watchdog_set_result(wd, WD_CPU_SKEWED);
+			return;
+		}
+
+		/* Hand over to the remote CPU */
+		atomic_inc(&wd->seq);
 	}
+}
 
-	pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
-		smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
-	return WD_READ_UNSTABLE;
+static void watchdog_check_skew_remote(void *unused)
+{
+	struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data);
 
-skip_test:
-	pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
-		smp_processor_id(), watchdog->name, wd_seq_delay);
-	pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
-		cs->name, wd_delay);
-	return WD_READ_SKIP;
+	atomic_inc(&wd->remote_inprogress);
+	watchdog_check_skew(wd, 1);
+	atomic_dec(&wd->remote_inprogress);
 }
 
-static u64 csnow_mid;
-static cpumask_t cpus_ahead;
-static cpumask_t cpus_behind;
-static cpumask_t cpus_chosen;
+static inline bool wd_csd_locked(struct watchdog_cpu_data *wd)
+{
+	return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK;
+}
+
+/*
+ * This is only invoked for remote CPUs. See watchdog_check_cpu_skew().
+ */
+static inline u64 wd_get_remote_timeout(unsigned int remote_cpu)
+{
+	unsigned int n1, n2;
+	u64 ns;
+
+	if (nr_node_ids == 1)
+		return WATCHDOG_DEFAULT_TIMEOUT_NS;
+
+	n1 = cpu_to_node(smp_processor_id());
+	n2 = cpu_to_node(remote_cpu);
+	ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2);
+	return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS);
+}
 
-static void clocksource_verify_choose_cpus(void)
+static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu)
 {
-	int cpu, i, n = verify_n_cpus;
+	struct watchdog_cpu_data *wd;
 
-	if (n < 0 || n >= num_online_cpus()) {
-		/* Check all of the CPUs. */
-		cpumask_copy(&cpus_chosen, cpu_online_mask);
-		cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+	wd = per_cpu_ptr(&watchdog_cpu_data, cpu);
+	if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) {
+		watchdog_data.result = WD_CPU_TIMEOUT;
 		return;
 	}
 
-	/* If no checking desired, or no other CPU to check, leave. */
-	cpumask_clear(&cpus_chosen);
-	if (n == 0 || num_online_cpus() <= 1)
+	atomic_set(&wd->seq, 0);
+	wd->result = WD_SUCCESS;
+	wd->cs = cs;
+	/* Store the current CPU ID for the watchdog test unit */
+	cs->wd_cpu = smp_processor_id();
+
+	wd->timeout_ns = wd_get_remote_timeout(cpu);
+
+	/* Kick the remote CPU into the watchdog function */
+	if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) {
+		watchdog_data.result = WD_CPU_TIMEOUT;
+		return;
+	}
+
+	scoped_guard(irq)
+		watchdog_check_skew(wd, 0);
+
+	scoped_guard(raw_spinlock_irq, &watchdog_data.lock) {
+		watchdog_data.result = wd->result;
+		memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts));
+	}
+}
+
+static void watchdog_check_cpu_skew(struct clocksource *cs)
+{
+	unsigned int cpu = watchdog_data.curr_cpu;
+
+	cpu = cpumask_next_wrap(cpu, cpu_online_mask);
+	watchdog_data.curr_cpu = cpu;
+
+	/* Skip the current CPU. Handles num_online_cpus() == 1 as well */
+	if (cpu == smp_processor_id())
 		return;
 
-	/* Make sure to select at least one CPU other than the current CPU. */
-	cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
-	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+	/* Don't interfere with the test mechanics */
+	if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU))
 		return;
-	cpumask_set_cpu(cpu, &cpus_chosen);
 
-	/* Force a sane value for the boot parameter. */
-	if (n > nr_cpu_ids)
-		n = nr_cpu_ids;
+	__watchdog_check_cpu_skew(cs, cpu);
+}
+
+static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending)
+{
+	unsigned int ppm_shift = SHIFT_4000PPM;
+	u64 wd_ts0, wd_ts1, cs_ts;
+
+	watchdog_data.result = WD_SUCCESS;
+	if (!watchdog) {
+		watchdog_data.result = WD_FREQ_NO_WATCHDOG;
+		return false;
+	}
+
+	if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)
+		return true;
 
 	/*
-	 * Randomly select the specified number of CPUs.  If the same
-	 * CPU is selected multiple times, that CPU is checked only once,
-	 * and no replacement CPU is selected.  This gracefully handles
-	 * situations where verify_n_cpus is greater than the number of
-	 * CPUs that are currently online.
+	 * If both the clocksource and the watchdog claim they are
+	 * calibrated use 500ppm limit. Uncalibrated clocksources need a
+	 * larger allowance because thefirmware supplied frequencies can be
+	 * way off.
 	 */
-	for (i = 1; i < n; i++) {
-		cpu = cpumask_random(cpu_online_mask);
-		if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
-			cpumask_set_cpu(cpu, &cpus_chosen);
+	if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED)
+		ppm_shift = SHIFT_500PPM;
+
+	for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) {
+		s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta;
+
+		scoped_guard(irq) {
+			wd_ts0 = watchdog->read(watchdog);
+			cs_ts = cs->read(cs);
+			wd_ts1 = watchdog->read(watchdog);
+		}
+
+		wd_last = cs->wd_last;
+		cs_last = cs->cs_last;
+
+		/* Validate the watchdog readout window */
+		wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1);
+		if (wd_seq > WATCHDOG_READOUT_MAX_NS) {
+			/* Store for printout in case all retries fail */
+			watchdog_data.wd_seq = wd_seq;
+			continue;
+		}
+
+		/* Store for subsequent processing */
+		cs->wd_last = wd_ts0;
+		cs->cs_last = cs_ts;
+
+		/* First round or reset pending? */
+		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending)
+			goto reset;
+
+		/* Calculate the nanosecond deltas from the last invocation */
+		wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0);
+		cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts);
+
+		watchdog_data.wd_delta = wd_delta;
+		watchdog_data.cs_delta = cs_delta;
+
+		/*
+		 * Ensure that the deltas are within the readout limits of
+		 * the clocksource and the watchdog. Long delays can cause
+		 * clocksources to overflow.
+		 */
+		max_delta = max(wd_delta, cs_delta);
+		if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns)
+			goto reset;
+
+		/*
+		 * Calculate and validate the skew against the allowed PPM
+		 * value of the maximum delta plus the watchdog readout
+		 * time.
+		 */
+		if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq)
+			return true;
+
+		watchdog_data.result = WD_FREQ_SKEWED;
+		return false;
 	}
 
-	/* Don't verify ourselves. */
-	cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+	watchdog_data.result = WD_FREQ_TIMEOUT;
+	return false;
+
+reset:
+	cs->flags |= CLOCK_SOURCE_WATCHDOG;
+	watchdog_data.result = WD_FREQ_RESET;
+	return false;
 }
 
-static void clocksource_verify_one_cpu(void *csin)
+/* Synchronization for sched clock */
+static void clocksource_tick_stable(struct clocksource *cs)
 {
-	struct clocksource *cs = (struct clocksource *)csin;
-
-	csnow_mid = cs->read(cs);
+	if (cs == curr_clocksource && cs->tick_stable)
+		cs->tick_stable(cs);
 }
 
-void clocksource_verify_percpu(struct clocksource *cs)
+/* Conditionaly enable high resolution mode */
+static void clocksource_enable_highres(struct clocksource *cs)
 {
-	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
-	u64 csnow_begin, csnow_end;
-	int cpu, testcpu;
-	s64 delta;
+	if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) ||
+	    !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) ||
+	    !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+		return;
+
+	/* Mark it valid for high-res. */
+	cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 
-	if (verify_n_cpus == 0)
+	/*
+	 * Can't schedule work before finished_booting is
+	 * true. clocksource_done_booting will take care of it.
+	 */
+	if (!finished_booting)
 		return;
-	cpumask_clear(&cpus_ahead);
-	cpumask_clear(&cpus_behind);
-	cpus_read_lock();
-	migrate_disable();
-	clocksource_verify_choose_cpus();
-	if (cpumask_empty(&cpus_chosen)) {
-		migrate_enable();
-		cpus_read_unlock();
-		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+
+	if (cs->flags & CLOCK_SOURCE_WDTEST)
 		return;
+
+	/*
+	 * If this is not the current clocksource let the watchdog thread
+	 * reselect it. Due to the change to high res this clocksource
+	 * might be preferred now. If it is the current clocksource let the
+	 * tick code know about that change.
+	 */
+	if (cs != curr_clocksource) {
+		cs->flags |= CLOCK_SOURCE_RESELECT;
+		schedule_work(&watchdog_work);
+	} else {
+		tick_clock_notify();
 	}
-	testcpu = smp_processor_id();
-	pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
-		cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
-	preempt_disable();
-	for_each_cpu(cpu, &cpus_chosen) {
-		if (cpu == testcpu)
-			continue;
-		csnow_begin = cs->read(cs);
-		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
-		csnow_end = cs->read(cs);
-		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
-		if (delta < 0)
-			cpumask_set_cpu(cpu, &cpus_behind);
-		delta = (csnow_end - csnow_mid) & cs->mask;
-		if (delta < 0)
-			cpumask_set_cpu(cpu, &cpus_ahead);
-		cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
-		if (cs_nsec > cs_nsec_max)
-			cs_nsec_max = cs_nsec;
-		if (cs_nsec < cs_nsec_min)
-			cs_nsec_min = cs_nsec;
+}
+
+static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2);
+
+static void watchdog_print_freq_timeout(struct clocksource *cs)
+{
+	if (!__ratelimit(&ratelimit_state))
+		return;
+	pr_info("Watchdog %s read timed out. Readout sequence took: %lluns\n",
+		watchdog->name, watchdog_data.wd_seq);
+}
+
+static void watchdog_print_freq_skew(struct clocksource *cs)
+{
+	pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name);
+	pr_warn("Watchdog    %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta);
+	pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta);
+}
+
+static void watchdog_handle_remote_timeout(struct clocksource *cs)
+{
+	pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu);
+}
+
+static void watchdog_print_remote_skew(struct clocksource *cs)
+{
+	pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name);
+	if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) {
+		pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(),
+			watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]);
+	} else {
+		pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu,
+			watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]);
 	}
-	preempt_enable();
-	migrate_enable();
-	cpus_read_unlock();
-	if (!cpumask_empty(&cpus_ahead))
-		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
-			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
-	if (!cpumask_empty(&cpus_behind))
-		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
-			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
-	pr_info("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
-		testcpu, cs_nsec_min, cs_nsec_max, cs->name);
-}
-EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+}
 
-static inline void clocksource_reset_watchdog(void)
+static void watchdog_check_result(struct clocksource *cs)
 {
-	struct clocksource *cs;
+	switch (watchdog_data.result) {
+	case WD_SUCCESS:
+		clocksource_tick_stable(cs);
+		clocksource_enable_highres(cs);
+		return;
 
-	list_for_each_entry(cs, &watchdog_list, wd_list)
+	case WD_FREQ_TIMEOUT:
+		watchdog_print_freq_timeout(cs);
+		/* Try again later and invalidate the reference timestamps. */
 		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
+		return;
 
+	case WD_FREQ_NO_WATCHDOG:
+	case WD_FREQ_RESET:
+		/*
+		 * Nothing to do when the reference timestamps were reset
+		 * or no watchdog clocksource registered.
+		 */
+		return;
+
+	case WD_FREQ_SKEWED:
+		watchdog_print_freq_skew(cs);
+		break;
+
+	case WD_CPU_TIMEOUT:
+		/* Remote check timed out. Try again next cycle. */
+		watchdog_handle_remote_timeout(cs);
+		return;
+
+	case WD_CPU_SKEWED:
+		watchdog_print_remote_skew(cs);
+		break;
+	}
+	__clocksource_unstable(cs);
+}
 
 static void clocksource_watchdog(struct timer_list *unused)
 {
-	int64_t wd_nsec, cs_nsec, interval;
-	u64 csnow, wdnow, cslast, wdlast;
-	int next_cpu, reset_pending;
 	struct clocksource *cs;
-	enum wd_read_status read_ret;
-	unsigned long extra_wait = 0;
-	u32 md;
+	bool reset_pending;
 
-	spin_lock(&watchdog_lock);
+	guard(spinlock)(&watchdog_lock);
 	if (!watchdog_running)
-		goto out;
+		return;
 
 	reset_pending = atomic_read(&watchdog_reset_pending);
 
 	list_for_each_entry(cs, &watchdog_list, wd_list) {
-
 		/* Clocksource already marked unstable? */
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
 			if (finished_booting)
@@ -446,170 +659,40 @@ static void clocksource_watchdog(struct timer_list *unused)
 			continue;
 		}
 
-		read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
-
-		if (read_ret == WD_READ_UNSTABLE) {
-			/* Clock readout unreliable, so give it up. */
-			__clocksource_unstable(cs);
-			continue;
-		}
-
-		/*
-		 * When WD_READ_SKIP is returned, it means the system is likely
-		 * under very heavy load, where the latency of reading
-		 * watchdog/clocksource is very big, and affect the accuracy of
-		 * watchdog check. So give system some space and suspend the
-		 * watchdog check for 5 minutes.
-		 */
-		if (read_ret == WD_READ_SKIP) {
-			/*
-			 * As the watchdog timer will be suspended, and
-			 * cs->last could keep unchanged for 5 minutes, reset
-			 * the counters.
-			 */
-			clocksource_reset_watchdog();
-			extra_wait = HZ * 300;
-			break;
-		}
-
-		/* Clocksource initialized ? */
-		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
-		    atomic_read(&watchdog_reset_pending)) {
-			cs->flags |= CLOCK_SOURCE_WATCHDOG;
-			cs->wd_last = wdnow;
-			cs->cs_last = csnow;
-			continue;
+		/* Compare against watchdog clocksource if available */
+		if (watchdog_check_freq(cs, reset_pending)) {
+			/* Check for inter CPU skew */
+			watchdog_check_cpu_skew(cs);
 		}
 
-		wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
-		cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
-		wdlast = cs->wd_last; /* save these in case we print them */
-		cslast = cs->cs_last;
-		cs->cs_last = csnow;
-		cs->wd_last = wdnow;
-
-		if (atomic_read(&watchdog_reset_pending))
-			continue;
-
-		/*
-		 * The processing of timer softirqs can get delayed (usually
-		 * on account of ksoftirqd not getting to run in a timely
-		 * manner), which causes the watchdog interval to stretch.
-		 * Skew detection may fail for longer watchdog intervals
-		 * on account of fixed margins being used.
-		 * Some clocksources, e.g. acpi_pm, cannot tolerate
-		 * watchdog intervals longer than a few seconds.
-		 */
-		interval = max(cs_nsec, wd_nsec);
-		if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
-			if (system_state > SYSTEM_SCHEDULING &&
-			    interval > 2 * watchdog_max_interval) {
-				watchdog_max_interval = interval;
-				pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
-					cs_nsec, wd_nsec);
-			}
-			watchdog_timer.expires = jiffies;
-			continue;
-		}
-
-		/* Check the deviation from the watchdog clocksource. */
-		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
-		if (abs(cs_nsec - wd_nsec) > md) {
-			s64 cs_wd_msec;
-			s64 wd_msec;
-			u32 wd_rem;
-
-			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
-				smp_processor_id(), cs->name);
-			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
-				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
-			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
-				cs->name, cs_nsec, csnow, cslast, cs->mask);
-			cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
-			wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
-			pr_warn("                      Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
-				cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
-			if (curr_clocksource == cs)
-				pr_warn("                      '%s' is current clocksource.\n", cs->name);
-			else if (curr_clocksource)
-				pr_warn("                      '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
-			else
-				pr_warn("                      No current clocksource.\n");
-			__clocksource_unstable(cs);
-			continue;
-		}
-
-		if (cs == curr_clocksource && cs->tick_stable)
-			cs->tick_stable(cs);
-
-		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
-		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
-		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
-			/* Mark it valid for high-res. */
-			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-
-			/*
-			 * clocksource_done_booting() will sort it if
-			 * finished_booting is not set yet.
-			 */
-			if (!finished_booting)
-				continue;
-
-			/*
-			 * If this is not the current clocksource let
-			 * the watchdog thread reselect it. Due to the
-			 * change to high res this clocksource might
-			 * be preferred now. If it is the current
-			 * clocksource let the tick code know about
-			 * that change.
-			 */
-			if (cs != curr_clocksource) {
-				cs->flags |= CLOCK_SOURCE_RESELECT;
-				schedule_work(&watchdog_work);
-			} else {
-				tick_clock_notify();
-			}
-		}
+		watchdog_check_result(cs);
 	}
 
-	/*
-	 * We only clear the watchdog_reset_pending, when we did a
-	 * full cycle through all clocksources.
-	 */
+	/* Clear after the full clocksource walk */
 	if (reset_pending)
 		atomic_dec(&watchdog_reset_pending);
 
-	/*
-	 * Cycle through CPUs to check if the CPUs stay synchronized
-	 * to each other.
-	 */
-	next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
-
-	/*
-	 * Arm timer if not already pending: could race with concurrent
-	 * pair clocksource_stop_watchdog() clocksource_start_watchdog().
-	 */
+	/* Could have been rearmed by a stop/start cycle */
 	if (!timer_pending(&watchdog_timer)) {
-		watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
-		add_timer_on(&watchdog_timer, next_cpu);
+		watchdog_timer.expires += WATCHDOG_INTERVAL;
+		add_timer_local(&watchdog_timer);
 	}
-out:
-	spin_unlock(&watchdog_lock);
 }
 
 static inline void clocksource_start_watchdog(void)
 {
-	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+	if (watchdog_running || list_empty(&watchdog_list))
 		return;
-	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
+	timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+
+	add_timer_on(&watchdog_timer, get_boot_cpu_id());
 	watchdog_running = 1;
 }
 
 static inline void clocksource_stop_watchdog(void)
 {
-	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+	if (!watchdog_running || !list_empty(&watchdog_list))
 		return;
 	timer_delete(&watchdog_timer);
 	watchdog_running = 0;
@@ -651,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback)
 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
 			continue;
 
+		/*
+		 * If it's not continuous, don't put the fox in charge of
+		 * the henhouse.
+		 */
+		if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+			continue;
+
 		/* Skip current if we were requested for a fallback. */
 		if (fallback && cs == old_wd)
 			continue;
@@ -690,12 +780,6 @@ static int __clocksource_watchdog_kthread(void)
 	unsigned long flags;
 	int select = 0;
 
-	/* Do any required per-CPU skew verification. */
-	if (curr_clocksource &&
-	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
-	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
-		clocksource_verify_percpu(curr_clocksource);
-
 	spin_lock_irqsave(&watchdog_lock, flags);
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -1016,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
 			continue;
 		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
 			continue;
+		if (cs->flags & CLOCK_SOURCE_WDTEST)
+			continue;
 		return cs;
 	}
 	return NULL;
@@ -1040,6 +1126,8 @@ static void __clocksource_select(bool skipcur)
 			continue;
 		if (strcmp(cs->name, override_name) != 0)
 			continue;
+		if (cs->flags & CLOCK_SOURCE_WDTEST)
+			continue;
 		/*
 		 * Check to make sure we don't switch to a non-highres
 		 * capable clocksource if the tick code is in oneshot
@@ -1169,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 
 		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
 				       NSEC_PER_SEC / scale, sec * scale);
-	}
 
-	/*
-	 * If the uncertainty margin is not specified, calculate it.  If
-	 * both scale and freq are non-zero, calculate the clock period, but
-	 * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
-	 * However, if either of scale or freq is zero, be very conservative
-	 * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
-	 * for the uncertainty margin.  Allow stupidly small uncertainty
-	 * margins to be specified by the caller for testing purposes,
-	 * but warn to discourage production use of this capability.
-	 *
-	 * Bottom line:  The sum of the uncertainty margins of the
-	 * watchdog clocksource and the clocksource under test will be at
-	 * least 500ppm by default.  For more information, please see the
-	 * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
-	 */
-	if (scale && freq && !cs->uncertainty_margin) {
-		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
-		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
-			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
-	} else if (!cs->uncertainty_margin) {
-		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+		/* Update cs::freq_khz */
+		cs->freq_khz = div_u64((u64)freq * scale, 1000);
 	}
-	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
 
 	/*
 	 * Ensure clocksources that have large 'mult' values don't overflow
@@ -1241,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 
 	if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
 		cs->id = CSID_GENERIC;
+
+	if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+		cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT;
+
 	if (cs->vdso_clock_mode < 0 ||
 	    cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
 		pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 860af7a58428..5bd6efe598f0 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -50,6 +50,28 @@
 #include "tick-internal.h"
 
 /*
+ * Constants to set the queued state of the timer (INACTIVE, ENQUEUED)
+ *
+ * The callback state is kept separate in the CPU base because having it in
+ * the timer would required touching the timer after the callback, which
+ * makes it impossible to free the timer from the callback function.
+ *
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
+ * queued a signal. Between dropping the lock which protects the posix timer
+ * and reacquiring the base lock of the hrtimer, another CPU can deliver the
+ * signal and rearm the timer.
+ *
+ * All state transitions are protected by cpu_base->lock.
+ */
+#define HRTIMER_STATE_INACTIVE	false
+#define HRTIMER_STATE_ENQUEUED	true
+
+/*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
  * idea of the (in)accuracy of timers. Timer values are rounded up to
@@ -77,43 +99,22 @@ static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
  * to reach a base using a clockid, hrtimer_clockid_to_base()
  * is used to convert from clockid to the proper hrtimer_base_type.
  */
+
+#define BASE_INIT(idx, cid)			\
+	[idx] = { .index = idx, .clockid = cid }
+
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
-	.clock_base =
-	{
-		{
-			.index = HRTIMER_BASE_MONOTONIC,
-			.clockid = CLOCK_MONOTONIC,
-		},
-		{
-			.index = HRTIMER_BASE_REALTIME,
-			.clockid = CLOCK_REALTIME,
-		},
-		{
-			.index = HRTIMER_BASE_BOOTTIME,
-			.clockid = CLOCK_BOOTTIME,
-		},
-		{
-			.index = HRTIMER_BASE_TAI,
-			.clockid = CLOCK_TAI,
-		},
-		{
-			.index = HRTIMER_BASE_MONOTONIC_SOFT,
-			.clockid = CLOCK_MONOTONIC,
-		},
-		{
-			.index = HRTIMER_BASE_REALTIME_SOFT,
-			.clockid = CLOCK_REALTIME,
-		},
-		{
-			.index = HRTIMER_BASE_BOOTTIME_SOFT,
-			.clockid = CLOCK_BOOTTIME,
-		},
-		{
-			.index = HRTIMER_BASE_TAI_SOFT,
-			.clockid = CLOCK_TAI,
-		},
+	.clock_base = {
+		BASE_INIT(HRTIMER_BASE_MONOTONIC,	CLOCK_MONOTONIC),
+		BASE_INIT(HRTIMER_BASE_REALTIME,	CLOCK_REALTIME),
+		BASE_INIT(HRTIMER_BASE_BOOTTIME,	CLOCK_BOOTTIME),
+		BASE_INIT(HRTIMER_BASE_TAI,		CLOCK_TAI),
+		BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT,	CLOCK_MONOTONIC),
+		BASE_INIT(HRTIMER_BASE_REALTIME_SOFT,	CLOCK_REALTIME),
+		BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT,	CLOCK_BOOTTIME),
+		BASE_INIT(HRTIMER_BASE_TAI_SOFT,	CLOCK_TAI),
 	},
 	.csd = CSD_INIT(retrigger_next_event, NULL)
 };
@@ -126,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
 		return likely(base->online);
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);
+
+static void hrtimer_hres_workfn(struct work_struct *work)
+{
+	static_branch_enable(&hrtimer_highres_enabled_key);
+}
+
+static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn);
+
+static inline void hrtimer_schedule_hres_work(void)
+{
+	if (!hrtimer_highres_enabled())
+		schedule_work(&hrtimer_hres_work);
+}
+#else
+static inline void hrtimer_schedule_hres_work(void) { }
+#endif
+
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
  */
 #ifdef CONFIG_SMP
-
 /*
  * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
  * such that hrtimer_callback_running() can unconditionally dereference
  * timer->base->cpu_base
  */
 static struct hrtimer_cpu_base migration_cpu_base = {
-	.clock_base = { {
-		.cpu_base = &migration_cpu_base,
-		.seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
-						     &migration_cpu_base.lock),
-	}, },
+	.clock_base = {
+		[0] = {
+			.cpu_base = &migration_cpu_base,
+			.seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+							     &migration_cpu_base.lock),
+		},
+	},
 };
 
 #define migration_base	migration_cpu_base.clock_base[0]
@@ -159,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = {
  * possible to set timer->base = &migration_base and drop the lock: the timer
  * remains locked.
  */
-static
-struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
-					     unsigned long *flags)
+static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+						    unsigned long *flags)
 	__acquires(&timer->base->lock)
 {
-	struct hrtimer_clock_base *base;
-
 	for (;;) {
-		base = READ_ONCE(timer->base);
+		struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
 		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
@@ -220,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
 	return expires >= new_base->cpu_base->expires_next;
 }
 
-static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
 {
 	if (!hrtimer_base_is_online(base)) {
 		int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
@@ -248,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *
  * the timer callback is currently running.
  */
 static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
-		    int pinned)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned)
 {
 	struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
 	struct hrtimer_clock_base *new_base;
@@ -262,13 +280,12 @@ again:
 
 	if (base != new_base) {
 		/*
-		 * We are trying to move timer to new_base.
-		 * However we can't change timer's base while it is running,
-		 * so we keep it on the same CPU. No hassle vs. reprogramming
-		 * the event source in the high resolution case. The softirq
-		 * code will take care of this when the timer function has
-		 * completed. There is no conflict as we hold the lock until
-		 * the timer is enqueued.
+		 * We are trying to move timer to new_base. However we can't
+		 * change timer's base while it is running, so we keep it on
+		 * the same CPU. No hassle vs. reprogramming the event source
+		 * in the high resolution case. The remote CPU will take care
+		 * of this when the timer function has completed. There is no
+		 * conflict as we hold the lock until the timer is enqueued.
 		 */
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
@@ -278,8 +295,7 @@ again:
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
-					     this_cpu_base)) {
+		if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
 			new_cpu_base = this_cpu_base;
@@ -298,14 +314,13 @@ again:
 
 #else /* CONFIG_SMP */
 
-static inline struct hrtimer_clock_base *
-lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+							   unsigned long *flags)
 	__acquires(&timer->base->cpu_base->lock)
 {
 	struct hrtimer_clock_base *base = timer->base;
 
 	raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
-
 	return base;
 }
 
@@ -340,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div)
 	return dclc < 0 ? -tmp : tmp;
 }
 EXPORT_SYMBOL_GPL(__ktime_divns);
-#endif /* BITS_PER_LONG >= 64 */
+#endif /* BITS_PER_LONG < 64 */
 
 /*
  * Add two ktime values and do a safety check for overflow:
@@ -422,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 	}
 }
 
+/* Stub timer callback for improperly used timers. */
+static enum hrtimer_restart stub_timer(struct hrtimer *unused)
+{
+	WARN_ON_ONCE(1);
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * hrtimer_fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+	struct hrtimer *timer = addr;
+
+	switch (state) {
+	case ODEBUG_STATE_NOTAVAILABLE:
+		hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0);
+		return true;
+	default:
+		return false;
+	}
+}
+
 static const struct debug_obj_descr hrtimer_debug_descr = {
-	.name		= "hrtimer",
-	.debug_hint	= hrtimer_debug_hint,
-	.fixup_init	= hrtimer_fixup_init,
-	.fixup_activate	= hrtimer_fixup_activate,
-	.fixup_free	= hrtimer_fixup_free,
+	.name			= "hrtimer",
+	.debug_hint		= hrtimer_debug_hint,
+	.fixup_init		= hrtimer_fixup_init,
+	.fixup_activate		= hrtimer_fixup_activate,
+	.fixup_free		= hrtimer_fixup_free,
+	.fixup_assert_init	= hrtimer_fixup_assert_init,
 };
 
 static inline void debug_hrtimer_init(struct hrtimer *timer)
@@ -440,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
 	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
 }
 
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
-					  enum hrtimer_mode mode)
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode)
 {
 	debug_object_activate(timer, &hrtimer_debug_descr);
 }
@@ -451,6 +490,11 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
 	debug_object_deactivate(timer, &hrtimer_debug_descr);
 }
 
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer)
+{
+	debug_object_assert_init(timer, &hrtimer_debug_descr);
+}
+
 void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
 	debug_object_free(timer, &hrtimer_debug_descr);
@@ -461,9 +505,9 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
 
 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
 static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
-					  enum hrtimer_mode mode) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { }
 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { }
 #endif
 
 static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
@@ -479,80 +523,80 @@ static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid
 	trace_hrtimer_setup(timer, clockid, mode);
 }
 
-static inline void debug_activate(struct hrtimer *timer,
-				  enum hrtimer_mode mode)
+static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed)
 {
 	debug_hrtimer_activate(timer, mode);
-	trace_hrtimer_start(timer, mode);
+	trace_hrtimer_start(timer, mode, was_armed);
 }
 
-static inline void debug_deactivate(struct hrtimer *timer)
-{
-	debug_hrtimer_deactivate(timer);
-	trace_hrtimer_cancel(timer);
-}
+#define for_each_active_base(base, cpu_base, active)					\
+	for (unsigned int idx = ffs(active); idx--; idx = ffs((active)))		\
+		for (bool done = false; !done; active &= ~(1U << idx))			\
+			for (base = &cpu_base->clock_base[idx]; !done; done = true)
 
-static struct hrtimer_clock_base *
-__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
+#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node)
+
+#if defined(CONFIG_NO_HZ_COMMON)
+/*
+ * Same as hrtimer_bases_next_event() below, but skips the excluded timer and
+ * does not update cpu_base->next_timer/expires.
+ */
+static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
+						const struct hrtimer *exclude,
+						unsigned int active, ktime_t expires_next)
 {
-	unsigned int idx;
+	struct hrtimer_clock_base *base;
+	ktime_t expires;
 
-	if (!*active)
-		return NULL;
+	lockdep_assert_held(&cpu_base->lock);
 
-	idx = __ffs(*active);
-	*active &= ~(1U << idx);
+	for_each_active_base(base, cpu_base, active) {
+		expires = ktime_sub(base->expires_next, base->offset);
+		if (expires >= expires_next)
+			continue;
+
+		/*
+		 * If the excluded timer is the first on this base evaluate the
+		 * next timer.
+		 */
+		struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);
 
-	return &cpu_base->clock_base[idx];
+		if (unlikely(&exclude->node == node)) {
+			node = timerqueue_linked_next(node);
+			if (!node)
+				continue;
+			expires = ktime_sub(node->expires, base->offset);
+			if (expires >= expires_next)
+				continue;
+		}
+		expires_next = expires;
+	}
+	/* If base->offset changed, the result might be negative */
+	return max(expires_next, 0);
 }
+#endif
 
-#define for_each_active_base(base, cpu_base, active)	\
-	while ((base = __next_base((cpu_base), &(active))))
+static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
+{
+	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
 
-static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
-					 const struct hrtimer *exclude,
-					 unsigned int active,
-					 ktime_t expires_next)
+	return hrtimer_from_timerqueue_node(next);
+}
+
+/* Find the base with the earliest expiry */
+static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active,
+				ktime_t *expires_next, struct hrtimer **next_timer)
 {
 	struct hrtimer_clock_base *base;
 	ktime_t expires;
 
 	for_each_active_base(base, cpu_base, active) {
-		struct timerqueue_node *next;
-		struct hrtimer *timer;
-
-		next = timerqueue_getnext(&base->active);
-		timer = container_of(next, struct hrtimer, node);
-		if (timer == exclude) {
-			/* Get to the next timer in the queue. */
-			next = timerqueue_iterate_next(next);
-			if (!next)
-				continue;
-
-			timer = container_of(next, struct hrtimer, node);
-		}
-		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-		if (expires < expires_next) {
-			expires_next = expires;
-
-			/* Skip cpu_base update if a timer is being excluded. */
-			if (exclude)
-				continue;
-
-			if (timer->is_soft)
-				cpu_base->softirq_next_timer = timer;
-			else
-				cpu_base->next_timer = timer;
+		expires = ktime_sub(base->expires_next, base->offset);
+		if (expires < *expires_next) {
+			*expires_next = expires;
+			*next_timer = clock_base_next_timer(base);
 		}
 	}
-	/*
-	 * clock_was_set() might have changed base->offset of any of
-	 * the clock bases so the result might be negative. Fix it up
-	 * to prevent a false positive in clockevents_program_event().
-	 */
-	if (expires_next < 0)
-		expires_next = 0;
-	return expires_next;
 }
 
 /*
@@ -575,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
  *  - HRTIMER_ACTIVE_SOFT, or
  *  - HRTIMER_ACTIVE_HARD.
  */
-static ktime_t
-__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
 {
-	unsigned int active;
 	struct hrtimer *next_timer = NULL;
 	ktime_t expires_next = KTIME_MAX;
+	unsigned int active;
+
+	lockdep_assert_held(&cpu_base->lock);
 
 	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
 		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
-		cpu_base->softirq_next_timer = NULL;
-		expires_next = __hrtimer_next_event_base(cpu_base, NULL,
-							 active, KTIME_MAX);
-
-		next_timer = cpu_base->softirq_next_timer;
+		if (active)
+			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
+		cpu_base->softirq_next_timer = next_timer;
 	}
 
 	if (active_mask & HRTIMER_ACTIVE_HARD) {
 		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+		if (active)
+			hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
 		cpu_base->next_timer = next_timer;
-		expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
-							 expires_next);
 	}
-
-	return expires_next;
+	return max(expires_next, 0);
 }
 
 static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
@@ -638,8 +680,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
 
-	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
-					    offs_real, offs_boot, offs_tai);
+	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real,
+						   offs_boot, offs_tai);
 
 	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
 	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
@@ -649,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 }
 
 /*
- * Is the high resolution mode active ?
+ * Is the high resolution mode active in the CPU base. This cannot use the
+ * static key as the CPUs are switched to high resolution mode
+ * asynchronously.
  */
 static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
 {
@@ -657,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
 		cpu_base->hres_active : 0;
 }
 
-static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
-				struct hrtimer *next_timer,
+static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
+{
+	trace_hrtimer_rearm(expires_next, deferred);
+	tick_program_event(expires_next, 1);
+}
+
+static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
 				ktime_t expires_next)
 {
 	cpu_base->expires_next = expires_next;
@@ -683,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
 	if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
 		return;
 
-	tick_program_event(expires_next, 1);
+	hrtimer_rearm_event(expires_next, false);
 }
 
-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
-static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+/* Reprogram the event source with a evaluation of all clock bases */
+static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
 {
-	ktime_t expires_next;
-
-	expires_next = hrtimer_update_next_event(cpu_base);
+	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
 
 	if (skip_equal && expires_next == cpu_base->expires_next)
 		return;
@@ -707,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
-/*
- * High resolution timer enabled ?
- */
+/* High resolution timer enabled ? */
 static bool hrtimer_hres_enabled __read_mostly  = true;
 unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
 EXPORT_SYMBOL_GPL(hrtimer_resolution);
 
-/*
- * Enable / Disable high resolution mode
- */
+/* Enable / Disable high resolution mode */
 static int __init setup_hrtimer_hres(char *str)
 {
 	return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
 }
-
 __setup("highres=", setup_hrtimer_hres);
 
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
+/* hrtimer_high_res_enabled - query, if the highres mode is enabled */
+static inline bool hrtimer_is_hres_enabled(void)
 {
 	return hrtimer_hres_enabled;
 }
 
-/*
- * Switch to high resolution mode
- */
+/* Switch to high resolution mode */
 static void hrtimer_switch_to_hres(void)
 {
 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
 	if (tick_init_highres()) {
-		pr_warn("Could not switch to high resolution mode on CPU %u\n",
-			base->cpu);
+		pr_warn("Could not switch to high resolution mode on CPU %u\n",	base->cpu);
 		return;
 	}
-	base->hres_active = 1;
+	base->hres_active = true;
 	hrtimer_resolution = HIGH_RES_NSEC;
 
 	tick_setup_sched_timer(true);
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
+	hrtimer_schedule_hres_work();
 }
 
 #else
 
-static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline bool hrtimer_is_hres_enabled(void) { return 0; }
 static inline void hrtimer_switch_to_hres(void) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
+
 /*
  * Retrigger next event is called after clock was set with interrupts
  * disabled through an SMP function call or directly from low level
@@ -792,13 +826,12 @@ static void retrigger_next_event(void *arg)
 	 * In periodic low resolution mode, the next softirq expiration
 	 * must also be updated.
 	 */
-	raw_spin_lock(&base->lock);
+	guard(raw_spinlock)(&base->lock);
 	hrtimer_update_base(base);
 	if (hrtimer_hres_active(base))
-		hrtimer_force_reprogram(base, 0);
+		hrtimer_force_reprogram(base, /* skip_equal */ false);
 	else
 		hrtimer_update_next_event(base);
-	raw_spin_unlock(&base->lock);
 }
 
 /*
@@ -812,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	struct hrtimer_clock_base *base = timer->base;
-	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+	ktime_t expires = hrtimer_get_expires(timer);
 
-	WARN_ON_ONCE(hrtimer_get_expires(timer) < 0);
+	WARN_ON_ONCE(expires < 0);
 
+	expires = ktime_sub(expires, base->offset);
 	/*
 	 * CLOCK_REALTIME timer might be requested with an absolute
 	 * expiry time which is less than base->offset. Set it to 0.
@@ -842,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
 		timer_cpu_base->softirq_next_timer = timer;
 		timer_cpu_base->softirq_expires_next = expires;
 
-		if (!ktime_before(expires, timer_cpu_base->expires_next) ||
-		    !reprogram)
+		if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram)
 			return;
 	}
 
@@ -857,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
 	if (expires >= cpu_base->expires_next)
 		return;
 
-	/*
-	 * If the hrtimer interrupt is running, then it will reevaluate the
-	 * clock bases and reprogram the clock event device.
-	 */
-	if (cpu_base->in_hrtirq)
+	/* If a deferred rearm is pending skip reprogramming the device */
+	if (cpu_base->deferred_rearm)
 		return;
 
 	cpu_base->next_timer = timer;
@@ -869,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
 	__hrtimer_reprogram(cpu_base, timer, expires);
 }
 
-static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
-			     unsigned int active)
+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active)
 {
 	struct hrtimer_clock_base *base;
 	unsigned int seq;
@@ -896,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
 	if (seq == cpu_base->clock_was_set_seq)
 		return false;
 
-	/*
-	 * If the remote CPU is currently handling an hrtimer interrupt, it
-	 * will reevaluate the first expiring timer of all clock bases
-	 * before reprogramming. Nothing to do here.
-	 */
-	if (cpu_base->in_hrtirq)
+	/* If a deferred rearm is pending the remote CPU will take care of it */
+	if (cpu_base->deferred_rearm) {
+		cpu_base->deferred_needs_update = true;
 		return false;
+	}
 
 	/*
 	 * Walk the affected clock bases and check whether the first expiring
@@ -913,9 +940,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
 	active &= cpu_base->active_bases;
 
 	for_each_active_base(base, cpu_base, active) {
-		struct timerqueue_node *next;
+		struct timerqueue_linked_node *next;
 
-		next = timerqueue_getnext(&base->active);
+		next = timerqueue_linked_first(&base->active);
 		expires = ktime_sub(next->expires, base->offset);
 		if (expires < cpu_base->expires_next)
 			return true;
@@ -947,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
  */
 void clock_was_set(unsigned int bases)
 {
-	struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
 	cpumask_var_t mask;
-	int cpu;
 
-	if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active())
+	if (!hrtimer_highres_enabled() && !tick_nohz_is_active())
 		goto out_timerfd;
 
 	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -960,23 +985,19 @@ void clock_was_set(unsigned int bases)
 	}
 
 	/* Avoid interrupting CPUs if possible */
-	cpus_read_lock();
-	for_each_online_cpu(cpu) {
-		unsigned long flags;
-
-		cpu_base = &per_cpu(hrtimer_bases, cpu);
-		raw_spin_lock_irqsave(&cpu_base->lock, flags);
+	scoped_guard(cpus_read_lock) {
+		int cpu;
 
-		if (update_needs_ipi(cpu_base, bases))
-			cpumask_set_cpu(cpu, mask);
+		for_each_online_cpu(cpu) {
+			struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
 
-		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+			guard(raw_spinlock_irqsave)(&cpu_base->lock);
+			if (update_needs_ipi(cpu_base, bases))
+				cpumask_set_cpu(cpu, mask);
+		}
+		scoped_guard(preempt)
+			smp_call_function_many(mask, retrigger_next_event, NULL, 1);
 	}
-
-	preempt_disable();
-	smp_call_function_many(mask, retrigger_next_event, NULL, 1);
-	preempt_enable();
-	cpus_read_unlock();
 	free_cpumask_var(mask);
 
 out_timerfd:
@@ -1011,11 +1032,8 @@ void hrtimers_resume_local(void)
 	retrigger_next_event(NULL);
 }
 
-/*
- * Counterpart to lock_hrtimer_base above:
- */
-static inline
-void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+/* Counterpart to lock_hrtimer_base above */
+static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 	__releases(&timer->base->cpu_base->lock)
 {
 	raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
@@ -1032,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  * .. note::
  *  This only updates the timer expiry value and does not requeue the timer.
  *
- * There is also a variant of the function hrtimer_forward_now().
+ * There is also a variant of this function: hrtimer_forward_now().
  *
  * Context: Can be safely called from the callback function of @timer. If called
  *          from other contexts @timer must neither be enqueued nor running the
@@ -1042,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  */
 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
-	u64 orun = 1;
 	ktime_t delta;
+	u64 orun = 1;
 
 	delta = ktime_sub(now, hrtimer_get_expires(timer));
 
 	if (delta < 0)
 		return 0;
 
-	if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+	if (WARN_ON(timer->is_queued))
 		return 0;
 
 	if (interval < hrtimer_resolution)
@@ -1079,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
  * enqueue_hrtimer - internal function to (re)start a timer
  *
  * The timer is inserted in expiry order. Insertion into the
- * red black tree is O(log(n)). Must hold the base lock.
+ * red black tree is O(log(n)).
  *
  * Returns true when the new timer is the leftmost timer in the tree.
  */
 static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
-			    enum hrtimer_mode mode)
+			    enum hrtimer_mode mode, bool was_armed)
 {
-	debug_activate(timer, mode);
+	lockdep_assert_held(&base->cpu_base->lock);
+
+	debug_activate(timer, mode, was_armed);
 	WARN_ON_ONCE(!base->cpu_base->online);
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
 	/* Pairs with the lockless read in hrtimer_is_queued() */
-	WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
+	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+	if (!timerqueue_linked_add(&base->active, &timer->node))
+		return false;
+
+	base->expires_next = hrtimer_get_expires(timer);
+	return true;
+}
 
-	return timerqueue_add(&base->active, &timer->node);
+static inline void base_update_next_timer(struct hrtimer_clock_base *base)
+{
+	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+	base->expires_next = next ? next->expires : KTIME_MAX;
 }
 
 /*
  * __remove_hrtimer - internal function to remove a timer
  *
- * Caller must hold the base lock.
- *
  * High resolution timer mode reprograms the clock event device when the
  * timer is the one which expires next. The caller can disable this by setting
  * reprogram to zero. This is useful, when the context does a reprogramming
  * anyway (e.g. timer interrupt)
  */
-static void __remove_hrtimer(struct hrtimer *timer,
-			     struct hrtimer_clock_base *base,
-			     u8 newstate, int reprogram)
+static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+			     bool newstate, bool reprogram)
 {
 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
-	u8 state = timer->state;
+	bool was_first;
 
-	/* Pairs with the lockless read in hrtimer_is_queued() */
-	WRITE_ONCE(timer->state, newstate);
-	if (!(state & HRTIMER_STATE_ENQUEUED))
+	lockdep_assert_held(&cpu_base->lock);
+
+	if (!timer->is_queued)
 		return;
 
-	if (!timerqueue_del(&base->active, &timer->node))
+	/* Pairs with the lockless read in hrtimer_is_queued() */
+	WRITE_ONCE(timer->is_queued, newstate);
+
+	was_first = !timerqueue_linked_prev(&timer->node);
+
+	if (!timerqueue_linked_del(&base->active, &timer->node))
 		cpu_base->active_bases &= ~(1 << base->index);
 
+	/* Nothing to update if this was not the first timer in the base */
+	if (!was_first)
+		return;
+
+	base_update_next_timer(base);
+
 	/*
-	 * Note: If reprogram is false we do not update
-	 * cpu_base->next_timer. This happens when we remove the first
-	 * timer on a remote cpu. No harm as we never dereference
-	 * cpu_base->next_timer. So the worst thing what can happen is
-	 * an superfluous call to hrtimer_force_reprogram() on the
-	 * remote cpu later on if the same timer gets enqueued again.
+	 * If reprogram is false don't update cpu_base->next_timer and do not
+	 * touch the clock event device.
+	 *
+	 * This happens when removing the first timer on a remote CPU, which
+	 * will be handled by the remote CPU's interrupt. It also happens when
+	 * a local timer is removed to be immediately restarted. That's handled
+	 * at the call site.
 	 */
-	if (reprogram && timer == cpu_base->next_timer)
-		hrtimer_force_reprogram(cpu_base, 1);
+	if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
+		return;
+
+	if (cpu_base->deferred_rearm)
+		cpu_base->deferred_needs_update = true;
+	else
+		hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
 }
 
-/*
- * remove hrtimer, called with base lock held
- */
-static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
-	       bool restart, bool keep_local)
+static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+				  bool newstate)
 {
-	u8 state = timer->state;
+	lockdep_assert_held(&base->cpu_base->lock);
 
-	if (state & HRTIMER_STATE_ENQUEUED) {
+	if (timer->is_queued) {
 		bool reprogram;
 
+		debug_hrtimer_deactivate(timer);
+
 		/*
 		 * Remove the timer and force reprogramming when high
 		 * resolution mode is active and the timer is on the current
@@ -1154,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		 * reprogramming happens in the interrupt handler. This is a
 		 * rare case and less expensive than a smp call.
 		 */
-		debug_deactivate(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-		/*
-		 * If the timer is not restarted then reprogramming is
-		 * required if the timer is local. If it is local and about
-		 * to be restarted, avoid programming it twice (on removal
-		 * and a moment later when it's requeued).
-		 */
-		if (!restart)
-			state = HRTIMER_STATE_INACTIVE;
-		else
-			reprogram &= !keep_local;
+		__remove_hrtimer(timer, base, newstate, reprogram);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Update in place has to retrieve the expiry times of the neighbour nodes
+ * if they exist. That is cache line neutral because the dequeue/enqueue
+ * operation is going to need the same cache lines. But there is a big win
+ * when the dequeue/enqueue can be avoided because the RB tree does not
+ * have to be rebalanced twice.
+ */
+static inline bool
+hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
+{
+	struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
+	struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);
+
+	/* If the new expiry goes behind the next timer, requeue is required */
+	if (next && expires > next->expires)
+		return false;
+
+	/* If this is the first timer, update in place */
+	if (!prev)
+		return true;
+
+	/* Update in place when it does not go ahead of the previous one */
+	return expires >= prev->expires;
+}
+
+static inline bool
+remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+			     const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
+{
+	bool was_first = false;
+
+	/* Remove it from the timer queue if active */
+	if (timer->is_queued) {
+		was_first = !timerqueue_linked_prev(&timer->node);
+
+		/* Try to update in place to avoid the de/enqueue dance */
+		if (hrtimer_can_update_in_place(timer, base, expires)) {
+			hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+			trace_hrtimer_start(timer, mode, true);
+			if (was_first)
+				base->expires_next = expires;
+			return was_first;
+		}
 
-		__remove_hrtimer(timer, base, state, reprogram);
-		return 1;
+		debug_hrtimer_deactivate(timer);
+		timerqueue_linked_del(&base->active, &timer->node);
 	}
-	return 0;
+
+	/* Set the new expiry time */
+	hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+
+	debug_activate(timer, mode, timer->is_queued);
+	base->cpu_base->active_bases |= 1 << base->index;
+
+	/* Pairs with the lockless read in hrtimer_is_queued() */
+	WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+	/* If it's the first expiring timer now or again, update base */
+	if (timerqueue_linked_add(&base->active, &timer->node)) {
+		base->expires_next = expires;
+		return true;
+	}
+
+	if (was_first)
+		base_update_next_timer(base);
+
+	return false;
 }
 
 static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
@@ -1190,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
 	return tim;
 }
 
-static void
-hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
+static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
 {
-	ktime_t expires;
-
-	/*
-	 * Find the next SOFT expiration.
-	 */
-	expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+	ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
 
 	/*
-	 * reprogramming needs to be triggered, even if the next soft
-	 * hrtimer expires at the same time than the next hard
+	 * Reprogramming needs to be triggered, even if the next soft
+	 * hrtimer expires at the same time as the next hard
 	 * hrtimer. cpu_base->softirq_expires_next needs to be updated!
 	 */
 	if (expires == KTIME_MAX)
 		return;
 
 	/*
-	 * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
-	 * cpu_base->*expires_next is only set by hrtimer_reprogram()
+	 * cpu_base->next_timer is recomputed by __hrtimer_get_next_event()
+	 * cpu_base->expires_next is only set by hrtimer_reprogram()
 	 */
 	hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
 }
 
-static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-				    u64 delta_ns, const enum hrtimer_mode mode,
-				    struct hrtimer_clock_base *base)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+	if (static_branch_likely(&timers_migration_enabled)) {
+		/*
+		 * If it is local and the first expiring timer keep it on the local
+		 * CPU to optimize reprogramming of the clockevent device. Also
+		 * avoid switch_hrtimer_base() overhead when local and pinned.
+		 */
+		if (!is_local)
+			return false;
+		if (is_first || is_pinned)
+			return true;
+
+		/* Honour the NOHZ full restrictions */
+		if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
+			return false;
+
+		/*
+		 * If the tick is not stopped or need_resched() is set, then
+		 * there is no point in moving the timer somewhere else.
+		 */
+		return !tick_nohz_tick_stopped() || need_resched();
+	}
+	return is_local;
+}
+#else
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+	return is_local;
+}
+#endif
+
+static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
+				     bool is_pinned)
+{
+	/* If the timer is running the callback it has to stay on its CPU base. */
+	if (unlikely(timer->base->running == timer))
+		return true;
+
+	return hrtimer_prefer_local(is_local, is_first, is_pinned);
+}
+
+static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+				     const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
-	struct hrtimer_clock_base *new_base;
-	bool force_local, first;
+	bool is_pinned, first, was_first, keep_base = false;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 
-	/*
-	 * If the timer is on the local cpu base and is the first expiring
-	 * timer then this might end up reprogramming the hardware twice
-	 * (on removal and on enqueue). To avoid that by prevent the
-	 * reprogram on removal, keep the timer local to the current CPU
-	 * and enforce reprogramming after it is queued no matter whether
-	 * it is the new first expiring timer again or not.
-	 */
-	force_local = base->cpu_base == this_cpu_base;
-	force_local &= base->cpu_base->next_timer == timer;
+	was_first = cpu_base->next_timer == timer;
+	is_pinned = !!(mode & HRTIMER_MODE_PINNED);
 
 	/*
-	 * Don't force local queuing if this enqueue happens on a unplugged
-	 * CPU after hrtimer_cpu_dying() has been invoked.
+	 * Don't keep it local if this enqueue happens on a unplugged CPU
+	 * after hrtimer_cpu_dying() has been invoked.
 	 */
-	force_local &= this_cpu_base->online;
+	if (likely(this_cpu_base->online)) {
+		bool is_local = cpu_base == this_cpu_base;
+
+		keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
+	}
+
+	/* Calculate absolute expiry time for relative timers */
+	if (mode & HRTIMER_MODE_REL)
+		tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+	/* Compensate for low resolution granularity */
+	tim = hrtimer_update_lowres(timer, tim, mode);
 
 	/*
 	 * Remove an active timer from the queue. In case it is not queued
@@ -1250,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	 * reprogramming later if it was the first expiring timer.  This
 	 * avoids programming the underlying clock event twice (once at
 	 * removal and once after enqueue).
+	 *
+	 * @keep_base is also true if the timer callback is running on a
+	 * remote CPU and for local pinned timers.
 	 */
-	remove_hrtimer(timer, base, true, force_local);
+	if (likely(keep_base)) {
+		first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
+	} else {
+		/* Keep the ENQUEUED state in case it is queued */
+		bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);
 
-	if (mode & HRTIMER_MODE_REL)
-		tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+		hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
-	tim = hrtimer_update_lowres(timer, tim, mode);
+		/* Switch the timer base, if necessary: */
+		base = switch_hrtimer_base(timer, base, is_pinned);
+		cpu_base = base->cpu_base;
 
-	hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+		first = enqueue_hrtimer(timer, base, mode, was_armed);
+	}
 
-	/* Switch the timer base, if necessary: */
-	if (!force_local) {
-		new_base = switch_hrtimer_base(timer, base,
-					       mode & HRTIMER_MODE_PINNED);
-	} else {
-		new_base = base;
+	/* If a deferred rearm is pending skip reprogramming the device */
+	if (cpu_base->deferred_rearm) {
+		cpu_base->deferred_needs_update = true;
+		return false;
 	}
 
-	first = enqueue_hrtimer(timer, new_base, mode);
-	if (!force_local) {
+	if (!was_first || cpu_base != this_cpu_base) {
 		/*
-		 * If the current CPU base is online, then the timer is
-		 * never queued on a remote CPU if it would be the first
-		 * expiring timer there.
+		 * If the current CPU base is online, then the timer is never
+		 * queued on a remote CPU if it would be the first expiring
+		 * timer there unless the timer callback is currently executed
+		 * on the remote CPU. In the latter case the remote CPU will
+		 * re-evaluate the first expiring timer after completing the
+		 * callbacks.
 		 */
-		if (hrtimer_base_is_online(this_cpu_base))
+		if (likely(hrtimer_base_is_online(this_cpu_base)))
 			return first;
 
 		/*
@@ -1283,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * already offline. If the timer is the first to expire,
 		 * kick the remote CPU to reprogram the clock event.
 		 */
-		if (first) {
-			struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+		if (first)
+			smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
+		return false;
+	}
 
-			smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
-		}
-		return 0;
+	/*
+	 * Special case for the HRTICK timer. It is frequently rearmed and most
+	 * of the time moves the expiry into the future. That's expensive in
+	 * virtual machines and it's better to take the pointless already armed
+	 * interrupt than reprogramming the hardware on every context switch.
+	 *
+	 * If the new expiry is before the armed time, then reprogramming is
+	 * required.
+	 */
+	if (timer->is_lazy) {
+		if (cpu_base->expires_next <= hrtimer_get_expires(timer))
+			return false;
 	}
 
 	/*
-	 * Timer was forced to stay on the current CPU to avoid
-	 * reprogramming on removal and enqueue. Force reprogram the
-	 * hardware by evaluating the new first expiring timer.
+	 * Timer was the first expiring timer and forced to stay on the
+	 * current CPU to avoid reprogramming on removal and enqueue. Force
+	 * reprogram the hardware by evaluating the new first expiring
+	 * timer.
 	 */
-	hrtimer_force_reprogram(new_base->cpu_base, 1);
-	return 0;
+	hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
+	return false;
 }
 
 /**
@@ -1309,12 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
  *		relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
  *		softirq based mode is considered for debug purpose only!
  */
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-			    u64 delta_ns, const enum hrtimer_mode mode)
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+			    const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base;
 	unsigned long flags;
 
+	debug_hrtimer_assert_init(timer);
+
 	/*
 	 * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
 	 * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
@@ -1362,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 
 	base = lock_hrtimer_base(timer, &flags);
 
-	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base, false, false);
+	if (!hrtimer_callback_running(timer)) {
+		ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
+		if (ret)
+			trace_hrtimer_cancel(timer);
+	}
 
 	unlock_hrtimer_base(timer, &flags);
 
@@ -1397,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
  * the timer callback to finish. Drop expiry_lock and reacquire it. That
  * allows the waiter to acquire the lock and make progress.
  */
-static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
-				      unsigned long flags)
+static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
 {
 	if (atomic_read(&cpu_base->timer_waiters)) {
 		raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1463,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
 	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
 }
 #else
-static inline void
-hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
-					     unsigned long flags) { }
+static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
 #endif
 
 /**
@@ -1526,15 +1685,11 @@ u64 hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	u64 expires = KTIME_MAX;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
+	guard(raw_spinlock_irqsave)(&cpu_base->lock);
 	if (!hrtimer_hres_active(cpu_base))
 		expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
 
-	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
 	return expires;
 }
 
@@ -1549,26 +1704,20 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	u64 expires = KTIME_MAX;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&cpu_base->lock, flags);
-
-	if (hrtimer_hres_active(cpu_base)) {
-		unsigned int active;
+	unsigned int active;
 
-		if (!cpu_base->softirq_activated) {
-			active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
-			expires = __hrtimer_next_event_base(cpu_base, exclude,
-							    active, KTIME_MAX);
-		}
-		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
-		expires = __hrtimer_next_event_base(cpu_base, exclude, active,
-						    expires);
-	}
+	guard(raw_spinlock_irqsave)(&cpu_base->lock);
+	if (!hrtimer_hres_active(cpu_base))
+		return expires;
 
-	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+	active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+	if (active && !cpu_base->softirq_activated)
+		expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);
 
-	return expires;
+	active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+	if (!active)
+		return expires;
+	return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
 }
 #endif
 
@@ -1612,8 +1761,7 @@ ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
 
-static void __hrtimer_setup(struct hrtimer *timer,
-			    enum hrtimer_restart (*function)(struct hrtimer *),
+static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
 			    clockid_t clock_id, enum hrtimer_mode mode)
 {
 	bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
@@ -1645,13 +1793,14 @@ static void __hrtimer_setup(struct hrtimer *timer,
 	base += hrtimer_clockid_to_base(clock_id);
 	timer->is_soft = softtimer;
 	timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
+	timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
 	timer->base = &cpu_base->clock_base[base];
-	timerqueue_init(&timer->node);
+	timerqueue_linked_init(&timer->node);
 
-	if (WARN_ON_ONCE(!function))
+	if (WARN_ON_ONCE(!fn))
 		ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
 	else
-		ACCESS_PRIVATE(timer, function) = function;
+		ACCESS_PRIVATE(timer, function) = fn;
 }
 
 /**
@@ -1710,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer)
 		base = READ_ONCE(timer->base);
 		seq = raw_read_seqcount_begin(&base->seq);
 
-		if (timer->state != HRTIMER_STATE_INACTIVE ||
-		    base->running == timer)
+		if (timer->is_queued || base->running == timer)
 			return true;
 
-	} while (read_seqcount_retry(&base->seq, seq) ||
-		 base != READ_ONCE(timer->base));
+	} while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));
 
 	return false;
 }
@@ -1729,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
  *  - callback:	the timer is being ran
  *  - post:	the timer is inactive or (re)queued
  *
- * On the read side we ensure we observe timer->state and cpu_base->running
+ * On the read side we ensure we observe timer->is_queued and cpu_base->running
  * from the same section, if anything changed while we looked at it, we retry.
  * This includes timer->base changing because sequence numbers alone are
  * insufficient for that.
@@ -1738,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
  * a false negative if the read side got smeared over multiple consecutive
  * __run_hrtimer() invocations.
  */
-
-static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
-			  struct hrtimer_clock_base *base,
-			  struct hrtimer *timer, ktime_t *now,
-			  unsigned long flags) __must_hold(&cpu_base->lock)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
+			  struct hrtimer *timer, ktime_t now, unsigned long flags)
+	__must_hold(&cpu_base->lock)
 {
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	bool expires_in_hardirq;
@@ -1754,15 +1899,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	base->running = timer;
 
 	/*
-	 * Separate the ->running assignment from the ->state assignment.
+	 * Separate the ->running assignment from the ->is_queued assignment.
 	 *
 	 * As with a regular write barrier, this ensures the read side in
 	 * hrtimer_active() cannot observe base->running == NULL &&
-	 * timer->state == INACTIVE.
+	 * timer->is_queued == INACTIVE.
 	 */
 	raw_write_seqcount_barrier(&base->seq);
 
-	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
 	fn = ACCESS_PRIVATE(timer, function);
 
 	/*
@@ -1797,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
 	 * for us already.
 	 */
-	if (restart != HRTIMER_NORESTART &&
-	    !(timer->state & HRTIMER_STATE_ENQUEUED))
-		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+	if (restart == HRTIMER_RESTART && !timer->is_queued)
+		enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);
 
 	/*
-	 * Separate the ->running assignment from the ->state assignment.
+	 * Separate the ->running assignment from the ->is_queued assignment.
 	 *
 	 * As with a regular write barrier, this ensures the read side in
 	 * hrtimer_active() cannot observe base->running.timer == NULL &&
-	 * timer->state == INACTIVE.
+	 * timer->is_queued == INACTIVE.
 	 */
 	raw_write_seqcount_barrier(&base->seq);
 
@@ -1814,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	base->running = NULL;
 }
 
+static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
+{
+	struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+	return next ? hrtimer_from_timerqueue_node(next) : NULL;
+}
+
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
 				 unsigned long flags, unsigned int active_mask)
 {
-	struct hrtimer_clock_base *base;
 	unsigned int active = cpu_base->active_bases & active_mask;
+	struct hrtimer_clock_base *base;
 
 	for_each_active_base(base, cpu_base, active) {
-		struct timerqueue_node *node;
-		ktime_t basenow;
-
-		basenow = ktime_add(now, base->offset);
-
-		while ((node = timerqueue_getnext(&base->active))) {
-			struct hrtimer *timer;
-
-			timer = container_of(node, struct hrtimer, node);
+		ktime_t basenow = ktime_add(now, base->offset);
+		struct hrtimer *timer;
 
+		while ((timer = clock_base_next_timer(base))) {
 			/*
 			 * The immediate goal for using the softexpires is
 			 * minimizing wakeups, not running timers at the
@@ -1846,7 +1991,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
 			if (basenow < hrtimer_get_softexpires(timer))
 				break;
 
-			__run_hrtimer(cpu_base, base, timer, &basenow, flags);
+			__run_hrtimer(cpu_base, base, timer, basenow, flags);
 			if (active_mask == HRTIMER_ACTIVE_SOFT)
 				hrtimer_sync_wait_running(cpu_base, flags);
 		}
@@ -1865,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void)
 	now = hrtimer_update_base(cpu_base);
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
 
-	cpu_base->softirq_activated = 0;
+	cpu_base->softirq_activated = false;
 	hrtimer_update_softirq_timer(cpu_base, true);
 
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1875,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void)
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
+ * Very similar to hrtimer_force_reprogram(), except it deals with
+ * deferred_rearm and hang_detected.
+ */
+static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
+{
+	cpu_base->expires_next = expires_next;
+	cpu_base->deferred_rearm = false;
+
+	if (unlikely(cpu_base->hang_detected)) {
+		/*
+		 * Give the system a chance to do something else than looping
+		 * on hrtimer interrupts.
+		 */
+		expires_next = ktime_add_ns(ktime_get(),
+					    min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time));
+	}
+	hrtimer_rearm_event(expires_next, deferred);
+}
+
+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+void __hrtimer_rearm_deferred(void)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t expires_next;
+
+	if (!cpu_base->deferred_rearm)
+		return;
+
+	guard(raw_spinlock)(&cpu_base->lock);
+	if (cpu_base->deferred_needs_update) {
+		hrtimer_update_base(cpu_base);
+		expires_next = hrtimer_update_next_event(cpu_base);
+	} else {
+		/* No timer added/removed. Use the cached value */
+		expires_next = cpu_base->deferred_expires_next;
+	}
+	hrtimer_rearm(cpu_base, expires_next, true);
+}
+
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+	/* hrtimer_interrupt() just re-evaluated the first expiring timer */
+	cpu_base->deferred_needs_update = false;
+	/* Cache the expiry time */
+	cpu_base->deferred_expires_next = expires_next;
+	set_thread_flag(TIF_HRTIMER_REARM);
+}
+#else  /* CONFIG_HRTIMER_REARM_DEFERRED */
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+	hrtimer_rearm(cpu_base, expires_next, false);
+}
+#endif  /* !CONFIG_HRTIMER_REARM_DEFERRED */
+
+/*
  * High resolution timer interrupt
  * Called with interrupts disabled
  */
@@ -1888,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	BUG_ON(!cpu_base->hres_active);
 	cpu_base->nr_events++;
 	dev->next_event = KTIME_MAX;
+	dev->next_event_forced = 0;
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 	entry_time = now = hrtimer_update_base(cpu_base);
 retry:
-	cpu_base->in_hrtirq = 1;
+	cpu_base->deferred_rearm = true;
 	/*
-	 * We set expires_next to KTIME_MAX here with cpu_base->lock
-	 * held to prevent that a timer is enqueued in our queue via
-	 * the migration code. This does not affect enqueueing of
-	 * timers which run their callback and need to be requeued on
-	 * this CPU.
+	 * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue
+	 * timers while __hrtimer_run_queues() is expiring the clock bases.
+	 * Timers which are re/enqueued on the local CPU are not affected by
+	 * this.
 	 */
 	cpu_base->expires_next = KTIME_MAX;
 
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
-		cpu_base->softirq_activated = 1;
+		cpu_base->softirq_activated = true;
 		raise_timer_softirq(HRTIMER_SOFTIRQ);
 	}
 
 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
 
-	/* Reevaluate the clock bases for the [soft] next expiry */
-	expires_next = hrtimer_update_next_event(cpu_base);
-	/*
-	 * Store the new expiry value so the migration code can verify
-	 * against it.
-	 */
-	cpu_base->expires_next = expires_next;
-	cpu_base->in_hrtirq = 0;
-	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
-	/* Reprogramming necessary ? */
-	if (!tick_program_event(expires_next, 0)) {
-		cpu_base->hang_detected = 0;
-		return;
-	}
-
 	/*
 	 * The next timer was already expired due to:
 	 * - tracing
 	 * - long lasting callbacks
 	 * - being scheduled away when running in a VM
 	 *
-	 * We need to prevent that we loop forever in the hrtimer
-	 * interrupt routine. We give it 3 attempts to avoid
-	 * overreacting on some spurious event.
-	 *
-	 * Acquire base lock for updating the offsets and retrieving
-	 * the current time.
+	 * We need to prevent that we loop forever in the hrtiner interrupt
+	 * routine. We give it 3 attempts to avoid overreacting on some
+	 * spurious event.
 	 */
-	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 	now = hrtimer_update_base(cpu_base);
-	cpu_base->nr_retries++;
-	if (++retries < 3)
-		goto retry;
-	/*
-	 * Give the system a chance to do something else than looping
-	 * here. We stored the entry time, so we know exactly how long
-	 * we spent here. We schedule the next event this amount of
-	 * time away.
-	 */
-	cpu_base->nr_hangs++;
-	cpu_base->hang_detected = 1;
-	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+	expires_next = hrtimer_update_next_event(cpu_base);
+	cpu_base->hang_detected = false;
+	if (expires_next < now) {
+		if (++retries < 3)
+			goto retry;
+
+		delta = ktime_sub(now, entry_time);
+		cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
+		cpu_base->nr_hangs++;
+		cpu_base->hang_detected = true;
+	}
 
-	delta = ktime_sub(now, entry_time);
-	if ((unsigned int)delta > cpu_base->max_hang_time)
-		cpu_base->max_hang_time = (unsigned int) delta;
-	/*
-	 * Limit it to a sensible value as we enforce a longer
-	 * delay. Give the CPU at least 100ms to catch up.
-	 */
-	if (delta > 100 * NSEC_PER_MSEC)
-		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
-	else
-		expires_next = ktime_add(now, delta);
-	tick_program_event(expires_next, 1);
-	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
+	hrtimer_interrupt_rearm(cpu_base, expires_next);
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 }
+
 #endif /* !CONFIG_HIGH_RES_TIMERS */
 
 /*
@@ -1999,7 +2170,7 @@ void hrtimer_run_queues(void)
 
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
-		cpu_base->softirq_activated = 1;
+		cpu_base->softirq_activated = true;
 		raise_timer_softirq(HRTIMER_SOFTIRQ);
 	}
 
@@ -2012,8 +2183,7 @@ void hrtimer_run_queues(void)
  */
 static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
 {
-	struct hrtimer_sleeper *t =
-		container_of(timer, struct hrtimer_sleeper, timer);
+	struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer);
 	struct task_struct *task = t->task;
 
 	t->task = NULL;
@@ -2031,8 +2201,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
  * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
  * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
  */
-void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
-				   enum hrtimer_mode mode)
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode)
 {
 	/*
 	 * Make the enqueue delivery mode check work on RT. If the sleeper
@@ -2048,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
 }
 EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
 
-static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
-				    clockid_t clock_id, enum hrtimer_mode mode)
+static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+				    enum hrtimer_mode mode)
 {
 	/*
 	 * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
@@ -2085,8 +2254,8 @@ static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
  * @clock_id:	the clock to be used
  * @mode:	timer mode abs/rel
  */
-void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
-				    clockid_t clock_id, enum hrtimer_mode mode)
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
+				    enum hrtimer_mode mode)
 {
 	debug_setup_on_stack(&sl->timer, clock_id, mode);
 	__hrtimer_setup_sleeper(sl, clock_id, mode);
@@ -2159,12 +2328,11 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 	return ret;
 }
 
-long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
-		       const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
 	struct hrtimer_sleeper t;
-	int ret = 0;
+	int ret;
 
 	hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
 	hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
@@ -2203,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
 	current->restart_block.fn = do_no_restart_syscall;
 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
-	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
-				 CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
 #endif
@@ -2212,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
 #ifdef CONFIG_COMPAT_32BIT_TIME
 
 SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
-		       struct old_timespec32 __user *, rmtp)
+		struct old_timespec32 __user *, rmtp)
 {
 	struct timespec64 tu;
 
@@ -2225,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
 	current->restart_block.fn = do_no_restart_syscall;
 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
 	current->restart_block.nanosleep.compat_rmtp = rmtp;
-	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
-				 CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 #endif
 
@@ -2236,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
 int hrtimers_prepare_cpu(unsigned int cpu)
 {
 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
-	int i;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
 
 		clock_b->cpu_base = cpu_base;
 		seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
-		timerqueue_init_head(&clock_b->active);
+		timerqueue_linked_init_head(&clock_b->active);
 	}
 
 	cpu_base->cpu = cpu;
@@ -2257,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu)
 
 	/* Clear out any left over state from a CPU down operation */
 	cpu_base->active_bases = 0;
-	cpu_base->hres_active = 0;
-	cpu_base->hang_detected = 0;
+	cpu_base->hres_active = false;
+	cpu_base->hang_detected = false;
 	cpu_base->next_timer = NULL;
 	cpu_base->softirq_next_timer = NULL;
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
-	cpu_base->online = 1;
+	cpu_base->softirq_activated = false;
+	cpu_base->online = true;
 	return 0;
 }
 
@@ -2272,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu)
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				struct hrtimer_clock_base *new_base)
 {
+	struct timerqueue_linked_node *node;
 	struct hrtimer *timer;
-	struct timerqueue_node *node;
 
-	while ((node = timerqueue_getnext(&old_base->active))) {
-		timer = container_of(node, struct hrtimer, node);
+	while ((node = timerqueue_linked_first(&old_base->active))) {
+		timer = hrtimer_from_timerqueue_node(node);
 		BUG_ON(hrtimer_callback_running(timer));
-		debug_deactivate(timer);
+		debug_hrtimer_deactivate(timer);
 
 		/*
 		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -2295,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * sort out already expired timers and reprogram the
 		 * event device.
 		 */
-		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
 	}
 }
 
 int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
-	int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+	int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
 	struct hrtimer_cpu_base *old_base, *new_base;
 
 	old_base = this_cpu_ptr(&hrtimer_bases);
@@ -2314,16 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 	raw_spin_lock(&old_base->lock);
 	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
-	}
+	for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+		migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);
 
 	/* Tell the other CPU to retrigger the next event */
 	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
 
 	raw_spin_unlock(&new_base->lock);
-	old_base->online = 0;
+	old_base->online = false;
 	raw_spin_unlock(&old_base->lock);
 
 	return 0;
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 9daf8c5d9687..1c954f330dfe 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs)
 static struct clocksource clocksource_jiffies = {
 	.name			= "jiffies",
 	.rating			= 1, /* lowest valid rating*/
-	.uncertainty_margin	= 32 * NSEC_PER_MSEC,
 	.read			= jiffies_read,
 	.mask			= CLOCKSOURCE_MASK(32),
 	.mult			= TICK_NSEC << JIFFIES_SHIFT, /* details above */
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 652744e00eb4..4bca3f78c8ea 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -18,8 +18,9 @@
 #include <linux/cred.h>
 #include <linux/err.h>
 #include <linux/mm.h>
+#include <linux/cleanup.h>
 
-#include <vdso/datapage.h>
+#include "namespace_internal.h"
 
 ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
 				struct timens_offsets *ns_offsets)
@@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
 	if (!ns)
 		goto fail_dec;
 
-	ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!ns->vvar_page)
+	err = timens_vdso_alloc_vvar_page(ns);
+	if (err)
 		goto fail_free;
 
 	err = ns_common_init(ns);
@@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
 	return ns;
 
 fail_free_page:
-	__free_page(ns->vvar_page);
+	timens_vdso_free_vvar_page(ns);
 fail_free:
 	kfree(ns);
 fail_dec:
@@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags,
 	return clone_time_ns(user_ns, old_ns);
 }
 
-static struct timens_offset offset_from_ts(struct timespec64 off)
-{
-	struct timens_offset ret;
-
-	ret.sec = off.tv_sec;
-	ret.nsec = off.tv_nsec;
-
-	return ret;
-}
-
-/*
- * A time namespace VVAR page has the same layout as the VVAR page which
- * contains the system wide VDSO data.
- *
- * For a normal task the VVAR pages are installed in the normal ordering:
- *     VVAR
- *     PVCLOCK
- *     HVCLOCK
- *     TIMENS   <- Not really required
- *
- * Now for a timens task the pages are installed in the following order:
- *     TIMENS
- *     PVCLOCK
- *     HVCLOCK
- *     VVAR
- *
- * The check for vdso_clock->clock_mode is in the unlikely path of
- * the seq begin magic. So for the non-timens case most of the time
- * 'seq' is even, so the branch is not taken.
- *
- * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
- * update to finish and for 'seq' to become even anyway.
- *
- * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
- * enforces the time namespace handling path.
- */
-static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
-					 struct time_namespace *ns)
-{
-	struct timens_offset *offset = vc->offset;
-	struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
-	struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
-
-	vc->seq				= 1;
-	vc->clock_mode			= VDSO_CLOCKMODE_TIMENS;
-	offset[CLOCK_MONOTONIC]		= monotonic;
-	offset[CLOCK_MONOTONIC_RAW]	= monotonic;
-	offset[CLOCK_MONOTONIC_COARSE]	= monotonic;
-	offset[CLOCK_BOOTTIME]		= boottime;
-	offset[CLOCK_BOOTTIME_ALARM]	= boottime;
-}
-
-struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
-	if (likely(vma->vm_mm == current->mm))
-		return current->nsproxy->time_ns->vvar_page;
-
-	/*
-	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
-	 * through interfaces like /proc/$pid/mem or
-	 * process_vm_{readv,writev}() as long as there's no .access()
-	 * in special_mapping_vmops().
-	 * For more details check_vma_flags() and __access_remote_vm()
-	 */
-
-	WARN(1, "vvar_page accessed remotely");
-
-	return NULL;
-}
-
-/*
- * Protects possibly multiple offsets writers racing each other
- * and tasks entering the namespace.
- */
-static DEFINE_MUTEX(offset_lock);
-
-static void timens_set_vvar_page(struct task_struct *task,
-				struct time_namespace *ns)
-{
-	struct vdso_time_data *vdata;
-	struct vdso_clock *vc;
-	unsigned int i;
-
-	if (ns == &init_time_ns)
-		return;
-
-	/* Fast-path, taken by every task in namespace except the first. */
-	if (likely(ns->frozen_offsets))
-		return;
-
-	mutex_lock(&offset_lock);
-	/* Nothing to-do: vvar_page has been already initialized. */
-	if (ns->frozen_offsets)
-		goto out;
-
-	ns->frozen_offsets = true;
-	vdata = page_address(ns->vvar_page);
-	vc = vdata->clock_data;
-
-	for (i = 0; i < CS_BASES; i++)
-		timens_setup_vdso_clock_data(&vc[i], ns);
-
-	if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
-		for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
-			timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
-	}
-
-out:
-	mutex_unlock(&offset_lock);
-}
+DEFINE_MUTEX(timens_offset_lock);
 
 void free_time_ns(struct time_namespace *ns)
 {
@@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns)
 	dec_time_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_common_free(ns);
-	__free_page(ns->vvar_page);
+	timens_vdso_free_vvar_page(ns);
 	/* Concurrent nstree traversal depends on a grace period. */
 	kfree_rcu(ns, ns.ns_rcu);
 }
 
 static struct ns_common *timens_get(struct task_struct *task)
 {
-	struct time_namespace *ns = NULL;
+	struct time_namespace *ns;
 	struct nsproxy *nsproxy;
 
-	task_lock(task);
+	guard(task_lock)(task);
 	nsproxy = task->nsproxy;
-	if (nsproxy) {
-		ns = nsproxy->time_ns;
-		get_time_ns(ns);
-	}
-	task_unlock(task);
+	if (!nsproxy)
+		return NULL;
 
-	return ns ? &ns->ns : NULL;
+	ns = nsproxy->time_ns;
+	get_time_ns(ns);
+	return &ns->ns;
 }
 
 static struct ns_common *timens_for_children_get(struct task_struct *task)
 {
-	struct time_namespace *ns = NULL;
+	struct time_namespace *ns;
 	struct nsproxy *nsproxy;
 
-	task_lock(task);
+	guard(task_lock)(task);
 	nsproxy = task->nsproxy;
-	if (nsproxy) {
-		ns = nsproxy->time_ns_for_children;
-		get_time_ns(ns);
-	}
-	task_unlock(task);
+	if (!nsproxy)
+		return NULL;
 
-	return ns ? &ns->ns : NULL;
+	ns = nsproxy->time_ns_for_children;
+	get_time_ns(ns);
+	return &ns->ns;
 }
 
 static void timens_put(struct ns_common *ns)
@@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns)
 	put_time_ns(to_time_ns(ns));
 }
 
-void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
-{
-	timens_set_vvar_page(tsk, ns);
-	vdso_join_timens(tsk, ns);
-}
-
 static int timens_install(struct nsset *nsset, struct ns_common *new)
 {
 	struct nsproxy *nsproxy = nsset->nsproxy;
@@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
 
 void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
 {
-	struct ns_common *ns;
-	struct time_namespace *time_ns;
+	struct time_namespace *time_ns __free(time_ns) = NULL;
+	struct ns_common *ns = timens_for_children_get(p);
 
-	ns = timens_for_children_get(p);
 	if (!ns)
 		return;
+
 	time_ns = to_time_ns(ns);
 
 	show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
 	show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
-	put_time_ns(time_ns);
 }
 
 int proc_timens_set_offset(struct file *file, struct task_struct *p,
 			   struct proc_timens_offset *offsets, int noffsets)
 {
-	struct ns_common *ns;
-	struct time_namespace *time_ns;
+	struct time_namespace *time_ns __free(time_ns) = NULL;
+	struct ns_common *ns = timens_for_children_get(p);
 	struct timespec64 tp;
-	int i, err;
+	int i;
 
-	ns = timens_for_children_get(p);
 	if (!ns)
 		return -ESRCH;
+
 	time_ns = to_time_ns(ns);
 
-	if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
-		put_time_ns(time_ns);
+	if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME))
 		return -EPERM;
-	}
 
 	for (i = 0; i < noffsets; i++) {
 		struct proc_timens_offset *off = &offsets[i];
@@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
 			ktime_get_boottime_ts64(&tp);
 			break;
 		default:
-			err = -EINVAL;
-			goto out;
+			return -EINVAL;
 		}
 
-		err = -ERANGE;
-
 		if (off->val.tv_sec > KTIME_SEC_MAX ||
 		    off->val.tv_sec < -KTIME_SEC_MAX)
-			goto out;
+			return -ERANGE;
 
 		tp = timespec64_add(tp, off->val);
 		/*
@@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
 		 * still unreachable.
 		 */
 		if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
-			goto out;
+			return -ERANGE;
 	}
 
-	mutex_lock(&offset_lock);
-	if (time_ns->frozen_offsets) {
-		err = -EACCES;
-		goto out_unlock;
-	}
+	guard(mutex)(&timens_offset_lock);
+	if (time_ns->frozen_offsets)
+		return -EACCES;
 
-	err = 0;
 	/* Don't report errors after this line */
 	for (i = 0; i < noffsets; i++) {
 		struct proc_timens_offset *off = &offsets[i];
@@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
 		*offset = off->val;
 	}
 
-out_unlock:
-	mutex_unlock(&offset_lock);
-out:
-	put_time_ns(time_ns);
-
-	return err;
+	return 0;
 }
 
 const struct proc_ns_operations timens_operations = {
diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h
new file mode 100644
index 000000000000..b37ba179f43b
--- /dev/null
+++ b/kernel/time/namespace_internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TIME_NAMESPACE_INTERNAL_H
+#define _TIME_NAMESPACE_INTERNAL_H
+
+#include <linux/mutex.h>
+
+struct time_namespace;
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+extern struct mutex timens_offset_lock;
+
+#ifdef CONFIG_TIME_NS_VDSO
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns);
+void timens_vdso_free_vvar_page(struct time_namespace *ns);
+#else /* !CONFIG_TIME_NS_VDSO */
+static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+	return 0;
+}
+static inline void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+}
+#endif /* CONFIG_TIME_NS_VDSO */
+
+#endif /* _TIME_NAMESPACE_INTERNAL_H */
diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c
new file mode 100644
index 000000000000..0d74d160eec9
--- /dev/null
+++ b/kernel/time/namespace_vdso.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/mm.h>
+#include <linux/time_namespace.h>
+#include <linux/time.h>
+#include <linux/vdso_datastore.h>
+
+#include <vdso/clocksource.h>
+#include <vdso/datapage.h>
+
+#include "namespace_internal.h"
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+	struct timens_offset ret;
+
+	ret.sec = off.tv_sec;
+	ret.nsec = off.tv_nsec;
+
+	return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ *     VVAR
+ *     PVCLOCK
+ *     HVCLOCK
+ *     TIMENS   <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ *     TIMENS
+ *     PVCLOCK
+ *     HVCLOCK
+ *     VVAR
+ *
+ * The check for vdso_clock->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * enforces the time namespace handling path.
+ */
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+					 struct time_namespace *ns)
+{
+	struct timens_offset *offset = vc->offset;
+	struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+	struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+	vc->seq				= 1;
+	vc->clock_mode			= VDSO_CLOCKMODE_TIMENS;
+	offset[CLOCK_MONOTONIC]		= monotonic;
+	offset[CLOCK_MONOTONIC_RAW]	= monotonic;
+	offset[CLOCK_MONOTONIC_COARSE]	= monotonic;
+	offset[CLOCK_BOOTTIME]		= boottime;
+	offset[CLOCK_BOOTTIME_ALARM]	= boottime;
+}
+
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_mm == current->mm))
+		return current->nsproxy->time_ns->vvar_page;
+
+	/*
+	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
+	 * through interfaces like /proc/$pid/mem or
+	 * process_vm_{readv,writev}() as long as there's no .access()
+	 * in special_mapping_vmops().
+	 * For more details check_vma_flags() and __access_remote_vm()
+	 */
+
+	WARN(1, "vvar_page accessed remotely");
+
+	return NULL;
+}
+
+static void timens_set_vvar_page(struct task_struct *task,
+				struct time_namespace *ns)
+{
+	struct vdso_time_data *vdata;
+	struct vdso_clock *vc;
+	unsigned int i;
+
+	if (ns == &init_time_ns)
+		return;
+
+	/* Fast-path, taken by every task in namespace except the first. */
+	if (likely(ns->frozen_offsets))
+		return;
+
+	guard(mutex)(&timens_offset_lock);
+	/* Nothing to-do: vvar_page has been already initialized. */
+	if (ns->frozen_offsets)
+		return;
+
+	ns->frozen_offsets = true;
+	vdata = page_address(ns->vvar_page);
+	vc = vdata->clock_data;
+
+	for (i = 0; i < CS_BASES; i++)
+		timens_setup_vdso_clock_data(&vc[i], ns);
+
+	if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+		for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+			timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+	}
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_clock_data() for details.
+ */
+static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+	VMA_ITERATOR(vmi, mm, 0);
+
+	guard(mmap_read_lock)(mm);
+	for_each_vma(vmi, vma) {
+		if (vma_is_special_mapping(vma, &vdso_vvar_mapping))
+			zap_vma(vma);
+	}
+	return 0;
+}
+
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+	timens_set_vvar_page(tsk, ns);
+	vdso_join_timens(tsk, ns);
+}
+
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+	ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!ns->vvar_page)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+	__free_page(ns->vvar_page);
+}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 413e2389f0a5..9331e1614124 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1092,7 +1092,7 @@ void exit_itimers(struct task_struct *tsk)
 	}
 
 	/*
-	 * There should be no timers on the ignored list. itimer_delete() has
+	 * There should be no timers on the ignored list. posix_timer_delete() has
 	 * mopped them up.
 	 */
 	if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers)))
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a88b72b0f35e..51f6a1032c83 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = {
 	.set_state_shutdown	= bc_shutdown,
 	.set_next_ktime		= bc_set_next,
 	.features		= CLOCK_EVT_FEAT_ONESHOT |
-				  CLOCK_EVT_FEAT_KTIME |
 				  CLOCK_EVT_FEAT_HRTIMER,
 	.rating			= 0,
 	.bound_on		= -1,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f63c65881364..115e0bf01276 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu)
  */
 static void tick_broadcast_start_periodic(struct clock_event_device *bc)
 {
-	if (bc)
+	if (bc) {
+		bc->next_event_forced = 0;
 		tick_setup_periodic(bc, 1);
+	}
 }
 
 /*
@@ -106,6 +108,7 @@ static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
 
 static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
 {
+	wd->next_event_forced = 0;
 	/*
 	 * If we woke up early and the tick was reprogrammed in the
 	 * meantime then this may be spurious but harmless.
@@ -403,6 +406,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
+	tick_broadcast_device.evtdev->next_event_forced = 0;
 
 	/* Handle spurious interrupts gracefully */
 	if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
@@ -696,6 +700,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 
 	raw_spin_lock(&tick_broadcast_lock);
 	dev->next_event = KTIME_MAX;
+	tick_broadcast_device.evtdev->next_event_forced = 0;
 	next_event = KTIME_MAX;
 	cpumask_clear(tmpmask);
 	now = ktime_get();
@@ -1063,6 +1068,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
 
 
 	bc->event_handler = tick_handle_oneshot_broadcast;
+	bc->next_event_forced = 0;
 	bc->next_event = KTIME_MAX;
 
 	/*
@@ -1175,6 +1181,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
 		}
 
 		/* This moves the broadcast assignment to this CPU: */
+		bc->next_event_forced = 0;
 		clockevents_program_event(bc, bc->next_event, 1);
 	}
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d305d8521896..6a9198a4279b 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
 	int cpu = smp_processor_id();
 	ktime_t next = dev->next_event;
 
+	dev->next_event_forced = 0;
 	tick_periodic(cpu);
 
 	/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f7907fadd63f..cbbb87a0c6e7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -345,7 +345,7 @@ static bool check_tick_dependency(atomic_t *dep)
 	int val = atomic_read(dep);
 
 	if (likely(!tracepoint_enabled(tick_stop)))
-		return !val;
+		return !!val;
 
 	if (val & TICK_DEP_MASK_POSIX_TIMER) {
 		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
@@ -864,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
+/* Simplified variant of hrtimer_forward_now() */
+static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
+{
+	ktime_t delta = now - expires;
+
+	if (likely(delta < TICK_NSEC))
+		return expires + TICK_NSEC;
+
+	expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC);
+	if (expires > now)
+		return expires;
+	return expires + TICK_NSEC;
+}
+
 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 {
-	hrtimer_cancel(&ts->sched_timer);
-	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+	ktime_t expires = ts->last_tick;
 
-	/* Forward the time to expire in the future */
-	hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+	if (now >= expires)
+		expires = tick_forward_now(expires, now);
 
 	if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
-		hrtimer_start_expires(&ts->sched_timer,
-				      HRTIMER_MODE_ABS_PINNED_HARD);
+		hrtimer_start(&ts->sched_timer,	expires, HRTIMER_MODE_ABS_PINNED_HARD);
 	} else {
-		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+		hrtimer_set_expires(&ts->sched_timer, expires);
+		tick_program_event(expires, 1);
 	}
 
 	/*
@@ -1513,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev)
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
 	dev->next_event = KTIME_MAX;
+	dev->next_event_forced = 0;
 
 	if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
 		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 0d832317d576..771cef87ad3b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
 		    get_user(new_ts.tv_nsec, &tv->tv_usec))
 			return -EFAULT;
 
-		if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
+		if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
 			return -EINVAL;
 
 		new_ts.tv_nsec *= NSEC_PER_USEC;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c07e562ee4c1..c493a4010305 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -3,34 +3,30 @@
  *  Kernel timekeeping code and accessor functions. Based on code from
  *  timer.c, moved in commit 8524070b7982.
  */
-#include <linux/timekeeper_internal.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
+#include <linux/audit.h>
+#include <linux/clocksource.h>
+#include <linux/compiler.h>
+#include <linux/jiffies.h>
 #include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/mm.h>
+#include <linux/module.h>
 #include <linux/nmi.h>
-#include <linux/sched.h>
-#include <linux/sched/loadavg.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/random.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/loadavg.h>
+#include <linux/static_key.h>
+#include <linux/stop_machine.h>
 #include <linux/syscore_ops.h>
-#include <linux/clocksource.h>
-#include <linux/jiffies.h>
+#include <linux/tick.h>
 #include <linux/time.h>
 #include <linux/timex.h>
-#include <linux/tick.h>
-#include <linux/stop_machine.h>
-#include <linux/pvclock_gtod.h>
-#include <linux/compiler.h>
-#include <linux/audit.h>
-#include <linux/random.h>
+#include <linux/timekeeper_internal.h>
 
 #include <vdso/auxclock.h>
 
 #include "tick-internal.h"
-#include "ntp_internal.h"
 #include "timekeeping_internal.h"
+#include "ntp_internal.h"
 
 #define TK_CLEAR_NTP		(1 << 0)
 #define TK_CLOCK_WAS_SET	(1 << 1)
@@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
 }
 
+#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+#include <asm/clock_inlined.h>
+
+static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined);
+
 /*
  * tk_clock_read - atomic clocksource read() helper
  *
@@ -288,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
  * a read of the fast-timekeeper tkrs (which is protected by its own locking
  * and update logic).
  */
-static inline u64 tk_clock_read(const struct tk_read_base *tkr)
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
+{
+	struct clocksource *clock = READ_ONCE(tkr->clock);
+
+	if (static_branch_likely(&clocksource_read_inlined))
+		return arch_inlined_clocksource_read(clock);
+
+	return clock->read(clock);
+}
+
+static inline void clocksource_disable_inline_read(void)
+{
+	static_branch_disable(&clocksource_read_inlined);
+}
+
+static inline void clocksource_enable_inline_read(void)
+{
+	static_branch_enable(&clocksource_read_inlined);
+}
+#else
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
 {
 	struct clocksource *clock = READ_ONCE(tkr->clock);
 
 	return clock->read(clock);
 }
+static inline void clocksource_disable_inline_read(void) { }
+static inline void clocksource_enable_inline_read(void) { }
+#endif
 
 /**
  * tk_setup_internals - Set up internals to use clocksource clock.
@@ -367,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	tk->tkr_raw.mult = clock->mult;
 	tk->ntp_err_mult = 0;
 	tk->skip_second_overflow = 0;
+
+	tk->cs_id = clock->id;
+
+	/* Coupled clockevent data */
+	if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) &&
+	    clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) {
+		/*
+		 * Aim for an one hour maximum delta and use KHz to handle
+		 * clocksources with a frequency above 4GHz correctly as
+		 * the frequency argument of clocks_calc_mult_shift() is u32.
+		 */
+		clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift,
+				       NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000);
+		/*
+		 * Initialize the conversion limit as the previous clocksource
+		 * might have the same shift/mult pair so the quick check in
+		 * tk_update_ns_to_cyc() fails to update it after a clocksource
+		 * change leaving it effectivly zero.
+		 */
+		tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult);
+	}
 }
 
 /* Timekeeper helper functions. */
@@ -375,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
 	return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
 }
 
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
 {
 	/* Calculate the delta since the last update_wall_time() */
 	u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
@@ -696,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
 	tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
 }
 
+static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc)
+{
+	struct tk_read_base *tkrs = &tks->tkr_mono;
+	struct tk_read_base *tkrc = &tkc->tkr_mono;
+	unsigned int shift;
+
+	if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) ||
+	    !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+		return;
+
+	if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift)
+		return;
+	/*
+	 * The conversion math is simple:
+	 *
+	 *      CS::MULT       (1 << NS_TO_CYC_SHIFT)
+	 *   --------------- = ----------------------
+	 *   (1 << CS:SHIFT)       NS_TO_CYC_MULT
+	 *
+	 * Ergo:
+	 *
+	 *   NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT
+	 *
+	 * NS_TO_CYC_SHIFT has been set up in tk_setup_internals()
+	 */
+	shift = tkrs->shift + tks->cs_ns_to_cyc_shift;
+	tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult);
+	tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult);
+}
+
 /*
  * Restore the shadow timekeeper from the real timekeeper.
  */
@@ -730,6 +805,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
 	tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
 
 	if (tk->id == TIMEKEEPER_CORE) {
+		tk_update_ns_to_cyc(tk, &tkd->timekeeper);
 		update_vsyscall(tk);
 		update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
 
@@ -784,6 +860,71 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 	tk_update_coarse_nsecs(tk);
 }
 
+/*
+ * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles
+ * @id:		Clocksource ID which is required for validity
+ * @expires_ns:	Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted
+ * @cycles:	Pointer to storage for corresponding absolute cycles value
+ *
+ * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value
+ * based on the correlated clocksource of the clockevent device by using
+ * the base nanoseconds and cycles values of the last timekeeper update and
+ * converting the delta between @expires_ns and base nanoseconds to cycles.
+ *
+ * This only works for clockevent devices which are using a less than or
+ * equal comparator against the clocksource.
+ *
+ * Utilizing this avoids two clocksource reads for such devices, the
+ * ktime_get() in clockevents_program_event() to calculate the delta expiry
+ * value and the readout in the device::set_next_event() callback to
+ * convert the delta back to a absolute comparator value.
+ *
+ * Returns: True if @id matches the current clocksource ID, false otherwise
+ */
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	struct tk_read_base *tkrm = &tk->tkr_mono;
+	ktime_t base_ns, delta_ns, max_ns;
+	u64 base_cycles, delta_cycles;
+	unsigned int seq;
+	u32 mult, shift;
+
+	/*
+	 * Racy check to avoid the seqcount overhead when ID does not match. If
+	 * the relevant clocksource is installed concurrently, then this will
+	 * just delay the switch over to this mechanism until the next event is
+	 * programmed. If the ID is not matching the clock events code will use
+	 * the regular relative set_next_event() callback as before.
+	 */
+	if (data_race(tk->cs_id) != id)
+		return false;
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+
+		if (tk->cs_id != id)
+			return false;
+
+		base_cycles = tkrm->cycle_last;
+		base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift);
+
+		mult = tk->cs_ns_to_cyc_mult;
+		shift = tk->cs_ns_to_cyc_shift;
+		max_ns = tk->cs_ns_to_cyc_maxns;
+
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	/* Prevent negative deltas and multiplication overflows */
+	delta_ns = min(expires_ns - base_ns, max_ns);
+	delta_ns = max(delta_ns, 0);
+
+	/* Convert to cycles */
+	delta_cycles = ((u64)delta_ns * mult) >> shift;
+	*cycles = base_cycles + delta_cycles;
+	return true;
+}
+
 /**
  * ktime_get_real_ts64 - Returns the time of day in a timespec64.
  * @ts:		pointer to the timespec to be set
@@ -848,7 +989,7 @@ u32 ktime_get_resolution_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
 
-static ktime_t *offsets[TK_OFFS_MAX] = {
+static const ktime_t *const offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
 	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
 	[TK_OFFS_TAI]	= &tk_core.timekeeper.offs_tai,
@@ -857,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = {
 ktime_t ktime_get_with_offset(enum tk_offsets offs)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
+	const ktime_t *offset = offsets[offs];
 	unsigned int seq;
-	ktime_t base, *offset = offsets[offs];
+	ktime_t base;
 	u64 nsecs;
 
 	WARN_ON(timekeeping_suspended);
@@ -878,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
 ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	ktime_t base, *offset = offsets[offs];
+	const ktime_t *offset = offsets[offs];
 	unsigned int seq;
+	ktime_t base;
 	u64 nsecs;
 
 	WARN_ON(timekeeping_suspended);
@@ -902,7 +1045,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
  */
 ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
 {
-	ktime_t *offset = offsets[offs];
+	const ktime_t *offset = offsets[offs];
 	unsigned int seq;
 	ktime_t tconv;
 
@@ -1631,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock)
 
 	if (tk->tkr_mono.clock == clock)
 		return 0;
+
+	/* Disable inlined reads accross the clocksource switch */
+	clocksource_disable_inline_read();
+
 	stop_machine(change_clocksource, clock, NULL);
+
+	/*
+	 * If the clocksource has been selected and supports inlined reads
+	 * enable the branch.
+	 */
+	if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ)
+		clocksource_enable_inline_read();
+
 	tick_clock_notify();
 	return tk->tkr_mono.clock == clock ? 0 : -1;
 }
@@ -2834,7 +2989,7 @@ static void tk_aux_update_clocksource(void)
 			continue;
 
 		timekeeping_forward_now(tks);
-		tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+		tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock);
 		timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
 	}
 }
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 543beba096c7..198d0608db74 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
 					    ktime_t *offs_boot,
 					    ktime_t *offs_tai);
 
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles);
+
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
 extern void timekeeping_warp_clock(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7e1e3bde6b8b..04d928c21aba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2319,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
  */
 void timer_clear_idle(void)
 {
+	int this_cpu = smp_processor_id();
 	/*
 	 * We do this unlocked. The worst outcome is a remote pinned timer
 	 * enqueue sending a pointless IPI, but taking the lock would just
@@ -2327,9 +2328,9 @@ void timer_clear_idle(void)
 	 * path. Required for BASE_LOCAL only.
 	 */
 	__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
-	if (tick_nohz_full_cpu(smp_processor_id()))
+	if (tick_nohz_full_cpu(this_cpu))
 		__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
-	trace_timer_base_idle(false, smp_processor_id());
+	trace_timer_base_idle(false, this_cpu);
 
 	/* Activate without holding the timer_base->lock */
 	tmigr_cpu_activate();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 488e47e96e93..427d7ddea3af 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
 	    int idx, u64 now)
 {
 	SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function));
-	SEQ_printf(m, ", S:%02x", timer->state);
+	SEQ_printf(m, ", S:%02x", timer->is_queued);
 	SEQ_printf(m, "\n");
 	SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
 		(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
@@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
 		(long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
 }
 
-static void
-print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
-		    u64 now)
+static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
+	struct timerqueue_linked_node *curr;
 	struct hrtimer *timer, tmp;
 	unsigned long next = 0, i;
-	struct timerqueue_node *curr;
 	unsigned long flags;
 
 next_one:
@@ -72,13 +70,13 @@ next_one:
 
 	raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
 
-	curr = timerqueue_getnext(&base->active);
+	curr = timerqueue_linked_first(&base->active);
 	/*
 	 * Crude but we have to do this O(N*N) thing, because
 	 * we have to unlock the base when printing:
 	 */
 	while (curr && i < next) {
-		curr = timerqueue_iterate_next(curr);
+		curr = timerqueue_linked_next(curr);
 		i++;
 	}
 
@@ -103,8 +101,8 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 
 	SEQ_printf(m, "  .resolution: %u nsecs\n", hrtimer_resolution);
 #ifdef CONFIG_HIGH_RES_TIMERS
-	SEQ_printf(m, "  .offset:     %Lu nsecs\n",
-		   (unsigned long long) ktime_to_ns(base->offset));
+	SEQ_printf(m, "  .offset:     %Ld nsecs\n",
+		   (long long) base->offset);
 #endif
 	SEQ_printf(m,   "active timers:\n");
 	print_active_timers(m, base, now + ktime_to_ns(base->offset));
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 155eeaea4113..52c15affdbff 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -978,8 +978,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
 	/* Drop the lock to allow the remote CPU to exit idle */
 	raw_spin_unlock_irq(&tmc->lock);
 
-	if (cpu != smp_processor_id())
-		timer_expire_remote(cpu);
+	/*
+	 * This can't exclude the local CPU because jiffies might have advanced
+	 * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the
+	 * point where the jiffies snapshot @jif was taken in tmigr_handle_remote().
+	 */
+	timer_expire_remote(cpu);
 
 	/*
 	 * Lock ordering needs to be preserved - timer_base locks before tmigr
@@ -1860,19 +1864,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
 		 *   child to the new parents. So tmigr_active_up() activates the
 		 *   new parents while walking up from the old root to the new.
 		 *
-		 * * It is ensured that @start is active, as this setup path is
-		 *   executed in hotplug prepare callback. This is executed by an
-		 *   already connected and !idle CPU. Even if all other CPUs go idle,
-		 *   the CPU executing the setup will be responsible up to current top
-		 *   level group. And the next time it goes inactive, it will release
-		 *   the new childmask and parent to subsequent walkers through this
-		 *   @child. Therefore propagate active state unconditionally.
+		 * * It is ensured that @start is active, (or on the way to be activated
+		 *   by another CPU that woke up before the current one) as this setup path
+		 *   is executed in hotplug prepare callback. This is executed by an already
+		 *   connected and !idle CPU in the hierarchy.
+		 *
+		 * * The below RmW atomic operation ensures that:
+		 *
+		 *   1) If the old root has been completely activated, the latest state is
+		 *      acquired (the below implicit acquire pairs with the implicit release
+		 *      from cmpxchg() in tmigr_active_up()).
+		 *
+		 *   2) If the old root is still on the way to be activated, the lagging behind
+		 *      CPU performing the activation will acquire the links up to the new root.
+		 *      (The below implicit release pairs with the implicit acquire from cmpxchg()
+		 *      in tmigr_active_up()).
+		 *
+		 *   3) Every subsequent CPU below the old root will acquire the new links while
+		 *      walking through the old root (The below implicit release pairs with the
+		 *      implicit acquire from cmpxchg() in either tmigr_active_up()) or
+		 *      tmigr_inactive_up().
 		 */
-		state.state = atomic_read(&start->migr_state);
-		WARN_ON_ONCE(!state.active);
+		state.state = atomic_fetch_or(0, &start->migr_state);
 		WARN_ON_ONCE(!start->parent);
-		data.childmask = start->groupmask;
-		__walk_groups_from(tmigr_active_up, &data, start, start->parent);
+		/*
+		 * If the state of the old root is inactive, another CPU is on its way to activate
+		 * it and propagate to the new root.
+		 */
+		if (state.active) {
+			data.childmask = start->groupmask;
+			__walk_groups_from(tmigr_active_up, &data, start, start->parent);
+		}
 	}
 
 	/* Root update */