summary refs log tree commit diff
path: root/kernel/time
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time')
-rw-r--r-- kernel/time/.kunitconfig 2
-rw-r--r-- kernel/time/Kconfig 32
-rw-r--r-- kernel/time/Makefile 1
-rw-r--r-- kernel/time/alarmtimer.c 14
-rw-r--r-- kernel/time/clockevents.c 76
-rw-r--r-- kernel/time/clocksource-wdtest.c 268
-rw-r--r-- kernel/time/clocksource.c 805
-rw-r--r-- kernel/time/hrtimer.c 1128
-rw-r--r-- kernel/time/jiffies.c 1
-rw-r--r-- kernel/time/namespace.c 203
-rw-r--r-- kernel/time/namespace_internal.h 28
-rw-r--r-- kernel/time/namespace_vdso.c 160
-rw-r--r-- kernel/time/posix-timers.c 2
-rw-r--r-- kernel/time/tick-broadcast-hrtimer.c 1
-rw-r--r-- kernel/time/tick-broadcast.c 9
-rw-r--r-- kernel/time/tick-common.c 1
-rw-r--r-- kernel/time/tick-sched.c 30
-rw-r--r-- kernel/time/time.c 2
-rw-r--r-- kernel/time/timekeeping.c 203
-rw-r--r-- kernel/time/timekeeping.h 2
-rw-r--r-- kernel/time/timer.c 5
-rw-r--r-- kernel/time/timer_list.c 16
-rw-r--r-- kernel/time/timer_migration.c 48
23 files changed, 1789 insertions, 1248 deletions
diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig
new file mode 100644
index 000000000000..d60a611b2853
--- /dev/null
+++ b/kernel/time/.kunitconfig
@@ -0,0 +1,2 @@
+CONFIG_KUNIT=y
+CONFIG_TIME_KUNIT_TEST=y
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 7c6a52f7836c..02aac7c5aa76 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -9,14 +9,13 @@
config CLOCKSOURCE_WATCHDOG
bool
-# Architecture has extra clocksource data
-config ARCH_CLOCKSOURCE_DATA
- bool
-
# Architecture has extra clocksource init called from registration
config ARCH_CLOCKSOURCE_INIT
bool
+config ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+ bool
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
config GENERIC_CLOCKEVENTS_MIN_ADJUST
bool
+config GENERIC_CLOCKEVENTS_COUPLED
+ bool
+
+config GENERIC_CLOCKEVENTS_COUPLED_INLINE
+ select GENERIC_CLOCKEVENTS_COUPLED
+ bool
+
# Generic update of CMOS clock
config GENERIC_CMOS_UPDATE
bool
+# Deferred rearming of the hrtimer interrupt
+config HRTIMER_REARM_DEFERRED
+ def_bool y
+ depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
+ depends on HIGH_RES_TIMERS && SCHED_HRTICK
+
# Select to handle posix CPU timers from task_work
# and not from the timer interrupt context
config HAVE_POSIX_CPU_TIMERS_TASK_WORK
@@ -196,18 +208,6 @@ config HIGH_RES_TIMERS
hardware is not capable then this option only increases
the size of the kernel image.
-config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
- int "Clocksource watchdog maximum allowable skew (in microseconds)"
- depends on CLOCKSOURCE_WATCHDOG
- range 50 1000
- default 125
- help
- Specify the maximum amount of allowable watchdog skew in
- microseconds before reporting the clocksource to be unstable.
- The default is based on a half-second clocksource watchdog
- interval and NTP's maximum frequency drift of 500 parts
- per million. If the clocksource is good enough for NTP,
- it is good enough for the clocksource watchdog!
endif
config POSIX_AUX_CLOCKS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f7d52d9543cc..eaf290c972f9 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o
obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 069d93bfb0c7..6e173d70d825 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -234,19 +234,23 @@ static int alarmtimer_suspend(struct device *dev)
if (!rtc)
return 0;
- /* Find the soonest timer to expire*/
+ /* Find the soonest timer to expire */
for (i = 0; i < ALARM_NUMTYPE; i++) {
struct alarm_base *base = &alarm_bases[i];
struct timerqueue_node *next;
+ ktime_t next_expires;
ktime_t delta;
- scoped_guard(spinlock_irqsave, &base->lock)
+ scoped_guard(spinlock_irqsave, &base->lock) {
next = timerqueue_getnext(&base->timerqueue);
+ if (next)
+ next_expires = next->expires;
+ }
if (!next)
continue;
- delta = ktime_sub(next->expires, base->get_ktime());
+ delta = ktime_sub(next_expires, base->get_ktime());
if (!min || (delta < min)) {
- expires = next->expires;
+ expires = next_expires;
min = delta;
type = i;
}
@@ -540,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return alarm_forward(alarm, timr->it_interval, now);
+ return alarm_forward(alarm, now, timr->it_interval);
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index eaae1ce9f060..0014d163f989 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,6 +94,9 @@ static int __clockevents_switch_state(struct clock_event_device *dev,
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
+ /* On state transitions clear the forced flag unconditionally */
+ dev->next_event_forced = 0;
+
/* Transition with new state-specific callbacks */
switch (state) {
case CLOCK_EVT_STATE_DETACHED:
@@ -172,6 +175,7 @@ void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -292,6 +296,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE
+#include <asm/clock_inlined.h>
+#else
+static __always_inline void
+arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { }
+#endif
+
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+ u64 cycles;
+
+ if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED)))
+ return false;
+
+ if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE))
+ arch_inlined_clockevent_set_next_coupled(cycles, dev);
+ else
+ dev->set_next_coupled(cycles, dev);
+ return true;
+}
+
+#else
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+ return false;
+}
+#endif
+
/**
* clockevents_program_event - Reprogram the clock event device.
* @dev: device to program
@@ -300,12 +336,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
*
* Returns 0 on success, -ETIME when the event is in the past.
*/
-int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
- bool force)
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force)
{
- unsigned long long clc;
int64_t delta;
- int rc;
+ u64 cycles;
if (WARN_ON_ONCE(expires < 0))
return -ETIME;
@@ -319,21 +353,37 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
clockevent_get_state(dev));
- /* Shortcut for clockevent devices that can deal with ktime. */
- if (dev->features & CLOCK_EVT_FEAT_KTIME)
+ /* ktime_t based reprogramming for the broadcast hrtimer device */
+ if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER))
return dev->set_next_ktime(expires, dev);
+ if (likely(clockevent_set_next_coupled(dev, expires)))
+ return 0;
+
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
- if (delta <= 0)
- return force ? clockevents_program_min_delta(dev) : -ETIME;
- delta = min(delta, (int64_t) dev->max_delta_ns);
- delta = max(delta, (int64_t) dev->min_delta_ns);
+ /* Required for tick_periodic() during early boot */
+ if (delta <= 0 && !force)
+ return -ETIME;
- clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
- rc = dev->set_next_event((unsigned long) clc, dev);
+ if (delta > (int64_t)dev->min_delta_ns) {
+ delta = min(delta, (int64_t) dev->max_delta_ns);
+ cycles = ((u64)delta * dev->mult) >> dev->shift;
+ if (!dev->set_next_event((unsigned long) cycles, dev)) {
+ dev->next_event_forced = 0;
+ return 0;
+ }
+ }
+
+ if (dev->next_event_forced)
+ return 0;
- return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+ if (dev->set_next_event(dev->min_delta_ticks, dev)) {
+ if (!force || clockevents_program_min_delta(dev))
+ return -ETIME;
+ }
+ dev->next_event_forced = 1;
+ return 0;
}
/*
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index 38dae590b29f..b4cf17b4aeed 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -3,202 +3,196 @@
* Unit test for the clocksource watchdog.
*
* Copyright (C) 2021 Facebook, Inc.
+ * Copyright (C) 2026 Intel Corp.
*
* Author: Paul E. McKenney <paulmck@kernel.org>
+ * Author: Thomas Gleixner <tglx@kernel.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/device.h>
#include <linux/clocksource.h>
-#include <linux/init.h>
+#include <linux/delay.h>
#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/prandom.h>
-#include <linux/cpu.h>
#include "tick-internal.h"
+#include "timekeeping_internal.h"
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clocksource watchdog unit test");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>");
+
+enum wdtest_states {
+ WDTEST_INJECT_NONE,
+ WDTEST_INJECT_DELAY,
+ WDTEST_INJECT_POSITIVE,
+ WDTEST_INJECT_NEGATIVE,
+ WDTEST_INJECT_PERCPU = 0x100,
+};
-static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
-module_param(holdoff, int, 0444);
-MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+static enum wdtest_states wdtest_state;
+static unsigned long wdtest_test_count;
+static ktime_t wdtest_last_ts, wdtest_offset;
-/* Watchdog kthread's task_struct pointer for debug purposes. */
-static struct task_struct *wdtest_task;
+#define SHIFT_4000PPM 8
-static u64 wdtest_jiffies_read(struct clocksource *cs)
+static ktime_t wdtest_get_offset(struct clocksource *cs)
{
- return (u64)jiffies;
-}
-
-static struct clocksource clocksource_wdtest_jiffies = {
- .name = "wdtest-jiffies",
- .rating = 1, /* lowest valid rating*/
- .uncertainty_margin = TICK_NSEC,
- .read = wdtest_jiffies_read,
- .mask = CLOCKSOURCE_MASK(32),
- .flags = CLOCK_SOURCE_MUST_VERIFY,
- .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
- .shift = JIFFIES_SHIFT,
- .max_cycles = 10,
-};
+ if (wdtest_state < WDTEST_INJECT_PERCPU)
+ return wdtest_test_count & 0x1 ? 0 : wdtest_offset >> SHIFT_4000PPM;
-static int wdtest_ktime_read_ndelays;
-static bool wdtest_ktime_read_fuzz;
+ /* Only affect the readout of the "remote" CPU */
+ return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC;
+}
static u64 wdtest_ktime_read(struct clocksource *cs)
{
- int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
- static int sign = 1;
- u64 ret;
+ ktime_t now = ktime_get_raw_fast_ns();
+ ktime_t intv = now - wdtest_last_ts;
- if (wkrn) {
- udelay(cs->uncertainty_margin / 250);
- WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
- }
- ret = ktime_get_real_fast_ns();
- if (READ_ONCE(wdtest_ktime_read_fuzz)) {
- sign = -sign;
- ret = ret + sign * 100 * NSEC_PER_MSEC;
+ /*
+ * Only increment the test counter once per watchdog interval and
+ * store the interval for the offset calculation of this step. This
+ * guarantees a consistent behaviour even if the other side needs
+ * to repeat due to a watchdog read timeout.
+ */
+ if (intv > (NSEC_PER_SEC / 4)) {
+ WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1);
+ wdtest_last_ts = now;
+ wdtest_offset = intv;
}
- return ret;
-}
-static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
-{
- pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+ switch (wdtest_state & ~WDTEST_INJECT_PERCPU) {
+ case WDTEST_INJECT_POSITIVE:
+ return now + wdtest_get_offset(cs);
+ case WDTEST_INJECT_NEGATIVE:
+ return now - wdtest_get_offset(cs);
+ case WDTEST_INJECT_DELAY:
+ udelay(500);
+ return now;
+ default:
+ return now;
+ }
}
-#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
- CLOCK_SOURCE_VALID_FOR_HRES | \
- CLOCK_SOURCE_MUST_VERIFY | \
- CLOCK_SOURCE_VERIFY_PERCPU)
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+ CLOCK_SOURCE_CALIBRATED | \
+ CLOCK_SOURCE_MUST_VERIFY | \
+ CLOCK_SOURCE_WDTEST)
static struct clocksource clocksource_wdtest_ktime = {
.name = "wdtest-ktime",
- .rating = 300,
+ .rating = 10,
.read = wdtest_ktime_read,
.mask = CLOCKSOURCE_MASK(64),
.flags = KTIME_FLAGS,
- .mark_unstable = wdtest_ktime_cs_mark_unstable,
.list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
};
-/* Reset the clocksource if needed. */
-static void wdtest_ktime_clocksource_reset(void)
+static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu)
+{
+ clocksource_unregister(&clocksource_wdtest_ktime);
+
+ pr_info("Test: State %d percpu %d\n", which, percpu);
+
+ wdtest_state = which;
+ if (percpu)
+ wdtest_state |= WDTEST_INJECT_PERCPU;
+ wdtest_test_count = 0;
+ wdtest_last_ts = 0;
+
+ clocksource_wdtest_ktime.rating = 10;
+ clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+ if (percpu)
+ clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU;
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+}
+
+static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect,
+ unsigned long calls)
{
- if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
- clocksource_unregister(&clocksource_wdtest_ktime);
- clocksource_wdtest_ktime.flags = KTIME_FLAGS;
- schedule_timeout_uninterruptible(HZ / 10);
- clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ wdtest_clocksource_reset(which, percpu);
+
+ for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) {
+ unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags);
+
+ if (kthread_should_stop())
+ return false;
+
+ if (flags & CLOCK_SOURCE_UNSTABLE) {
+ if (expect & CLOCK_SOURCE_UNSTABLE)
+ return true;
+ pr_warn("Fail: Unexpected unstable\n");
+ return false;
+ }
+ if (flags & CLOCK_SOURCE_VALID_FOR_HRES) {
+ if (expect & CLOCK_SOURCE_VALID_FOR_HRES)
+ return true;
+ pr_warn("Fail: Unexpected valid for highres\n");
+ return false;
+ }
}
+
+ if (!expect)
+ return true;
+
+ pr_warn("Fail: Timed out\n");
+ return false;
}
-/* Run the specified series of watchdog tests. */
-static int wdtest_func(void *arg)
+static bool wdtest_run(bool percpu)
{
- unsigned long j1, j2;
- int i, max_retries;
- char *s;
+ if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8))
+ return false;
- schedule_timeout_uninterruptible(holdoff * HZ);
+ if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4))
+ return false;
- /*
- * Verify that jiffies-like clocksources get the manually
- * specified uncertainty margin.
- */
- pr_info("--- Verify jiffies-like uncertainty margin.\n");
- __clocksource_register(&clocksource_wdtest_jiffies);
- WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+ if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+ return false;
- j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
- schedule_timeout_uninterruptible(HZ);
- j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
- WARN_ON_ONCE(j1 == j2);
+ if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+ return false;
- clocksource_unregister(&clocksource_wdtest_jiffies);
+ return true;
+}
- /*
- * Verify that tsc-like clocksources are assigned a reasonable
- * uncertainty margin.
- */
- pr_info("--- Verify tsc-like uncertainty margin.\n");
+static int wdtest_func(void *arg)
+{
clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
- WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
-
- j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
- udelay(1);
- j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
- pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
- WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC),
- "Expected at least 1000ns, got %lu.\n", j2 - j1);
-
- /* Verify tsc-like stability with various numbers of errors injected. */
- max_retries = clocksource_get_max_watchdog_retry();
- for (i = 0; i <= max_retries + 1; i++) {
- if (i <= 1 && i < max_retries)
- s = "";
- else if (i <= max_retries)
- s = ", expect message";
- else
- s = ", expect clock skew";
- pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
- WRITE_ONCE(wdtest_ktime_read_ndelays, i);
- schedule_timeout_uninterruptible(2 * HZ);
- WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
- WARN_ON_ONCE((i <= max_retries) !=
- !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
- wdtest_ktime_clocksource_reset();
+ if (wdtest_run(false)) {
+ if (wdtest_run(true))
+ pr_info("Success: All tests passed\n");
}
-
- /* Verify tsc-like stability with clock-value-fuzz error injection. */
- pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
- WRITE_ONCE(wdtest_ktime_read_fuzz, true);
- schedule_timeout_uninterruptible(2 * HZ);
- WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
- clocksource_verify_percpu(&clocksource_wdtest_ktime);
- WRITE_ONCE(wdtest_ktime_read_fuzz, false);
-
clocksource_unregister(&clocksource_wdtest_ktime);
- pr_info("--- Done with test.\n");
- return 0;
-}
+ if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG))
+ return 0;
-static void wdtest_print_module_parms(void)
-{
- pr_alert("--- holdoff=%d\n", holdoff);
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(3600 * HZ);
+ return 0;
}
-/* Cleanup function. */
-static void clocksource_wdtest_cleanup(void)
-{
-}
+static struct task_struct *wdtest_thread;
static int __init clocksource_wdtest_init(void)
{
- int ret = 0;
-
- wdtest_print_module_parms();
+ struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest");
- /* Create watchdog-test task. */
- wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
- if (IS_ERR(wdtest_task)) {
- ret = PTR_ERR(wdtest_task);
- pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
- wdtest_task = NULL;
- return ret;
+ if (IS_ERR(t)) {
+ pr_warn("Failed to create wdtest kthread.\n");
+ return PTR_ERR(t);
}
-
+ wdtest_thread = t;
return 0;
}
-
module_init(clocksource_wdtest_init);
+
+static void clocksource_wdtest_cleanup(void)
+{
+ if (wdtest_thread)
+ kthread_stop(wdtest_thread);
+}
module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index df7194961658..baee13a1f87f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -7,15 +7,17 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/device.h>
#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/module.h>
#include <linux/prandom.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
@@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
-
-/*
- * Threshold: 0.0312s, when doubled: 0.0625s.
- */
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
-
-/*
- * Maximum permissible delay between two readouts of the watchdog
- * clocksource surrounding a read of the clocksource being validated.
- * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
- * a lower bound for cs->uncertainty_margin values when registering clocks.
- *
- * The default of 500 parts per million is based on NTP's limits.
- * If a clocksource is good enough for NTP, it is good enough for us!
- *
- * In other words, by default, even if a clocksource is extremely
- * precise (for example, with a sub-nanosecond period), the maximum
- * permissible skew between the clocksource watchdog and the clocksource
- * under test is not permitted to go below the 500ppm minimum defined
- * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the
- * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
- */
-#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#else
-#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
-#endif
-
-/*
- * Default for maximum permissible skew when cs->uncertainty_margin is
- * not specified, and the lower bound even when cs->uncertainty_margin
- * is specified. This is also the default that is used when registering
- * clocks with unspecified cs->uncertainty_margin, so this macro is used
- * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
- */
-#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
-
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);
@@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
-static int64_t watchdog_max_interval;
+
+/* Watchdog interval: 0.5sec. */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ))
+
+/* Maximum time between two reference watchdog readouts */
+#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC)
+
+/*
+ * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems
+ * the timeout is calculated from the numa distance.
+ */
+#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC)
+
+/*
+ * Remote timeout NUMA distance multiplier. The local distance is 10. The
+ * default remote distance is 20. ACPI tables provide more accurate numbers
+ * which are guaranteed to be greater than the local distance.
+ *
+ * This results in a 5us base value, which is equivalent to the above !NUMA
+ * default.
+ */
+#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE))
+
+/* Limit the NUMA timeout in case the distance values are insanely big */
+#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC))
+
+/* Shift values to calculate the approximate $N ppm of a given delta. */
+#define SHIFT_500PPM 11
+#define SHIFT_4000PPM 8
+
+/* Number of attempts to read the watchdog */
+#define WATCHDOG_FREQ_RETRIES 3
+
+/* Five reads local and remote for inter CPU skew detection */
+#define WATCHDOG_REMOTE_MAX_SEQ 10
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -241,204 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static int verify_n_cpus = 8;
-module_param(verify_n_cpus, int, 0644);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
-enum wd_read_status {
- WD_READ_SUCCESS,
- WD_READ_UNSTABLE,
- WD_READ_SKIP
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+enum wd_result {
+ WD_SUCCESS,
+ WD_FREQ_NO_WATCHDOG,
+ WD_FREQ_TIMEOUT,
+ WD_FREQ_RESET,
+ WD_FREQ_SKEWED,
+ WD_CPU_TIMEOUT,
+ WD_CPU_SKEWED,
+};
+
+struct watchdog_cpu_data {
+ /* Keep first as it is 32 byte aligned */
+ call_single_data_t csd;
+ atomic_t remote_inprogress;
+ enum wd_result result;
+ u64 cpu_ts[2];
+ struct clocksource *cs;
+ /* Ensure that the sequence is in a separate cache line */
+ atomic_t seq ____cacheline_aligned;
+ /* Set by the control CPU according to NUMA distance */
+ u64 timeout_ns;
};
-static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
-{
- int64_t md = watchdog->uncertainty_margin;
- unsigned int nretries, max_retries;
- int64_t wd_delay, wd_seq_delay;
- u64 wd_end, wd_end2;
-
- max_retries = clocksource_get_max_watchdog_retry();
- for (nretries = 0; nretries <= max_retries; nretries++) {
- local_irq_disable();
- *wdnow = watchdog->read(watchdog);
- *csnow = cs->read(cs);
- wd_end = watchdog->read(watchdog);
- wd_end2 = watchdog->read(watchdog);
- local_irq_enable();
-
- wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
- if (wd_delay <= md + cs->uncertainty_margin) {
- if (nretries > 1 && nretries >= max_retries) {
- pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
- smp_processor_id(), watchdog->name, nretries);
+struct watchdog_data {
+ raw_spinlock_t lock;
+ enum wd_result result;
+
+ u64 wd_seq;
+ u64 wd_delta;
+ u64 cs_delta;
+ u64 cpu_ts[2];
+
+ unsigned int curr_cpu;
+} ____cacheline_aligned_in_smp;
+
+static void watchdog_check_skew_remote(void *unused);
+
+static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = {
+ .csd = CSD_INIT(watchdog_check_skew_remote, NULL),
+};
+
+static struct watchdog_data watchdog_data = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock),
+};
+
+static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result)
+{
+ guard(raw_spinlock)(&watchdog_data.lock);
+ if (!wd->result) {
+ atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ);
+ WRITE_ONCE(wd->result, result);
+ }
+}
+
+/* Wait for the sequence number to hand over control. */
+static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq)
+{
+ for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) {
+ /* Bail if the other side set an error result */
+ if (READ_ONCE(wd->result) != WD_SUCCESS)
+ return false;
+
+ /* Prevent endless loops if the other CPU does not react. */
+ if (cnt == 5000) {
+ u64 nsecs = ktime_get_raw_fast_ns();
+
+ if (nsecs - start >=wd->timeout_ns) {
+ watchdog_set_result(wd, WD_CPU_TIMEOUT);
+ return false;
}
- return WD_READ_SUCCESS;
+ cnt = 0;
}
+ cpu_relax();
+ }
+ return seq < WATCHDOG_REMOTE_MAX_SEQ;
+}
- /*
- * Now compute delay in consecutive watchdog read to see if
- * there is too much external interferences that cause
- * significant delay in reading both clocksource and watchdog.
- *
- * If consecutive WD read-back delay > md, report
- * system busy, reinit the watchdog and skip the current
- * watchdog test.
- */
- wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
- if (wd_seq_delay > md)
- goto skip_test;
+static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index)
+{
+ u64 prev, now, delta, start = ktime_get_raw_fast_ns();
+ int local = index, remote = (index + 1) & 0x1;
+ struct clocksource *cs = wd->cs;
+
+ /* Set the local timestamp so that the first iteration works correctly */
+ wd->cpu_ts[local] = cs->read(cs);
+
+ /* Signal arrival */
+ atomic_inc(&wd->seq);
+
+ for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) {
+ if (!watchdog_wait_seq(wd, start, seq))
+ return;
+
+ /* Capture local timestamp before possible non-local coherency overhead */
+ now = cs->read(cs);
+
+ /* Store local timestamp before reading remote to limit coherency stalls */
+ wd->cpu_ts[local] = now;
+
+ prev = wd->cpu_ts[remote];
+ delta = (now - prev) & cs->mask;
+
+ if (delta > cs->max_raw_delta) {
+ watchdog_set_result(wd, WD_CPU_SKEWED);
+ return;
+ }
+
+ /* Hand over to the remote CPU */
+ atomic_inc(&wd->seq);
}
+}
- pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
- smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
- return WD_READ_UNSTABLE;
+static void watchdog_check_skew_remote(void *unused)
+{
+ struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data);
-skip_test:
- pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
- smp_processor_id(), watchdog->name, wd_seq_delay);
- pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
- cs->name, wd_delay);
- return WD_READ_SKIP;
+ atomic_inc(&wd->remote_inprogress);
+ watchdog_check_skew(wd, 1);
+ atomic_dec(&wd->remote_inprogress);
}
-static u64 csnow_mid;
-static cpumask_t cpus_ahead;
-static cpumask_t cpus_behind;
-static cpumask_t cpus_chosen;
+static inline bool wd_csd_locked(struct watchdog_cpu_data *wd)
+{
+ return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK;
+}
+
+/*
+ * This is only invoked for remote CPUs. See watchdog_check_cpu_skew().
+ */
+static inline u64 wd_get_remote_timeout(unsigned int remote_cpu)
+{
+ unsigned int n1, n2;
+ u64 ns;
+
+ if (nr_node_ids == 1)
+ return WATCHDOG_DEFAULT_TIMEOUT_NS;
+
+ n1 = cpu_to_node(smp_processor_id());
+ n2 = cpu_to_node(remote_cpu);
+ ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2);
+ return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS);
+}
-static void clocksource_verify_choose_cpus(void)
+static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu)
{
- int cpu, i, n = verify_n_cpus;
+ struct watchdog_cpu_data *wd;
- if (n < 0 || n >= num_online_cpus()) {
- /* Check all of the CPUs. */
- cpumask_copy(&cpus_chosen, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ wd = per_cpu_ptr(&watchdog_cpu_data, cpu);
+ if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) {
+ watchdog_data.result = WD_CPU_TIMEOUT;
return;
}
- /* If no checking desired, or no other CPU to check, leave. */
- cpumask_clear(&cpus_chosen);
- if (n == 0 || num_online_cpus() <= 1)
+ atomic_set(&wd->seq, 0);
+ wd->result = WD_SUCCESS;
+ wd->cs = cs;
+ /* Store the current CPU ID for the watchdog test unit */
+ cs->wd_cpu = smp_processor_id();
+
+ wd->timeout_ns = wd_get_remote_timeout(cpu);
+
+ /* Kick the remote CPU into the watchdog function */
+ if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) {
+ watchdog_data.result = WD_CPU_TIMEOUT;
+ return;
+ }
+
+ scoped_guard(irq)
+ watchdog_check_skew(wd, 0);
+
+ scoped_guard(raw_spinlock_irq, &watchdog_data.lock) {
+ watchdog_data.result = wd->result;
+ memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts));
+ }
+}
+
+static void watchdog_check_cpu_skew(struct clocksource *cs)
+{
+ unsigned int cpu = watchdog_data.curr_cpu;
+
+ cpu = cpumask_next_wrap(cpu, cpu_online_mask);
+ watchdog_data.curr_cpu = cpu;
+
+ /* Skip the current CPU. Handles num_online_cpus() == 1 as well */
+ if (cpu == smp_processor_id())
return;
- /* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
- if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ /* Don't interfere with the test mechanics */
+ if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU))
return;
- cpumask_set_cpu(cpu, &cpus_chosen);
- /* Force a sane value for the boot parameter. */
- if (n > nr_cpu_ids)
- n = nr_cpu_ids;
+ __watchdog_check_cpu_skew(cs, cpu);
+}
+
+static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending)
+{
+ unsigned int ppm_shift = SHIFT_4000PPM;
+ u64 wd_ts0, wd_ts1, cs_ts;
+
+ watchdog_data.result = WD_SUCCESS;
+ if (!watchdog) {
+ watchdog_data.result = WD_FREQ_NO_WATCHDOG;
+ return false;
+ }
+
+ if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)
+ return true;
/*
- * Randomly select the specified number of CPUs. If the same
- * CPU is selected multiple times, that CPU is checked only once,
- * and no replacement CPU is selected. This gracefully handles
- * situations where verify_n_cpus is greater than the number of
- * CPUs that are currently online.
+ * If both the clocksource and the watchdog claim they are
+ * calibrated, use a 500ppm limit. Uncalibrated clocksources need a
+ * larger allowance because the firmware supplied frequencies can be
+ * way off.
*/
- for (i = 1; i < n; i++) {
- cpu = cpumask_random(cpu_online_mask);
- if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
- cpumask_set_cpu(cpu, &cpus_chosen);
+ if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED)
+ ppm_shift = SHIFT_500PPM;
+
+ for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) {
+ s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta;
+
+ scoped_guard(irq) {
+ wd_ts0 = watchdog->read(watchdog);
+ cs_ts = cs->read(cs);
+ wd_ts1 = watchdog->read(watchdog);
+ }
+
+ wd_last = cs->wd_last;
+ cs_last = cs->cs_last;
+
+ /* Validate the watchdog readout window */
+ wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1);
+ if (wd_seq > WATCHDOG_READOUT_MAX_NS) {
+ /* Store for printout in case all retries fail */
+ watchdog_data.wd_seq = wd_seq;
+ continue;
+ }
+
+ /* Store for subsequent processing */
+ cs->wd_last = wd_ts0;
+ cs->cs_last = cs_ts;
+
+ /* First round or reset pending? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending)
+ goto reset;
+
+ /* Calculate the nanosecond deltas from the last invocation */
+ wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0);
+ cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts);
+
+ watchdog_data.wd_delta = wd_delta;
+ watchdog_data.cs_delta = cs_delta;
+
+ /*
+ * Ensure that the deltas are within the readout limits of
+ * the clocksource and the watchdog. Long delays can cause
+ * clocksources to overflow.
+ */
+ max_delta = max(wd_delta, cs_delta);
+ if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns)
+ goto reset;
+
+ /*
+ * Calculate and validate the skew against the allowed PPM
+ * value of the maximum delta plus the watchdog readout
+ * time.
+ */
+ if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq)
+ return true;
+
+ watchdog_data.result = WD_FREQ_SKEWED;
+ return false;
}
- /* Don't verify ourselves. */
- cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ watchdog_data.result = WD_FREQ_TIMEOUT;
+ return false;
+
+reset:
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ watchdog_data.result = WD_FREQ_RESET;
+ return false;
}
-static void clocksource_verify_one_cpu(void *csin)
+/* Synchronization for sched clock */
+static void clocksource_tick_stable(struct clocksource *cs)
{
- struct clocksource *cs = (struct clocksource *)csin;
-
- csnow_mid = cs->read(cs);
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
}
-void clocksource_verify_percpu(struct clocksource *cs)
+/* Conditionally enable high resolution mode */
+static void clocksource_enable_highres(struct clocksource *cs)
{
- int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
- u64 csnow_begin, csnow_end;
- int cpu, testcpu;
- s64 delta;
+ if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) ||
+ !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) ||
+ !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+ return;
+
+ /* Mark it valid for high-res. */
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
- if (verify_n_cpus == 0)
+ /*
+ * Can't schedule work before finished_booting is
+ * true. clocksource_done_booting will take care of it.
+ */
+ if (!finished_booting)
return;
- cpumask_clear(&cpus_ahead);
- cpumask_clear(&cpus_behind);
- cpus_read_lock();
- migrate_disable();
- clocksource_verify_choose_cpus();
- if (cpumask_empty(&cpus_chosen)) {
- migrate_enable();
- cpus_read_unlock();
- pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
return;
+
+ /*
+ * If this is not the current clocksource let the watchdog thread
+ * reselect it. Due to the change to high res this clocksource
+ * might be preferred now. If it is the current clocksource let the
+ * tick code know about that change.
+ */
+ if (cs != curr_clocksource) {
+ cs->flags |= CLOCK_SOURCE_RESELECT;
+ schedule_work(&watchdog_work);
+ } else {
+ tick_clock_notify();
}
- testcpu = smp_processor_id();
- pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
- cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
- preempt_disable();
- for_each_cpu(cpu, &cpus_chosen) {
- if (cpu == testcpu)
- continue;
- csnow_begin = cs->read(cs);
- smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
- csnow_end = cs->read(cs);
- delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
- if (delta < 0)
- cpumask_set_cpu(cpu, &cpus_behind);
- delta = (csnow_end - csnow_mid) & cs->mask;
- if (delta < 0)
- cpumask_set_cpu(cpu, &cpus_ahead);
- cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
- if (cs_nsec > cs_nsec_max)
- cs_nsec_max = cs_nsec;
- if (cs_nsec < cs_nsec_min)
- cs_nsec_min = cs_nsec;
+}
+
+static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2);
+
+static void watchdog_print_freq_timeout(struct clocksource *cs)
+{
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ pr_info("Watchdog %s read timed out. Readout sequence took: %lluns\n",
+ watchdog->name, watchdog_data.wd_seq);
+}
+
+static void watchdog_print_freq_skew(struct clocksource *cs)
+{
+ pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name);
+ pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta);
+ pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta);
+}
+
+static void watchdog_handle_remote_timeout(struct clocksource *cs)
+{
+ pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu);
+}
+
+static void watchdog_print_remote_skew(struct clocksource *cs)
+{
+ pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name);
+ if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) {
+ pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(),
+ watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]);
+ } else {
+ pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu,
+ watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]);
}
- preempt_enable();
- migrate_enable();
- cpus_read_unlock();
- if (!cpumask_empty(&cpus_ahead))
- pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
- cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
- if (!cpumask_empty(&cpus_behind))
- pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
- cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
-}
-EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+}
-static inline void clocksource_reset_watchdog(void)
+static void watchdog_check_result(struct clocksource *cs)
{
- struct clocksource *cs;
+ switch (watchdog_data.result) {
+ case WD_SUCCESS:
+ clocksource_tick_stable(cs);
+ clocksource_enable_highres(cs);
+ return;
- list_for_each_entry(cs, &watchdog_list, wd_list)
+ case WD_FREQ_TIMEOUT:
+ watchdog_print_freq_timeout(cs);
+ /* Try again later and invalidate the reference timestamps. */
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
+ return;
+ case WD_FREQ_NO_WATCHDOG:
+ case WD_FREQ_RESET:
+ /*
+ * Nothing to do when the reference timestamps were reset
+ * or no watchdog clocksource registered.
+ */
+ return;
+
+ case WD_FREQ_SKEWED:
+ watchdog_print_freq_skew(cs);
+ break;
+
+ case WD_CPU_TIMEOUT:
+ /* Remote check timed out. Try again next cycle. */
+ watchdog_handle_remote_timeout(cs);
+ return;
+
+ case WD_CPU_SKEWED:
+ watchdog_print_remote_skew(cs);
+ break;
+ }
+ __clocksource_unstable(cs);
+}
static void clocksource_watchdog(struct timer_list *unused)
{
- int64_t wd_nsec, cs_nsec, interval;
- u64 csnow, wdnow, cslast, wdlast;
- int next_cpu, reset_pending;
struct clocksource *cs;
- enum wd_read_status read_ret;
- unsigned long extra_wait = 0;
- u32 md;
+ bool reset_pending;
- spin_lock(&watchdog_lock);
+ guard(spinlock)(&watchdog_lock);
if (!watchdog_running)
- goto out;
+ return;
reset_pending = atomic_read(&watchdog_reset_pending);
list_for_each_entry(cs, &watchdog_list, wd_list) {
-
/* Clocksource already marked unstable? */
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
if (finished_booting)
@@ -446,170 +659,40 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
-
- if (read_ret == WD_READ_UNSTABLE) {
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
- continue;
- }
-
- /*
- * When WD_READ_SKIP is returned, it means the system is likely
- * under very heavy load, where the latency of reading
- * watchdog/clocksource is very big, and affect the accuracy of
- * watchdog check. So give system some space and suspend the
- * watchdog check for 5 minutes.
- */
- if (read_ret == WD_READ_SKIP) {
- /*
- * As the watchdog timer will be suspended, and
- * cs->last could keep unchanged for 5 minutes, reset
- * the counters.
- */
- clocksource_reset_watchdog();
- extra_wait = HZ * 300;
- break;
- }
-
- /* Clocksource initialized ? */
- if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
- atomic_read(&watchdog_reset_pending)) {
- cs->flags |= CLOCK_SOURCE_WATCHDOG;
- cs->wd_last = wdnow;
- cs->cs_last = csnow;
- continue;
+ /* Compare against watchdog clocksource if available */
+ if (watchdog_check_freq(cs, reset_pending)) {
+ /* Check for inter CPU skew */
+ watchdog_check_cpu_skew(cs);
}
- wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
- cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
- wdlast = cs->wd_last; /* save these in case we print them */
- cslast = cs->cs_last;
- cs->cs_last = csnow;
- cs->wd_last = wdnow;
-
- if (atomic_read(&watchdog_reset_pending))
- continue;
-
- /*
- * The processing of timer softirqs can get delayed (usually
- * on account of ksoftirqd not getting to run in a timely
- * manner), which causes the watchdog interval to stretch.
- * Skew detection may fail for longer watchdog intervals
- * on account of fixed margins being used.
- * Some clocksources, e.g. acpi_pm, cannot tolerate
- * watchdog intervals longer than a few seconds.
- */
- interval = max(cs_nsec, wd_nsec);
- if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
- if (system_state > SYSTEM_SCHEDULING &&
- interval > 2 * watchdog_max_interval) {
- watchdog_max_interval = interval;
- pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
- cs_nsec, wd_nsec);
- }
- watchdog_timer.expires = jiffies;
- continue;
- }
-
- /* Check the deviation from the watchdog clocksource. */
- md = cs->uncertainty_margin + watchdog->uncertainty_margin;
- if (abs(cs_nsec - wd_nsec) > md) {
- s64 cs_wd_msec;
- s64 wd_msec;
- u32 wd_rem;
-
- pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
- smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, cs_nsec, csnow, cslast, cs->mask);
- cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
- wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
- pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
- cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
- if (curr_clocksource == cs)
- pr_warn(" '%s' is current clocksource.\n", cs->name);
- else if (curr_clocksource)
- pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
- else
- pr_warn(" No current clocksource.\n");
- __clocksource_unstable(cs);
- continue;
- }
-
- if (cs == curr_clocksource && cs->tick_stable)
- cs->tick_stable(cs);
-
- if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
- (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
- (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
- /* Mark it valid for high-res. */
- cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-
- /*
- * clocksource_done_booting() will sort it if
- * finished_booting is not set yet.
- */
- if (!finished_booting)
- continue;
-
- /*
- * If this is not the current clocksource let
- * the watchdog thread reselect it. Due to the
- * change to high res this clocksource might
- * be preferred now. If it is the current
- * clocksource let the tick code know about
- * that change.
- */
- if (cs != curr_clocksource) {
- cs->flags |= CLOCK_SOURCE_RESELECT;
- schedule_work(&watchdog_work);
- } else {
- tick_clock_notify();
- }
- }
+ watchdog_check_result(cs);
}
- /*
- * We only clear the watchdog_reset_pending, when we did a
- * full cycle through all clocksources.
- */
+ /* Clear after the full clocksource walk */
if (reset_pending)
atomic_dec(&watchdog_reset_pending);
- /*
- * Cycle through CPUs to check if the CPUs stay synchronized
- * to each other.
- */
- next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
-
- /*
- * Arm timer if not already pending: could race with concurrent
- * pair clocksource_stop_watchdog() clocksource_start_watchdog().
- */
+ /* Could have been rearmed by a stop/start cycle */
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
- add_timer_on(&watchdog_timer, next_cpu);
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_local(&watchdog_timer);
}
-out:
- spin_unlock(&watchdog_lock);
}
static inline void clocksource_start_watchdog(void)
{
- if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+ if (watchdog_running || list_empty(&watchdog_list))
return;
- timer_setup(&watchdog_timer, clocksource_watchdog, 0);
+ timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+
+ add_timer_on(&watchdog_timer, get_boot_cpu_id());
watchdog_running = 1;
}
static inline void clocksource_stop_watchdog(void)
{
- if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+ if (!watchdog_running || !list_empty(&watchdog_list))
return;
timer_delete(&watchdog_timer);
watchdog_running = 0;
@@ -651,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback)
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
continue;
+ /*
+ * If it's not continuous, don't put the fox in charge of
+ * the henhouse.
+ */
+ if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+ continue;
+
/* Skip current if we were requested for a fallback. */
if (fallback && cs == old_wd)
continue;
@@ -690,12 +780,6 @@ static int __clocksource_watchdog_kthread(void)
unsigned long flags;
int select = 0;
- /* Do any required per-CPU skew verification. */
- if (curr_clocksource &&
- curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
- curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
- clocksource_verify_percpu(curr_clocksource);
-
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -1016,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
continue;
if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
continue;
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
+ continue;
return cs;
}
return NULL;
@@ -1040,6 +1126,8 @@ static void __clocksource_select(bool skipcur)
continue;
if (strcmp(cs->name, override_name) != 0)
continue;
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
+ continue;
/*
* Check to make sure we don't switch to a non-highres
* capable clocksource if the tick code is in oneshot
@@ -1169,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
- }
- /*
- * If the uncertainty margin is not specified, calculate it. If
- * both scale and freq are non-zero, calculate the clock period, but
- * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
- * However, if either of scale or freq is zero, be very conservative
- * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
- * for the uncertainty margin. Allow stupidly small uncertainty
- * margins to be specified by the caller for testing purposes,
- * but warn to discourage production use of this capability.
- *
- * Bottom line: The sum of the uncertainty margins of the
- * watchdog clocksource and the clocksource under test will be at
- * least 500ppm by default. For more information, please see the
- * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
- */
- if (scale && freq && !cs->uncertainty_margin) {
- cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
- if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
- cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
- } else if (!cs->uncertainty_margin) {
- cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ /* Update cs::freq_khz */
+ cs->freq_khz = div_u64((u64)freq * scale, 1000);
}
- WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
/*
* Ensure clocksources that have large 'mult' values don't overflow
@@ -1241,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
cs->id = CSID_GENERIC;
+
+ if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+ cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT;
+
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 860af7a58428..5bd6efe598f0 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -50,6 +50,28 @@
#include "tick-internal.h"
/*
+ * Constants to set the queued state of the timer (INACTIVE, ENQUEUED)
+ *
+ * The callback state is kept separate in the CPU base because having it in
+ * the timer would require touching the timer after the callback, which
+ * makes it impossible to free the timer from the callback function.
+ *
+ * Therefore we track the callback state in:
+ *
+ * timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
+ * queued a signal. Between dropping the lock which protects the posix timer
+ * and reacquiring the base lock of the hrtimer, another CPU can deliver the
+ * signal and rearm the timer.
+ *
+ * All state transitions are protected by cpu_base->lock.
+ */
+#define HRTIMER_STATE_INACTIVE false
+#define HRTIMER_STATE_ENQUEUED true
+
+/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
* idea of the (in)accuracy of timers. Timer values are rounded up to
@@ -77,43 +99,22 @@ static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
* to reach a base using a clockid, hrtimer_clockid_to_base()
* is used to convert from clockid to the proper hrtimer_base_type.
*/
+
+#define BASE_INIT(idx, cid) \
+ [idx] = { .index = idx, .clockid = cid }
+
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .clock_base =
- {
- {
- .index = HRTIMER_BASE_MONOTONIC,
- .clockid = CLOCK_MONOTONIC,
- },
- {
- .index = HRTIMER_BASE_REALTIME,
- .clockid = CLOCK_REALTIME,
- },
- {
- .index = HRTIMER_BASE_BOOTTIME,
- .clockid = CLOCK_BOOTTIME,
- },
- {
- .index = HRTIMER_BASE_TAI,
- .clockid = CLOCK_TAI,
- },
- {
- .index = HRTIMER_BASE_MONOTONIC_SOFT,
- .clockid = CLOCK_MONOTONIC,
- },
- {
- .index = HRTIMER_BASE_REALTIME_SOFT,
- .clockid = CLOCK_REALTIME,
- },
- {
- .index = HRTIMER_BASE_BOOTTIME_SOFT,
- .clockid = CLOCK_BOOTTIME,
- },
- {
- .index = HRTIMER_BASE_TAI_SOFT,
- .clockid = CLOCK_TAI,
- },
+ .clock_base = {
+ BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC),
+ BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME),
+ BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME),
+ BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI),
+ BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC),
+ BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME),
+ BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME),
+ BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI),
},
.csd = CSD_INIT(retrigger_next_event, NULL)
};
@@ -126,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
return likely(base->online);
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);
+
+static void hrtimer_hres_workfn(struct work_struct *work)
+{
+ static_branch_enable(&hrtimer_highres_enabled_key);
+}
+
+static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn);
+
+static inline void hrtimer_schedule_hres_work(void)
+{
+ if (!hrtimer_highres_enabled())
+ schedule_work(&hrtimer_hres_work);
+}
+#else
+static inline void hrtimer_schedule_hres_work(void) { }
+#endif
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
*/
#ifdef CONFIG_SMP
-
/*
* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
* such that hrtimer_callback_running() can unconditionally dereference
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { {
- .cpu_base = &migration_cpu_base,
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
- &migration_cpu_base.lock),
- }, },
+ .clock_base = {
+ [0] = {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ },
+ },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -159,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = {
* possible to set timer->base = &migration_base and drop the lock: the timer
* remains locked.
*/
-static
-struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
- unsigned long *flags)
+static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
__acquires(&timer->base->lock)
{
- struct hrtimer_clock_base *base;
-
for (;;) {
- base = READ_ONCE(timer->base);
+ struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
@@ -220,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
return expires >= new_base->cpu_base->expires_next;
}
-static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
{
if (!hrtimer_base_is_online(base)) {
int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
@@ -248,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *
* the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
- int pinned)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned)
{
struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
@@ -262,13 +280,12 @@ again:
if (base != new_base) {
/*
- * We are trying to move timer to new_base.
- * However we can't change timer's base while it is running,
- * so we keep it on the same CPU. No hassle vs. reprogramming
- * the event source in the high resolution case. The softirq
- * code will take care of this when the timer function has
- * completed. There is no conflict as we hold the lock until
- * the timer is enqueued.
+ * We are trying to move timer to new_base. However we can't
+ * change timer's base while it is running, so we keep it on
+ * the same CPU. No hassle vs. reprogramming the event source
+ * in the high resolution case. The remote CPU will take care
+ * of this when the timer function has completed. There is no
+ * conflict as we hold the lock until the timer is enqueued.
*/
if (unlikely(hrtimer_callback_running(timer)))
return base;
@@ -278,8 +295,7 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
- this_cpu_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -298,14 +314,13 @@ again:
#else /* CONFIG_SMP */
-static inline struct hrtimer_clock_base *
-lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
__acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
-
return base;
}
@@ -340,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div)
return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
-#endif /* BITS_PER_LONG >= 64 */
+#endif /* BITS_PER_LONG < 64 */
/*
* Add two ktime values and do a safety check for overflow:
@@ -422,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
}
}
+/* Stub timer callback for improperly used timers. */
+static enum hrtimer_restart stub_timer(struct hrtimer *unused)
+{
+ WARN_ON_ONCE(1);
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * hrtimer_fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_NOTAVAILABLE:
+ hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0);
+ return true;
+ default:
+ return false;
+ }
+}
+
static const struct debug_obj_descr hrtimer_debug_descr = {
- .name = "hrtimer",
- .debug_hint = hrtimer_debug_hint,
- .fixup_init = hrtimer_fixup_init,
- .fixup_activate = hrtimer_fixup_activate,
- .fixup_free = hrtimer_fixup_free,
+ .name = "hrtimer",
+ .debug_hint = hrtimer_debug_hint,
+ .fixup_init = hrtimer_fixup_init,
+ .fixup_activate = hrtimer_fixup_activate,
+ .fixup_free = hrtimer_fixup_free,
+ .fixup_assert_init = hrtimer_fixup_assert_init,
};
static inline void debug_hrtimer_init(struct hrtimer *timer)
@@ -440,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
debug_object_init_on_stack(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
- enum hrtimer_mode mode)
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
@@ -451,6 +490,11 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
debug_object_deactivate(timer, &hrtimer_debug_descr);
}
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer)
+{
+ debug_object_assert_init(timer, &hrtimer_debug_descr);
+}
+
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
@@ -461,9 +505,9 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
- enum hrtimer_mode mode) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { }
#endif
static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
@@ -479,80 +523,80 @@ static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid
trace_hrtimer_setup(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer,
- enum hrtimer_mode mode)
+static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed)
{
debug_hrtimer_activate(timer, mode);
- trace_hrtimer_start(timer, mode);
+ trace_hrtimer_start(timer, mode, was_armed);
}
-static inline void debug_deactivate(struct hrtimer *timer)
-{
- debug_hrtimer_deactivate(timer);
- trace_hrtimer_cancel(timer);
-}
+#define for_each_active_base(base, cpu_base, active) \
+ for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \
+ for (bool done = false; !done; active &= ~(1U << idx)) \
+ for (base = &cpu_base->clock_base[idx]; !done; done = true)
-static struct hrtimer_clock_base *
-__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
+#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node)
+
+#if defined(CONFIG_NO_HZ_COMMON)
+/*
+ * Same as hrtimer_bases_next_event() below, but skips the excluded timer and
+ * does not update cpu_base->next_timer/expires.
+ */
+static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
+ const struct hrtimer *exclude,
+ unsigned int active, ktime_t expires_next)
{
- unsigned int idx;
+ struct hrtimer_clock_base *base;
+ ktime_t expires;
- if (!*active)
- return NULL;
+ lockdep_assert_held(&cpu_base->lock);
- idx = __ffs(*active);
- *active &= ~(1U << idx);
+ for_each_active_base(base, cpu_base, active) {
+ expires = ktime_sub(base->expires_next, base->offset);
+ if (expires >= expires_next)
+ continue;
+
+ /*
+ * If the excluded timer is the first on this base evaluate the
+ * next timer.
+ */
+ struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);
- return &cpu_base->clock_base[idx];
+ if (unlikely(&exclude->node == node)) {
+ node = timerqueue_linked_next(node);
+ if (!node)
+ continue;
+ expires = ktime_sub(node->expires, base->offset);
+ if (expires >= expires_next)
+ continue;
+ }
+ expires_next = expires;
+ }
+ /* If base->offset changed, the result might be negative */
+ return max(expires_next, 0);
}
+#endif
-#define for_each_active_base(base, cpu_base, active) \
- while ((base = __next_base((cpu_base), &(active))))
+static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
-static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
- const struct hrtimer *exclude,
- unsigned int active,
- ktime_t expires_next)
+ return hrtimer_from_timerqueue_node(next);
+}
+
+/* Find the base with the earliest expiry */
+static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active,
+ ktime_t *expires_next, struct hrtimer **next_timer)
{
struct hrtimer_clock_base *base;
ktime_t expires;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *next;
- struct hrtimer *timer;
-
- next = timerqueue_getnext(&base->active);
- timer = container_of(next, struct hrtimer, node);
- if (timer == exclude) {
- /* Get to the next timer in the queue. */
- next = timerqueue_iterate_next(next);
- if (!next)
- continue;
-
- timer = container_of(next, struct hrtimer, node);
- }
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires < expires_next) {
- expires_next = expires;
-
- /* Skip cpu_base update if a timer is being excluded. */
- if (exclude)
- continue;
-
- if (timer->is_soft)
- cpu_base->softirq_next_timer = timer;
- else
- cpu_base->next_timer = timer;
+ expires = ktime_sub(base->expires_next, base->offset);
+ if (expires < *expires_next) {
+ *expires_next = expires;
+ *next_timer = clock_base_next_timer(base);
}
}
- /*
- * clock_was_set() might have changed base->offset of any of
- * the clock bases so the result might be negative. Fix it up
- * to prevent a false positive in clockevents_program_event().
- */
- if (expires_next < 0)
- expires_next = 0;
- return expires_next;
}
/*
@@ -575,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
* - HRTIMER_ACTIVE_SOFT, or
* - HRTIMER_ACTIVE_HARD.
*/
-static ktime_t
-__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
- unsigned int active;
struct hrtimer *next_timer = NULL;
ktime_t expires_next = KTIME_MAX;
+ unsigned int active;
+
+ lockdep_assert_held(&cpu_base->lock);
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- cpu_base->softirq_next_timer = NULL;
- expires_next = __hrtimer_next_event_base(cpu_base, NULL,
- active, KTIME_MAX);
-
- next_timer = cpu_base->softirq_next_timer;
+ if (active)
+ hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
+ cpu_base->softirq_next_timer = next_timer;
}
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ if (active)
+ hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
cpu_base->next_timer = next_timer;
- expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
- expires_next);
}
-
- return expires_next;
+ return max(expires_next, 0);
}
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
@@ -638,8 +680,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
- offs_real, offs_boot, offs_tai);
+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real,
+ offs_boot, offs_tai);
base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
@@ -649,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
}
/*
- * Is the high resolution mode active ?
+ * Is the high resolution mode active in the CPU base? This cannot use the
+ * static key as the CPUs are switched to high resolution mode
+ * asynchronously.
*/
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
@@ -657,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
cpu_base->hres_active : 0;
}
-static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *next_timer,
+static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
+{
+ trace_hrtimer_rearm(expires_next, deferred);
+ tick_program_event(expires_next, 1);
+}
+
+static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
ktime_t expires_next)
{
cpu_base->expires_next = expires_next;
@@ -683,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
- tick_program_event(expires_next, 1);
+ hrtimer_rearm_event(expires_next, false);
}
-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
-static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+/* Reprogram the event source with an evaluation of all clock bases */
+static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
{
- ktime_t expires_next;
-
- expires_next = hrtimer_update_next_event(cpu_base);
+ ktime_t expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -707,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
-/*
- * High resolution timer enabled ?
- */
+/* High resolution timer enabled ? */
static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
-/*
- * Enable / Disable high resolution mode
- */
+/* Enable / Disable high resolution mode */
static int __init setup_hrtimer_hres(char *str)
{
return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
-
__setup("highres=", setup_hrtimer_hres);
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
+/* hrtimer_is_hres_enabled - query, if the highres mode is enabled */
+static inline bool hrtimer_is_hres_enabled(void)
{
return hrtimer_hres_enabled;
}
-/*
- * Switch to high resolution mode
- */
+/* Switch to high resolution mode */
static void hrtimer_switch_to_hres(void)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
- pr_warn("Could not switch to high resolution mode on CPU %u\n",
- base->cpu);
+ pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu);
return;
}
- base->hres_active = 1;
+ base->hres_active = true;
hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer(true);
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
+ hrtimer_schedule_hres_work();
}
#else
-static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline bool hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
+
/*
* Retrigger next event is called after clock was set with interrupts
* disabled through an SMP function call or directly from low level
@@ -792,13 +826,12 @@ static void retrigger_next_event(void *arg)
* In periodic low resolution mode, the next softirq expiration
* must also be updated.
*/
- raw_spin_lock(&base->lock);
+ guard(raw_spinlock)(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
- hrtimer_force_reprogram(base, 0);
+ hrtimer_force_reprogram(base, /* skip_equal */ false);
else
hrtimer_update_next_event(base);
- raw_spin_unlock(&base->lock);
}
/*
@@ -812,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base = timer->base;
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ ktime_t expires = hrtimer_get_expires(timer);
- WARN_ON_ONCE(hrtimer_get_expires(timer) < 0);
+ WARN_ON_ONCE(expires < 0);
+ expires = ktime_sub(expires, base->offset);
/*
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Set it to 0.
@@ -842,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
timer_cpu_base->softirq_next_timer = timer;
timer_cpu_base->softirq_expires_next = expires;
- if (!ktime_before(expires, timer_cpu_base->expires_next) ||
- !reprogram)
+ if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram)
return;
}
@@ -857,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
if (expires >= cpu_base->expires_next)
return;
- /*
- * If the hrtimer interrupt is running, then it will reevaluate the
- * clock bases and reprogram the clock event device.
- */
- if (cpu_base->in_hrtirq)
+ /* If a deferred rearm is pending skip reprogramming the device */
+ if (cpu_base->deferred_rearm)
return;
cpu_base->next_timer = timer;
@@ -869,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
__hrtimer_reprogram(cpu_base, timer, expires);
}
-static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
- unsigned int active)
+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active)
{
struct hrtimer_clock_base *base;
unsigned int seq;
@@ -896,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
if (seq == cpu_base->clock_was_set_seq)
return false;
- /*
- * If the remote CPU is currently handling an hrtimer interrupt, it
- * will reevaluate the first expiring timer of all clock bases
- * before reprogramming. Nothing to do here.
- */
- if (cpu_base->in_hrtirq)
+ /* If a deferred rearm is pending the remote CPU will take care of it */
+ if (cpu_base->deferred_rearm) {
+ cpu_base->deferred_needs_update = true;
return false;
+ }
/*
* Walk the affected clock bases and check whether the first expiring
@@ -913,9 +940,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
active &= cpu_base->active_bases;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *next;
+ struct timerqueue_linked_node *next;
- next = timerqueue_getnext(&base->active);
+ next = timerqueue_linked_first(&base->active);
expires = ktime_sub(next->expires, base->offset);
if (expires < cpu_base->expires_next)
return true;
@@ -947,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
*/
void clock_was_set(unsigned int bases)
{
- struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
cpumask_var_t mask;
- int cpu;
- if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active())
+ if (!hrtimer_highres_enabled() && !tick_nohz_is_active())
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -960,23 +985,19 @@ void clock_was_set(unsigned int bases)
}
/* Avoid interrupting CPUs if possible */
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- unsigned long flags;
-
- cpu_base = &per_cpu(hrtimer_bases, cpu);
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ scoped_guard(cpus_read_lock) {
+ int cpu;
- if (update_needs_ipi(cpu_base, bases))
- cpumask_set_cpu(cpu, mask);
+ for_each_online_cpu(cpu) {
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
+ if (update_needs_ipi(cpu_base, bases))
+ cpumask_set_cpu(cpu, mask);
+ }
+ scoped_guard(preempt)
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
}
-
- preempt_disable();
- smp_call_function_many(mask, retrigger_next_event, NULL, 1);
- preempt_enable();
- cpus_read_unlock();
free_cpumask_var(mask);
out_timerfd:
@@ -1011,11 +1032,8 @@ void hrtimers_resume_local(void)
retrigger_next_event(NULL);
}
-/*
- * Counterpart to lock_hrtimer_base above:
- */
-static inline
-void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+/* Counterpart to lock_hrtimer_base above */
+static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
__releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
@@ -1032,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
* .. note::
* This only updates the timer expiry value and does not requeue the timer.
*
- * There is also a variant of the function hrtimer_forward_now().
+ * There is also a variant of this function: hrtimer_forward_now().
*
* Context: Can be safely called from the callback function of @timer. If called
* from other contexts @timer must neither be enqueued nor running the
@@ -1042,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
- u64 orun = 1;
ktime_t delta;
+ u64 orun = 1;
delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta < 0)
return 0;
- if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+ if (WARN_ON(timer->is_queued))
return 0;
if (interval < hrtimer_resolution)
@@ -1079,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* enqueue_hrtimer - internal function to (re)start a timer
*
* The timer is inserted in expiry order. Insertion into the
- * red black tree is O(log(n)). Must hold the base lock.
+ * red black tree is O(log(n)).
*
* Returns true when the new timer is the leftmost timer in the tree.
*/
static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
- enum hrtimer_mode mode)
+ enum hrtimer_mode mode, bool was_armed)
{
- debug_activate(timer, mode);
+ lockdep_assert_held(&base->cpu_base->lock);
+
+ debug_activate(timer, mode, was_armed);
WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
/* Pairs with the lockless read in hrtimer_is_queued() */
- WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
+ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+ if (!timerqueue_linked_add(&base->active, &timer->node))
+ return false;
+
+ base->expires_next = hrtimer_get_expires(timer);
+ return true;
+}
- return timerqueue_add(&base->active, &timer->node);
+static inline void base_update_next_timer(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+ base->expires_next = next ? next->expires : KTIME_MAX;
}
/*
* __remove_hrtimer - internal function to remove a timer
*
- * Caller must hold the base lock.
- *
* High resolution timer mode reprograms the clock event device when the
* timer is the one which expires next. The caller can disable this by setting
* reprogram to zero. This is useful, when the context does a reprogramming
* anyway (e.g. timer interrupt)
*/
-static void __remove_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- u8 newstate, int reprogram)
+static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool newstate, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- u8 state = timer->state;
+ bool was_first;
- /* Pairs with the lockless read in hrtimer_is_queued() */
- WRITE_ONCE(timer->state, newstate);
- if (!(state & HRTIMER_STATE_ENQUEUED))
+ lockdep_assert_held(&cpu_base->lock);
+
+ if (!timer->is_queued)
return;
- if (!timerqueue_del(&base->active, &timer->node))
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->is_queued, newstate);
+
+ was_first = !timerqueue_linked_prev(&timer->node);
+
+ if (!timerqueue_linked_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
+ /* Nothing to update if this was not the first timer in the base */
+ if (!was_first)
+ return;
+
+ base_update_next_timer(base);
+
/*
- * Note: If reprogram is false we do not update
- * cpu_base->next_timer. This happens when we remove the first
- * timer on a remote cpu. No harm as we never dereference
- * cpu_base->next_timer. So the worst thing what can happen is
- * an superfluous call to hrtimer_force_reprogram() on the
- * remote cpu later on if the same timer gets enqueued again.
+ * If reprogram is false don't update cpu_base->next_timer and do not
+ * touch the clock event device.
+ *
+ * This happens when removing the first timer on a remote CPU, which
+ * will be handled by the remote CPU's interrupt. It also happens when
+ * a local timer is removed to be immediately restarted. That's handled
+ * at the call site.
*/
- if (reprogram && timer == cpu_base->next_timer)
- hrtimer_force_reprogram(cpu_base, 1);
+ if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
+ return;
+
+ if (cpu_base->deferred_rearm)
+ cpu_base->deferred_needs_update = true;
+ else
+ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
}
-/*
- * remove hrtimer, called with base lock held
- */
-static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
- bool restart, bool keep_local)
+static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool newstate)
{
- u8 state = timer->state;
+ lockdep_assert_held(&base->cpu_base->lock);
- if (state & HRTIMER_STATE_ENQUEUED) {
+ if (timer->is_queued) {
bool reprogram;
+ debug_hrtimer_deactivate(timer);
+
/*
* Remove the timer and force reprogramming when high
* resolution mode is active and the timer is on the current
@@ -1154,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
* reprogramming happens in the interrupt handler. This is a
* rare case and less expensive than a smp call.
*/
- debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * If the timer is not restarted then reprogramming is
- * required if the timer is local. If it is local and about
- * to be restarted, avoid programming it twice (on removal
- * and a moment later when it's requeued).
- */
- if (!restart)
- state = HRTIMER_STATE_INACTIVE;
- else
- reprogram &= !keep_local;
+ __remove_hrtimer(timer, base, newstate, reprogram);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update in place has to retrieve the expiry times of the neighbour nodes
+ * if they exist. That is cache line neutral because the dequeue/enqueue
+ * operation is going to need the same cache lines. But there is a big win
+ * when the dequeue/enqueue can be avoided because the RB tree does not
+ * have to be rebalanced twice.
+ */
+static inline bool
+hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
+ struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);
+
+ /* If the new expiry goes behind the next timer, requeue is required */
+ if (next && expires > next->expires)
+ return false;
+
+ /* If this is the first timer, update in place */
+ if (!prev)
+ return true;
+
+ /* Update in place when it does not go ahead of the previous one */
+ return expires >= prev->expires;
+}
+
+static inline bool
+remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
+{
+ bool was_first = false;
+
+ /* Remove it from the timer queue if active */
+ if (timer->is_queued) {
+ was_first = !timerqueue_linked_prev(&timer->node);
+
+ /* Try to update in place to avoid the de/enqueue dance */
+ if (hrtimer_can_update_in_place(timer, base, expires)) {
+ hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+ trace_hrtimer_start(timer, mode, true);
+ if (was_first)
+ base->expires_next = expires;
+ return was_first;
+ }
- __remove_hrtimer(timer, base, state, reprogram);
- return 1;
+ debug_hrtimer_deactivate(timer);
+ timerqueue_linked_del(&base->active, &timer->node);
}
- return 0;
+
+ /* Set the new expiry time */
+ hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+
+ debug_activate(timer, mode, timer->is_queued);
+ base->cpu_base->active_bases |= 1 << base->index;
+
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+ /* If it's the first expiring timer now or again, update base */
+ if (timerqueue_linked_add(&base->active, &timer->node)) {
+ base->expires_next = expires;
+ return true;
+ }
+
+ if (was_first)
+ base_update_next_timer(base);
+
+ return false;
}
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
@@ -1190,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
return tim;
}
-static void
-hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
+static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- ktime_t expires;
-
- /*
- * Find the next SOFT expiration.
- */
- expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
- * reprogramming needs to be triggered, even if the next soft
- * hrtimer expires at the same time than the next hard
+ * Reprogramming needs to be triggered, even if the next soft
+ * hrtimer expires at the same time as the next hard
* hrtimer. cpu_base->softirq_expires_next needs to be updated!
*/
if (expires == KTIME_MAX)
return;
/*
- * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
- * cpu_base->*expires_next is only set by hrtimer_reprogram()
+ * cpu_base->softirq_next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->expires_next is only set by hrtimer_reprogram()
*/
hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}
-static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode,
- struct hrtimer_clock_base *base)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+ if (static_branch_likely(&timers_migration_enabled)) {
+ /*
+ * If it is local and the first expiring timer keep it on the local
+ * CPU to optimize reprogramming of the clockevent device. Also
+ * avoid switch_hrtimer_base() overhead when local and pinned.
+ */
+ if (!is_local)
+ return false;
+ if (is_first || is_pinned)
+ return true;
+
+ /* Honour the NOHZ full restrictions */
+ if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
+ return false;
+
+ /*
+ * If the tick is not stopped or need_resched() is set, then
+ * there is no point in moving the timer somewhere else.
+ */
+ return !tick_nohz_tick_stopped() || need_resched();
+ }
+ return is_local;
+}
+#else
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+ return is_local;
+}
+#endif
+
+static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
+ bool is_pinned)
+{
+ /* If the timer is running the callback it has to stay on its CPU base. */
+ if (unlikely(timer->base->running == timer))
+ return true;
+
+ return hrtimer_prefer_local(is_local, is_first, is_pinned);
+}
+
+static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *new_base;
- bool force_local, first;
+ bool is_pinned, first, was_first, keep_base = false;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- /*
- * If the timer is on the local cpu base and is the first expiring
- * timer then this might end up reprogramming the hardware twice
- * (on removal and on enqueue). To avoid that by prevent the
- * reprogram on removal, keep the timer local to the current CPU
- * and enforce reprogramming after it is queued no matter whether
- * it is the new first expiring timer again or not.
- */
- force_local = base->cpu_base == this_cpu_base;
- force_local &= base->cpu_base->next_timer == timer;
+ was_first = cpu_base->next_timer == timer;
+ is_pinned = !!(mode & HRTIMER_MODE_PINNED);
/*
- * Don't force local queuing if this enqueue happens on a unplugged
- * CPU after hrtimer_cpu_dying() has been invoked.
+ * Don't keep it local if this enqueue happens on an unplugged CPU
+ * after hrtimer_cpu_dying() has been invoked.
*/
- force_local &= this_cpu_base->online;
+ if (likely(this_cpu_base->online)) {
+ bool is_local = cpu_base == this_cpu_base;
+
+ keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
+ }
+
+ /* Calculate absolute expiry time for relative timers */
+ if (mode & HRTIMER_MODE_REL)
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+ /* Compensate for low resolution granularity */
+ tim = hrtimer_update_lowres(timer, tim, mode);
/*
* Remove an active timer from the queue. In case it is not queued
@@ -1250,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* reprogramming later if it was the first expiring timer. This
* avoids programming the underlying clock event twice (once at
* removal and once after enqueue).
+ *
+ * @keep_base is also true if the timer callback is running on a
+ * remote CPU and for local pinned timers.
*/
- remove_hrtimer(timer, base, true, force_local);
+ if (likely(keep_base)) {
+ first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
+ } else {
+ /* Keep the ENQUEUED state in case it is queued */
+ bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);
- if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
- tim = hrtimer_update_lowres(timer, tim, mode);
+ /* Switch the timer base, if necessary: */
+ base = switch_hrtimer_base(timer, base, is_pinned);
+ cpu_base = base->cpu_base;
- hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ first = enqueue_hrtimer(timer, base, mode, was_armed);
+ }
- /* Switch the timer base, if necessary: */
- if (!force_local) {
- new_base = switch_hrtimer_base(timer, base,
- mode & HRTIMER_MODE_PINNED);
- } else {
- new_base = base;
+ /* If a deferred rearm is pending skip reprogramming the device */
+ if (cpu_base->deferred_rearm) {
+ cpu_base->deferred_needs_update = true;
+ return false;
}
- first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local) {
+ if (!was_first || cpu_base != this_cpu_base) {
/*
- * If the current CPU base is online, then the timer is
- * never queued on a remote CPU if it would be the first
- * expiring timer there.
+ * If the current CPU base is online, then the timer is never
+ * queued on a remote CPU if it would be the first expiring
+ * timer there unless the timer callback is currently executed
+ * on the remote CPU. In the latter case the remote CPU will
+ * re-evaluate the first expiring timer after completing the
+ * callbacks.
*/
- if (hrtimer_base_is_online(this_cpu_base))
+ if (likely(hrtimer_base_is_online(this_cpu_base)))
return first;
/*
@@ -1283,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* already offline. If the timer is the first to expire,
* kick the remote CPU to reprogram the clock event.
*/
- if (first) {
- struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+ if (first)
+ smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
+ return false;
+ }
- smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
- }
- return 0;
+ /*
+ * Special case for the HRTICK timer. It is frequently rearmed and most
+ * of the time moves the expiry into the future. That's expensive in
+ * virtual machines and it's better to take the pointless already armed
+ * interrupt than reprogramming the hardware on every context switch.
+ *
+ * If the new expiry is before the armed time, then reprogramming is
+ * required.
+ */
+ if (timer->is_lazy) {
+ if (cpu_base->expires_next <= hrtimer_get_expires(timer))
+ return false;
}
/*
- * Timer was forced to stay on the current CPU to avoid
- * reprogramming on removal and enqueue. Force reprogram the
- * hardware by evaluating the new first expiring timer.
+ * Timer was the first expiring timer and forced to stay on the
+ * current CPU to avoid reprogramming on removal and enqueue. Force
+ * reprogram the hardware by evaluating the new first expiring
+ * timer.
*/
- hrtimer_force_reprogram(new_base->cpu_base, 1);
- return 0;
+ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
+ return false;
}
/**
@@ -1309,12 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
* softirq based mode is considered for debug purpose only!
*/
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode)
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base;
unsigned long flags;
+ debug_hrtimer_assert_init(timer);
+
/*
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
* match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
@@ -1362,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);
- if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base, false, false);
+ if (!hrtimer_callback_running(timer)) {
+ ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
+ if (ret)
+ trace_hrtimer_cancel(timer);
+ }
unlock_hrtimer_base(timer, &flags);
@@ -1397,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
* the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
-static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
- unsigned long flags)
+static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
{
if (atomic_read(&cpu_base->timer_waiters)) {
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1463,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
-static inline void
-hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
- unsigned long flags) { }
+static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
#endif
/**
@@ -1526,15 +1685,11 @@ u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
if (!hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
return expires;
}
@@ -1549,26 +1704,20 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
-
- if (hrtimer_hres_active(cpu_base)) {
- unsigned int active;
+ unsigned int active;
- if (!cpu_base->softirq_activated) {
- active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- expires = __hrtimer_next_event_base(cpu_base, exclude,
- active, KTIME_MAX);
- }
- active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
- expires = __hrtimer_next_event_base(cpu_base, exclude, active,
- expires);
- }
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
+ if (!hrtimer_hres_active(cpu_base))
+ return expires;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ if (active && !cpu_base->softirq_activated)
+ expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);
- return expires;
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ if (!active)
+ return expires;
+ return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
}
#endif
@@ -1612,8 +1761,7 @@ ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
-static void __hrtimer_setup(struct hrtimer *timer,
- enum hrtimer_restart (*function)(struct hrtimer *),
+static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
@@ -1645,13 +1793,14 @@ static void __hrtimer_setup(struct hrtimer *timer,
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
+ timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
timer->base = &cpu_base->clock_base[base];
- timerqueue_init(&timer->node);
+ timerqueue_linked_init(&timer->node);
- if (WARN_ON_ONCE(!function))
+ if (WARN_ON_ONCE(!fn))
ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
else
- ACCESS_PRIVATE(timer, function) = function;
+ ACCESS_PRIVATE(timer, function) = fn;
}
/**
@@ -1710,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer)
base = READ_ONCE(timer->base);
seq = raw_read_seqcount_begin(&base->seq);
- if (timer->state != HRTIMER_STATE_INACTIVE ||
- base->running == timer)
+ if (timer->is_queued || base->running == timer)
return true;
- } while (read_seqcount_retry(&base->seq, seq) ||
- base != READ_ONCE(timer->base));
+ } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));
return false;
}
@@ -1729,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* - callback: the timer is being ran
* - post: the timer is inactive or (re)queued
*
- * On the read side we ensure we observe timer->state and cpu_base->running
+ * On the read side we ensure we observe timer->is_queued and cpu_base->running
* from the same section, if anything changed while we looked at it, we retry.
* This includes timer->base changing because sequence numbers alone are
* insufficient for that.
@@ -1738,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
-
-static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer_clock_base *base,
- struct hrtimer *timer, ktime_t *now,
- unsigned long flags) __must_hold(&cpu_base->lock)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
+ struct hrtimer *timer, ktime_t now, unsigned long flags)
+ __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
bool expires_in_hardirq;
@@ -1754,15 +1899,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
base->running = timer;
/*
- * Separate the ->running assignment from the ->state assignment.
+ * Separate the ->running assignment from the ->is_queued assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running == NULL &&
- * timer->state == INACTIVE.
+ * timer->is_queued == false.
*/
raw_write_seqcount_barrier(&base->seq);
- __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
fn = ACCESS_PRIVATE(timer, function);
/*
@@ -1797,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
* hrtimer_start_range_ns() can have popped in and enqueued the timer
* for us already.
*/
- if (restart != HRTIMER_NORESTART &&
- !(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+ if (restart == HRTIMER_RESTART && !timer->is_queued)
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);
/*
- * Separate the ->running assignment from the ->state assignment.
+ * Separate the ->running assignment from the ->is_queued assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running.timer == NULL &&
- * timer->state == INACTIVE.
+ * timer->is_queued == false.
*/
raw_write_seqcount_barrier(&base->seq);
@@ -1814,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
base->running = NULL;
}
+static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+ return next ? hrtimer_from_timerqueue_node(next) : NULL;
+}
+
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
unsigned long flags, unsigned int active_mask)
{
- struct hrtimer_clock_base *base;
unsigned int active = cpu_base->active_bases & active_mask;
+ struct hrtimer_clock_base *base;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *node;
- ktime_t basenow;
-
- basenow = ktime_add(now, base->offset);
-
- while ((node = timerqueue_getnext(&base->active))) {
- struct hrtimer *timer;
-
- timer = container_of(node, struct hrtimer, node);
+ ktime_t basenow = ktime_add(now, base->offset);
+ struct hrtimer *timer;
+ while ((timer = clock_base_next_timer(base))) {
/*
* The immediate goal for using the softexpires is
* minimizing wakeups, not running timers at the
@@ -1846,7 +1991,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
if (basenow < hrtimer_get_softexpires(timer))
break;
- __run_hrtimer(cpu_base, base, timer, &basenow, flags);
+ __run_hrtimer(cpu_base, base, timer, basenow, flags);
if (active_mask == HRTIMER_ACTIVE_SOFT)
hrtimer_sync_wait_running(cpu_base, flags);
}
@@ -1865,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void)
now = hrtimer_update_base(cpu_base);
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
- cpu_base->softirq_activated = 0;
+ cpu_base->softirq_activated = false;
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1875,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void)
#ifdef CONFIG_HIGH_RES_TIMERS
/*
+ * Very similar to hrtimer_force_reprogram(), except it deals with
+ * deferred_rearm and hang_detected.
+ */
+static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
+{
+ cpu_base->expires_next = expires_next;
+ cpu_base->deferred_rearm = false;
+
+ if (unlikely(cpu_base->hang_detected)) {
+ /*
+ * Give the system a chance to do something else than looping
+ * on hrtimer interrupts.
+ */
+ expires_next = ktime_add_ns(ktime_get(),
+ min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time));
+ }
+ hrtimer_rearm_event(expires_next, deferred);
+}
+
+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+void __hrtimer_rearm_deferred(void)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next;
+
+ if (!cpu_base->deferred_rearm)
+ return;
+
+ guard(raw_spinlock)(&cpu_base->lock);
+ if (cpu_base->deferred_needs_update) {
+ hrtimer_update_base(cpu_base);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ } else {
+ /* No timer added/removed. Use the cached value */
+ expires_next = cpu_base->deferred_expires_next;
+ }
+ hrtimer_rearm(cpu_base, expires_next, true);
+}
+
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+ /* hrtimer_interrupt() just re-evaluated the first expiring timer */
+ cpu_base->deferred_needs_update = false;
+ /* Cache the expiry time */
+ cpu_base->deferred_expires_next = expires_next;
+ set_thread_flag(TIF_HRTIMER_REARM);
+}
+#else /* CONFIG_HRTIMER_REARM_DEFERRED */
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+ hrtimer_rearm(cpu_base, expires_next, false);
+}
+#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */
+
+/*
* High resolution timer interrupt
* Called with interrupts disabled
*/
@@ -1888,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev)
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
- cpu_base->in_hrtirq = 1;
+ cpu_base->deferred_rearm = true;
/*
- * We set expires_next to KTIME_MAX here with cpu_base->lock
- * held to prevent that a timer is enqueued in our queue via
- * the migration code. This does not affect enqueueing of
- * timers which run their callback and need to be requeued on
- * this CPU.
+ * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue
+ * timers while __hrtimer_run_queues() is expiring the clock bases.
+ * Timers which are re/enqueued on the local CPU are not affected by
+ * this.
*/
cpu_base->expires_next = KTIME_MAX;
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->softirq_activated = 1;
+ cpu_base->softirq_activated = true;
raise_timer_softirq(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the [soft] next expiry */
- expires_next = hrtimer_update_next_event(cpu_base);
- /*
- * Store the new expiry value so the migration code can verify
- * against it.
- */
- cpu_base->expires_next = expires_next;
- cpu_base->in_hrtirq = 0;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
- /* Reprogramming necessary ? */
- if (!tick_program_event(expires_next, 0)) {
- cpu_base->hang_detected = 0;
- return;
- }
-
/*
* The next timer was already expired due to:
* - tracing
* - long lasting callbacks
* - being scheduled away when running in a VM
*
- * We need to prevent that we loop forever in the hrtimer
- * interrupt routine. We give it 3 attempts to avoid
- * overreacting on some spurious event.
- *
- * Acquire base lock for updating the offsets and retrieving
- * the current time.
+ * We need to prevent that we loop forever in the hrtimer interrupt
+ * routine. We give it 3 attempts to avoid overreacting on some
+ * spurious event.
*/
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
- cpu_base->nr_retries++;
- if (++retries < 3)
- goto retry;
- /*
- * Give the system a chance to do something else than looping
- * here. We stored the entry time, so we know exactly how long
- * we spent here. We schedule the next event this amount of
- * time away.
- */
- cpu_base->nr_hangs++;
- cpu_base->hang_detected = 1;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ cpu_base->hang_detected = false;
+ if (expires_next < now) {
+ if (++retries < 3)
+ goto retry;
+
+ delta = ktime_sub(now, entry_time);
+ cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
+ cpu_base->nr_hangs++;
+ cpu_base->hang_detected = true;
+ }
- delta = ktime_sub(now, entry_time);
- if ((unsigned int)delta > cpu_base->max_hang_time)
- cpu_base->max_hang_time = (unsigned int) delta;
- /*
- * Limit it to a sensible value as we enforce a longer
- * delay. Give the CPU at least 100ms to catch up.
- */
- if (delta > 100 * NSEC_PER_MSEC)
- expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
- else
- expires_next = ktime_add(now, delta);
- tick_program_event(expires_next, 1);
- pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
+ hrtimer_interrupt_rearm(cpu_base, expires_next);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
+
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
@@ -1999,7 +2170,7 @@ void hrtimer_run_queues(void)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->softirq_activated = 1;
+ cpu_base->softirq_activated = true;
raise_timer_softirq(HRTIMER_SOFTIRQ);
}
@@ -2012,8 +2183,7 @@ void hrtimer_run_queues(void)
*/
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
- struct hrtimer_sleeper *t =
- container_of(timer, struct hrtimer_sleeper, timer);
+ struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer);
struct task_struct *task = t->task;
t->task = NULL;
@@ -2031,8 +2201,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
* Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
* to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
*/
-void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
- enum hrtimer_mode mode)
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode)
{
/*
* Make the enqueue delivery mode check work on RT. If the sleeper
@@ -2048,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
-static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
{
/*
* On PREEMPT_RT enabled kernels hrtimers which are not explicitly
@@ -2085,8 +2254,8 @@ static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
*/
-void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
{
debug_setup_on_stack(&sl->timer, clock_id, mode);
__hrtimer_setup_sleeper(sl, clock_id, mode);
@@ -2159,12 +2328,11 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
return ret;
}
-long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
- const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
- int ret = 0;
+ int ret;
hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
@@ -2203,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -2212,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+ struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
@@ -2225,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -2236,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
- int i;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
clock_b->cpu_base = cpu_base;
seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
- timerqueue_init_head(&clock_b->active);
+ timerqueue_linked_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
@@ -2257,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu)
/* Clear out any left over state from a CPU down operation */
cpu_base->active_bases = 0;
- cpu_base->hres_active = 0;
- cpu_base->hang_detected = 0;
+ cpu_base->hres_active = false;
+ cpu_base->hang_detected = false;
cpu_base->next_timer = NULL;
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->online = 1;
+ cpu_base->softirq_activated = false;
+ cpu_base->online = true;
return 0;
}
@@ -2272,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu)
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base)
{
+ struct timerqueue_linked_node *node;
struct hrtimer *timer;
- struct timerqueue_node *node;
- while ((node = timerqueue_getnext(&old_base->active))) {
- timer = container_of(node, struct hrtimer, node);
+ while ((node = timerqueue_linked_first(&old_base->active))) {
+ timer = hrtimer_from_timerqueue_node(node);
BUG_ON(hrtimer_callback_running(timer));
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
/*
* Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -2295,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
}
}
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
- int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+ int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
struct hrtimer_cpu_base *old_base, *new_base;
old_base = this_cpu_ptr(&hrtimer_bases);
@@ -2314,16 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
raw_spin_lock(&old_base->lock);
raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
- }
+ for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
raw_spin_unlock(&new_base->lock);
- old_base->online = 0;
+ old_base->online = false;
raw_spin_unlock(&old_base->lock);
return 0;
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 9daf8c5d9687..1c954f330dfe 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs)
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
- .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = jiffies_read,
.mask = CLOCKSOURCE_MASK(32),
.mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 652744e00eb4..4bca3f78c8ea 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -18,8 +18,9 @@
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/mm.h>
+#include <linux/cleanup.h>
-#include <vdso/datapage.h>
+#include "namespace_internal.h"
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
struct timens_offsets *ns_offsets)
@@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
if (!ns)
goto fail_dec;
- ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!ns->vvar_page)
+ err = timens_vdso_alloc_vvar_page(ns);
+ if (err)
goto fail_free;
err = ns_common_init(ns);
@@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
return ns;
fail_free_page:
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags,
return clone_time_ns(user_ns, old_ns);
}
-static struct timens_offset offset_from_ts(struct timespec64 off)
-{
- struct timens_offset ret;
-
- ret.sec = off.tv_sec;
- ret.nsec = off.tv_nsec;
-
- return ret;
-}
-
-/*
- * A time namespace VVAR page has the same layout as the VVAR page which
- * contains the system wide VDSO data.
- *
- * For a normal task the VVAR pages are installed in the normal ordering:
- * VVAR
- * PVCLOCK
- * HVCLOCK
- * TIMENS <- Not really required
- *
- * Now for a timens task the pages are installed in the following order:
- * TIMENS
- * PVCLOCK
- * HVCLOCK
- * VVAR
- *
- * The check for vdso_clock->clock_mode is in the unlikely path of
- * the seq begin magic. So for the non-timens case most of the time
- * 'seq' is even, so the branch is not taken.
- *
- * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
- * update to finish and for 'seq' to become even anyway.
- *
- * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
- * enforces the time namespace handling path.
- */
-static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
- struct time_namespace *ns)
-{
- struct timens_offset *offset = vc->offset;
- struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
- struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
-
- vc->seq = 1;
- vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
- offset[CLOCK_MONOTONIC] = monotonic;
- offset[CLOCK_MONOTONIC_RAW] = monotonic;
- offset[CLOCK_MONOTONIC_COARSE] = monotonic;
- offset[CLOCK_BOOTTIME] = boottime;
- offset[CLOCK_BOOTTIME_ALARM] = boottime;
-}
-
-struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- if (likely(vma->vm_mm == current->mm))
- return current->nsproxy->time_ns->vvar_page;
-
- /*
- * VM_PFNMAP | VM_IO protect .fault() handler from being called
- * through interfaces like /proc/$pid/mem or
- * process_vm_{readv,writev}() as long as there's no .access()
- * in special_mapping_vmops().
- * For more details check_vma_flags() and __access_remote_vm()
- */
-
- WARN(1, "vvar_page accessed remotely");
-
- return NULL;
-}
-
-/*
- * Protects possibly multiple offsets writers racing each other
- * and tasks entering the namespace.
- */
-static DEFINE_MUTEX(offset_lock);
-
-static void timens_set_vvar_page(struct task_struct *task,
- struct time_namespace *ns)
-{
- struct vdso_time_data *vdata;
- struct vdso_clock *vc;
- unsigned int i;
-
- if (ns == &init_time_ns)
- return;
-
- /* Fast-path, taken by every task in namespace except the first. */
- if (likely(ns->frozen_offsets))
- return;
-
- mutex_lock(&offset_lock);
- /* Nothing to-do: vvar_page has been already initialized. */
- if (ns->frozen_offsets)
- goto out;
-
- ns->frozen_offsets = true;
- vdata = page_address(ns->vvar_page);
- vc = vdata->clock_data;
-
- for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_clock_data(&vc[i], ns);
-
- if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
- for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
- timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
- }
-
-out:
- mutex_unlock(&offset_lock);
-}
+DEFINE_MUTEX(timens_offset_lock);
void free_time_ns(struct time_namespace *ns)
{
@@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns)
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_common_free(ns);
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
/* Concurrent nstree traversal depends on a grace period. */
kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ return &ns->ns;
}
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns_for_children;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ return &ns->ns;
}
static void timens_put(struct ns_common *ns)
@@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
-{
- timens_set_vvar_page(tsk, ns);
- vdso_join_timens(tsk, ns);
-}
-
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
- ns = timens_for_children_get(p);
if (!ns)
return;
+
time_ns = to_time_ns(ns);
show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
- put_time_ns(time_ns);
}
int proc_timens_set_offset(struct file *file, struct task_struct *p,
struct proc_timens_offset *offsets, int noffsets)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
struct timespec64 tp;
- int i, err;
+ int i;
- ns = timens_for_children_get(p);
if (!ns)
return -ESRCH;
+
time_ns = to_time_ns(ns);
- if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
- put_time_ns(time_ns);
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME))
return -EPERM;
- }
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
ktime_get_boottime_ts64(&tp);
break;
default:
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
- err = -ERANGE;
-
if (off->val.tv_sec > KTIME_SEC_MAX ||
off->val.tv_sec < -KTIME_SEC_MAX)
- goto out;
+ return -ERANGE;
tp = timespec64_add(tp, off->val);
/*
@@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
* still unreachable.
*/
if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
- goto out;
+ return -ERANGE;
}
- mutex_lock(&offset_lock);
- if (time_ns->frozen_offsets) {
- err = -EACCES;
- goto out_unlock;
- }
+ guard(mutex)(&timens_offset_lock);
+ if (time_ns->frozen_offsets)
+ return -EACCES;
- err = 0;
/* Don't report errors after this line */
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
*offset = off->val;
}
-out_unlock:
- mutex_unlock(&offset_lock);
-out:
- put_time_ns(time_ns);
-
- return err;
+ return 0;
}
const struct proc_ns_operations timens_operations = {
diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h
new file mode 100644
index 000000000000..b37ba179f43b
--- /dev/null
+++ b/kernel/time/namespace_internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TIME_NAMESPACE_INTERNAL_H
+#define _TIME_NAMESPACE_INTERNAL_H
+
+#include <linux/mutex.h>
+
+struct time_namespace;
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+extern struct mutex timens_offset_lock;
+
+#ifdef CONFIG_TIME_NS_VDSO
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns);
+void timens_vdso_free_vvar_page(struct time_namespace *ns);
+#else /* !CONFIG_TIME_NS_VDSO */
+static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ return 0;
+}
+static inline void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+}
+#endif /* CONFIG_TIME_NS_VDSO */
+
+#endif /* _TIME_NAMESPACE_INTERNAL_H */
diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c
new file mode 100644
index 000000000000..0d74d160eec9
--- /dev/null
+++ b/kernel/time/namespace_vdso.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/mm.h>
+#include <linux/time_namespace.h>
+#include <linux/time.h>
+#include <linux/vdso_datastore.h>
+
+#include <vdso/clocksource.h>
+#include <vdso/datapage.h>
+
+#include "namespace_internal.h"
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec;
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_clock->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * enforces the time namespace handling path.
+ */
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vc->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vc->seq = 1;
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic;
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime;
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
+static void timens_set_vvar_page(struct task_struct *task,
+ struct time_namespace *ns)
+{
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ guard(mutex)(&timens_offset_lock);
+ /* Nothing to-do: vvar_page has been already initialized. */
+ if (ns->frozen_offsets)
+ return;
+
+ ns->frozen_offsets = true;
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_clock_data(&vc[i], ns);
+
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_clock_data() for details.
+ */
+static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ guard(mmap_read_lock)(mm);
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_vvar_mapping))
+ zap_vma(vma);
+ }
+ return 0;
+}
+
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns);
+ vdso_join_timens(tsk, ns);
+}
+
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!ns->vvar_page)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+ __free_page(ns->vvar_page);
+}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 413e2389f0a5..9331e1614124 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1092,7 +1092,7 @@ void exit_itimers(struct task_struct *tsk)
}
/*
- * There should be no timers on the ignored list. itimer_delete() has
+ * There should be no timers on the ignored list. posix_timer_delete() has
* mopped them up.
*/
if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers)))
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index a88b72b0f35e..51f6a1032c83 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = {
.set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
- CLOCK_EVT_FEAT_KTIME |
CLOCK_EVT_FEAT_HRTIMER,
.rating = 0,
.bound_on = -1,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f63c65881364..115e0bf01276 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu)
*/
static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
- if (bc)
+ if (bc) {
+ bc->next_event_forced = 0;
tick_setup_periodic(bc, 1);
+ }
}
/*
@@ -106,6 +108,7 @@ static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
{
+ wd->next_event_forced = 0;
/*
* If we woke up early and the tick was reprogrammed in the
* meantime then this may be spurious but harmless.
@@ -403,6 +406,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
+ tick_broadcast_device.evtdev->next_event_forced = 0;
/* Handle spurious interrupts gracefully */
if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
@@ -696,6 +700,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
raw_spin_lock(&tick_broadcast_lock);
dev->next_event = KTIME_MAX;
+ tick_broadcast_device.evtdev->next_event_forced = 0;
next_event = KTIME_MAX;
cpumask_clear(tmpmask);
now = ktime_get();
@@ -1063,6 +1068,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
bc->event_handler = tick_handle_oneshot_broadcast;
+ bc->next_event_forced = 0;
bc->next_event = KTIME_MAX;
/*
@@ -1175,6 +1181,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
}
/* This moves the broadcast assignment to this CPU: */
+ bc->next_event_forced = 0;
clockevents_program_event(bc, bc->next_event, 1);
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d305d8521896..6a9198a4279b 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
int cpu = smp_processor_id();
ktime_t next = dev->next_event;
+ dev->next_event_forced = 0;
tick_periodic(cpu);
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f7907fadd63f..cbbb87a0c6e7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -345,7 +345,7 @@ static bool check_tick_dependency(atomic_t *dep)
int val = atomic_read(dep);
if (likely(!tracepoint_enabled(tick_stop)))
- return !val;
+ return !!val;
if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
@@ -864,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+/* Simplified variant of hrtimer_forward_now() */
+static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
+{
+ ktime_t delta = now - expires;
+
+ if (likely(delta < TICK_NSEC))
+ return expires + TICK_NSEC;
+
+ expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC);
+ if (expires > now)
+ return expires;
+ return expires + TICK_NSEC;
+}
+
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+ ktime_t expires = ts->last_tick;
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+ if (now >= expires)
+ expires = tick_forward_now(expires, now);
if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
/*
@@ -1513,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 0d832317d576..771cef87ad3b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
+ if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
new_ts.tv_nsec *= NSEC_PER_USEC;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c07e562ee4c1..c493a4010305 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -3,34 +3,30 @@
* Kernel timekeeping code and accessor functions. Based on code from
* timer.c, moved in commit 8524070b7982.
*/
-#include <linux/timekeeper_internal.h>
-#include <linux/module.h>
-#include <linux/interrupt.h>
+#include <linux/audit.h>
+#include <linux/clocksource.h>
+#include <linux/compiler.h>
+#include <linux/jiffies.h>
#include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/nmi.h>
-#include <linux/sched.h>
-#include <linux/sched/loadavg.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/random.h>
#include <linux/sched/clock.h>
+#include <linux/sched/loadavg.h>
+#include <linux/static_key.h>
+#include <linux/stop_machine.h>
#include <linux/syscore_ops.h>
-#include <linux/clocksource.h>
-#include <linux/jiffies.h>
+#include <linux/tick.h>
#include <linux/time.h>
#include <linux/timex.h>
-#include <linux/tick.h>
-#include <linux/stop_machine.h>
-#include <linux/pvclock_gtod.h>
-#include <linux/compiler.h>
-#include <linux/audit.h>
-#include <linux/random.h>
+#include <linux/timekeeper_internal.h>
#include <vdso/auxclock.h>
#include "tick-internal.h"
-#include "ntp_internal.h"
#include "timekeeping_internal.h"
+#include "ntp_internal.h"
#define TK_CLEAR_NTP (1 << 0)
#define TK_CLOCK_WAS_SET (1 << 1)
@@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
+#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+#include <asm/clock_inlined.h>
+
+static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined);
+
/*
* tk_clock_read - atomic clocksource read() helper
*
@@ -288,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* a read of the fast-timekeeper tkrs (which is protected by its own locking
* and update logic).
*/
-static inline u64 tk_clock_read(const struct tk_read_base *tkr)
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (static_branch_likely(&clocksource_read_inlined))
+ return arch_inlined_clocksource_read(clock);
+
+ return clock->read(clock);
+}
+
+static inline void clocksource_disable_inline_read(void)
+{
+ static_branch_disable(&clocksource_read_inlined);
+}
+
+static inline void clocksource_enable_inline_read(void)
+{
+ static_branch_enable(&clocksource_read_inlined);
+}
+#else
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
struct clocksource *clock = READ_ONCE(tkr->clock);
return clock->read(clock);
}
+static inline void clocksource_disable_inline_read(void) { }
+static inline void clocksource_enable_inline_read(void) { }
+#endif
/**
* tk_setup_internals - Set up internals to use clocksource clock.
@@ -367,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
tk->skip_second_overflow = 0;
+
+ tk->cs_id = clock->id;
+
+ /* Coupled clockevent data */
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) &&
+ clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) {
+ /*
+ * Aim for a one hour maximum delta and use KHz to handle
+ * clocksources with a frequency above 4GHz correctly as
+ * the frequency argument of clocks_calc_mult_shift() is u32.
+ */
+ clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift,
+ NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000);
+ /*
+ * Initialize the conversion limit as the previous clocksource
+ * might have the same shift/mult pair so the quick check in
+ * tk_update_ns_to_cyc() fails to update it after a clocksource
+ * change leaving it effectively zero.
+ */
+ tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult);
+ }
}
/* Timekeeper helper functions. */
@@ -375,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
/* Calculate the delta since the last update_wall_time() */
u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
@@ -696,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
+static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc)
+{
+ struct tk_read_base *tkrs = &tks->tkr_mono;
+ struct tk_read_base *tkrc = &tkc->tkr_mono;
+ unsigned int shift;
+
+ if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) ||
+ !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+ return;
+
+ if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift)
+ return;
+ /*
+ * The conversion math is simple:
+ *
+ * CS::MULT (1 << NS_TO_CYC_SHIFT)
+ * --------------- = ----------------------
+ * (1 << CS:SHIFT) NS_TO_CYC_MULT
+ *
+ * Ergo:
+ *
+ * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT
+ *
+ * NS_TO_CYC_SHIFT has been set up in tk_setup_internals()
+ */
+ shift = tkrs->shift + tks->cs_ns_to_cyc_shift;
+ tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult);
+ tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult);
+}
+
/*
* Restore the shadow timekeeper from the real timekeeper.
*/
@@ -730,6 +805,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
if (tk->id == TIMEKEEPER_CORE) {
+ tk_update_ns_to_cyc(tk, &tkd->timekeeper);
update_vsyscall(tk);
update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
@@ -784,6 +860,71 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk_update_coarse_nsecs(tk);
}
+/*
+ * ktime_expiry_to_cycles - Convert an expiry time to clocksource cycles
+ * @id: Clocksource ID which is required for validity
+ * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted
+ * @cycles: Pointer to storage for corresponding absolute cycles value
+ *
+ * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value
+ * based on the correlated clocksource of the clockevent device by using
+ * the base nanoseconds and cycles values of the last timekeeper update and
+ * converting the delta between @expires_ns and base nanoseconds to cycles.
+ *
+ * This only works for clockevent devices which are using a less than or
+ * equal comparator against the clocksource.
+ *
+ * Utilizing this avoids two clocksource reads for such devices, the
+ * ktime_get() in clockevents_program_event() to calculate the delta expiry
+ * value and the readout in the device::set_next_event() callback to
+ * convert the delta back to an absolute comparator value.
+ *
+ * Returns: True if @id matches the current clocksource ID, false otherwise
+ */
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct tk_read_base *tkrm = &tk->tkr_mono;
+ ktime_t base_ns, delta_ns, max_ns;
+ u64 base_cycles, delta_cycles;
+ unsigned int seq;
+ u32 mult, shift;
+
+ /*
+ * Racy check to avoid the seqcount overhead when ID does not match. If
+ * the relevant clocksource is installed concurrently, then this will
+ * just delay the switch over to this mechanism until the next event is
+ * programmed. If the ID is not matching the clock events code will use
+ * the regular relative set_next_event() callback as before.
+ */
+ if (data_race(tk->cs_id) != id)
+ return false;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ if (tk->cs_id != id)
+ return false;
+
+ base_cycles = tkrm->cycle_last;
+ base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift);
+
+ mult = tk->cs_ns_to_cyc_mult;
+ shift = tk->cs_ns_to_cyc_shift;
+ max_ns = tk->cs_ns_to_cyc_maxns;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ /* Prevent negative deltas and multiplication overflows */
+ delta_ns = min(expires_ns - base_ns, max_ns);
+ delta_ns = max(delta_ns, 0);
+
+ /* Convert to cycles */
+ delta_cycles = ((u64)delta_ns * mult) >> shift;
+ *cycles = base_cycles + delta_cycles;
+ return true;
+}
+
/**
* ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
@@ -848,7 +989,7 @@ u32 ktime_get_resolution_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
-static ktime_t *offsets[TK_OFFS_MAX] = {
+static const ktime_t *const offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
[TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
@@ -857,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = {
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
- ktime_t base, *offset = offsets[offs];
+ ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -878,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
- ktime_t base, *offset = offsets[offs];
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
+ ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -902,7 +1045,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
*/
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
- ktime_t *offset = offsets[offs];
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
ktime_t tconv;
@@ -1631,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock)
if (tk->tkr_mono.clock == clock)
return 0;
+
+ /* Disable inlined reads across the clocksource switch */
+ clocksource_disable_inline_read();
+
stop_machine(change_clocksource, clock, NULL);
+
+ /*
+ * If the clocksource has been selected and supports inlined reads
+ * enable the branch.
+ */
+ if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ)
+ clocksource_enable_inline_read();
+
tick_clock_notify();
return tk->tkr_mono.clock == clock ? 0 : -1;
}
@@ -2834,7 +2989,7 @@ static void tk_aux_update_clocksource(void)
continue;
timekeeping_forward_now(tks);
- tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock);
timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
}
}
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 543beba096c7..198d0608db74 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
ktime_t *offs_boot,
ktime_t *offs_tai);
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles);
+
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern void timekeeping_warp_clock(void);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7e1e3bde6b8b..04d928c21aba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2319,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
*/
void timer_clear_idle(void)
{
+ int this_cpu = smp_processor_id();
/*
* We do this unlocked. The worst outcome is a remote pinned timer
* enqueue sending a pointless IPI, but taking the lock would just
@@ -2327,9 +2328,9 @@ void timer_clear_idle(void)
* path. Required for BASE_LOCAL only.
*/
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
- if (tick_nohz_full_cpu(smp_processor_id()))
+ if (tick_nohz_full_cpu(this_cpu))
__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
- trace_timer_base_idle(false, smp_processor_id());
+ trace_timer_base_idle(false, this_cpu);
/* Activate without holding the timer_base->lock */
tmigr_cpu_activate();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 488e47e96e93..427d7ddea3af 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function));
- SEQ_printf(m, ", S:%02x", timer->state);
+ SEQ_printf(m, ", S:%02x", timer->is_queued);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
@@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
(long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
}
-static void
-print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
- u64 now)
+static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
+ struct timerqueue_linked_node *curr;
struct hrtimer *timer, tmp;
unsigned long next = 0, i;
- struct timerqueue_node *curr;
unsigned long flags;
next_one:
@@ -72,13 +70,13 @@ next_one:
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
- curr = timerqueue_getnext(&base->active);
+ curr = timerqueue_linked_first(&base->active);
/*
* Crude but we have to do this O(N*N) thing, because
* we have to unlock the base when printing:
*/
while (curr && i < next) {
- curr = timerqueue_iterate_next(curr);
+ curr = timerqueue_linked_next(curr);
i++;
}
@@ -103,8 +101,8 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
#ifdef CONFIG_HIGH_RES_TIMERS
- SEQ_printf(m, " .offset: %Lu nsecs\n",
- (unsigned long long) ktime_to_ns(base->offset));
+ SEQ_printf(m, " .offset: %Ld nsecs\n",
+ (long long) base->offset);
#endif
SEQ_printf(m, "active timers:\n");
print_active_timers(m, base, now + ktime_to_ns(base->offset));
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 155eeaea4113..52c15affdbff 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -978,8 +978,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
/* Drop the lock to allow the remote CPU to exit idle */
raw_spin_unlock_irq(&tmc->lock);
- if (cpu != smp_processor_id())
- timer_expire_remote(cpu);
+ /*
+ * This can't exclude the local CPU because jiffies might have advanced
+ * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the
+ * point where the jiffies snapshot @jif was taken in tmigr_handle_remote().
+ */
+ timer_expire_remote(cpu);
/*
* Lock ordering needs to be preserved - timer_base locks before tmigr
@@ -1860,19 +1864,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
* child to the new parents. So tmigr_active_up() activates the
* new parents while walking up from the old root to the new.
*
- * * It is ensured that @start is active, as this setup path is
- * executed in hotplug prepare callback. This is executed by an
- * already connected and !idle CPU. Even if all other CPUs go idle,
- * the CPU executing the setup will be responsible up to current top
- * level group. And the next time it goes inactive, it will release
- * the new childmask and parent to subsequent walkers through this
- * @child. Therefore propagate active state unconditionally.
+ * * It is ensured that @start is active, (or on the way to be activated
+ * by another CPU that woke up before the current one) as this setup path
+ * is executed in hotplug prepare callback. This is executed by an already
+ * connected and !idle CPU in the hierarchy.
+ *
+ * * The below RmW atomic operation ensures that:
+ *
+ * 1) If the old root has been completely activated, the latest state is
+ * acquired (the below implicit acquire pairs with the implicit release
+ * from cmpxchg() in tmigr_active_up()).
+ *
+ * 2) If the old root is still on the way to be activated, the lagging behind
+ * CPU performing the activation will acquire the links up to the new root.
+ * (The below implicit release pairs with the implicit acquire from cmpxchg()
+ * in tmigr_active_up()).
+ *
+ * 3) Every subsequent CPU below the old root will acquire the new links while
+ * walking through the old root (The below implicit release pairs with the
+ * implicit acquire from cmpxchg() in either tmigr_active_up() or
+ * tmigr_inactive_up()).
*/
- state.state = atomic_read(&start->migr_state);
- WARN_ON_ONCE(!state.active);
+ state.state = atomic_fetch_or(0, &start->migr_state);
WARN_ON_ONCE(!start->parent);
- data.childmask = start->groupmask;
- __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ /*
+ * If the state of the old root is inactive, another CPU is on its way to activate
+ * it and propagate to the new root.
+ */
+ if (state.active) {
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
+ }
}
/* Root update */