diff options
Diffstat (limited to 'kernel/time')
| -rw-r--r-- | kernel/time/.kunitconfig | 2 | ||||
| -rw-r--r-- | kernel/time/Kconfig | 32 | ||||
| -rw-r--r-- | kernel/time/Makefile | 1 | ||||
| -rw-r--r-- | kernel/time/alarmtimer.c | 14 | ||||
| -rw-r--r-- | kernel/time/clockevents.c | 76 | ||||
| -rw-r--r-- | kernel/time/clocksource-wdtest.c | 268 | ||||
| -rw-r--r-- | kernel/time/clocksource.c | 805 | ||||
| -rw-r--r-- | kernel/time/hrtimer.c | 1128 | ||||
| -rw-r--r-- | kernel/time/jiffies.c | 1 | ||||
| -rw-r--r-- | kernel/time/namespace.c | 203 | ||||
| -rw-r--r-- | kernel/time/namespace_internal.h | 28 | ||||
| -rw-r--r-- | kernel/time/namespace_vdso.c | 160 | ||||
| -rw-r--r-- | kernel/time/posix-timers.c | 2 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast-hrtimer.c | 1 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast.c | 9 | ||||
| -rw-r--r-- | kernel/time/tick-common.c | 1 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 30 | ||||
| -rw-r--r-- | kernel/time/time.c | 2 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 203 | ||||
| -rw-r--r-- | kernel/time/timekeeping.h | 2 | ||||
| -rw-r--r-- | kernel/time/timer.c | 5 | ||||
| -rw-r--r-- | kernel/time/timer_list.c | 16 | ||||
| -rw-r--r-- | kernel/time/timer_migration.c | 48 |
23 files changed, 1789 insertions, 1248 deletions
diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig new file mode 100644 index 000000000000..d60a611b2853 --- /dev/null +++ b/kernel/time/.kunitconfig @@ -0,0 +1,2 @@ +CONFIG_KUNIT=y +CONFIG_TIME_KUNIT_TEST=y diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..02aac7c5aa76 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -9,14 +9,13 @@ config CLOCKSOURCE_WATCHDOG bool -# Architecture has extra clocksource data -config ARCH_CLOCKSOURCE_DATA - bool - # Architecture has extra clocksource init called from registration config ARCH_CLOCKSOURCE_INIT bool +config ARCH_WANTS_CLOCKSOURCE_READ_INLINE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE config GENERIC_CLOCKEVENTS_MIN_ADJUST bool +config GENERIC_CLOCKEVENTS_COUPLED + bool + +config GENERIC_CLOCKEVENTS_COUPLED_INLINE + select GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool +# Deferred rearming of the hrtimer interrupt +config HRTIMER_REARM_DEFERRED + def_bool y + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + depends on HIGH_RES_TIMERS && SCHED_HRTICK + # Select to handle posix CPU timers from task_work # and not from the timer interrupt context config HAVE_POSIX_CPU_TIMERS_TASK_WORK @@ -196,18 +208,6 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. -config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US - int "Clocksource watchdog maximum allowable skew (in microseconds)" - depends on CLOCKSOURCE_WATCHDOG - range 50 1000 - default 125 - help - Specify the maximum amount of allowable watchdog skew in - microseconds before reporting the clocksource to be unstable. - The default is based on a half-second clocksource watchdog - interval and NTP's maximum frequency drift of 500 parts - per million. 
If the clocksource is good enough for NTP, - it is good enough for the clocksource watchdog! endif config POSIX_AUX_CLOCKS diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f7d52d9543cc..eaf290c972f9 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o +obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 069d93bfb0c7..6e173d70d825 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -234,19 +234,23 @@ static int alarmtimer_suspend(struct device *dev) if (!rtc) return 0; - /* Find the soonest timer to expire*/ + /* Find the soonest timer to expire */ for (i = 0; i < ALARM_NUMTYPE; i++) { struct alarm_base *base = &alarm_bases[i]; struct timerqueue_node *next; + ktime_t next_expires; ktime_t delta; - scoped_guard(spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) { next = timerqueue_getnext(&base->timerqueue); + if (next) + next_expires = next->expires; + } if (!next) continue; - delta = ktime_sub(next->expires, base->get_ktime()); + delta = ktime_sub(next_expires, base->get_ktime()); if (!min || (delta < min)) { - expires = next->expires; + expires = next_expires; min = delta; type = i; } @@ -540,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, now, timr->it_interval); } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index eaae1ce9f060..0014d163f989 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,6 +94,9 @@ static int 
__clockevents_switch_state(struct clock_event_device *dev, if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; + /* On state transitions clear the forced flag unconditionally */ + dev->next_event_forced = 0; + /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: @@ -172,6 +175,7 @@ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; } /** @@ -292,6 +296,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE +#include <asm/clock_inlined.h> +#else +static __always_inline void +arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { } +#endif + +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + u64 cycles; + + if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) + return false; + + if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) + return false; + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) + arch_inlined_clockevent_set_next_coupled(cycles, dev); + else + dev->set_next_coupled(cycles, dev); + return true; +} + +#else +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + return false; +} +#endif + /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program @@ -300,12 +336,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) * * Returns 0 on success, -ETIME when the event is in the past. 
*/ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - bool force) +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { - unsigned long long clc; int64_t delta; - int rc; + u64 cycles; if (WARN_ON_ONCE(expires < 0)) return -ETIME; @@ -319,21 +353,37 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); - /* Shortcut for clockevent devices that can deal with ktime. */ - if (dev->features & CLOCK_EVT_FEAT_KTIME) + /* ktime_t based reprogramming for the broadcast hrtimer device */ + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); + if (likely(clockevent_set_next_coupled(dev, expires))) + return 0; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); - if (delta <= 0) - return force ? clockevents_program_min_delta(dev) : -ETIME; - delta = min(delta, (int64_t) dev->max_delta_ns); - delta = max(delta, (int64_t) dev->min_delta_ns); + /* Required for tick_periodic() during early boot */ + if (delta <= 0 && !force) + return -ETIME; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); + if (delta > (int64_t)dev->min_delta_ns) { + delta = min(delta, (int64_t) dev->max_delta_ns); + cycles = ((u64)delta * dev->mult) >> dev->shift; + if (!dev->set_next_event((unsigned long) cycles, dev)) { + dev->next_event_forced = 0; + return 0; + } + } + + if (dev->next_event_forced) + return 0; - return (rc && force) ? 
clockevents_program_min_delta(dev) : rc; + if (dev->set_next_event(dev->min_delta_ticks, dev)) { + if (!force || clockevents_program_min_delta(dev)) + return -ETIME; + } + dev->next_event_forced = 1; + return 0; } /* diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 38dae590b29f..b4cf17b4aeed 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -3,202 +3,196 @@ * Unit test for the clocksource watchdog. * * Copyright (C) 2021 Facebook, Inc. + * Copyright (C) 2026 Intel Corp. * * Author: Paul E. McKenney <paulmck@kernel.org> + * Author: Thomas Gleixner <tglx@kernel.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/device.h> #include <linux/clocksource.h> -#include <linux/init.h> +#include <linux/delay.h> #include <linux/module.h> -#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ -#include <linux/tick.h> #include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/prandom.h> -#include <linux/cpu.h> #include "tick-internal.h" +#include "timekeeping_internal.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); +MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>"); + +enum wdtest_states { + WDTEST_INJECT_NONE, + WDTEST_INJECT_DELAY, + WDTEST_INJECT_POSITIVE, + WDTEST_INJECT_NEGATIVE, + WDTEST_INJECT_PERCPU = 0x100, +}; -static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; -module_param(holdoff, int, 0444); -MODULE_PARM_DESC(holdoff, "Time to wait to start test (s)."); +static enum wdtest_states wdtest_state; +static unsigned long wdtest_test_count; +static ktime_t wdtest_last_ts, wdtest_offset; -/* Watchdog kthread's task_struct pointer for debug purposes. 
*/ -static struct task_struct *wdtest_task; +#define SHIFT_4000PPM 8 -static u64 wdtest_jiffies_read(struct clocksource *cs) +static ktime_t wdtest_get_offset(struct clocksource *cs) { - return (u64)jiffies; -} - -static struct clocksource clocksource_wdtest_jiffies = { - .name = "wdtest-jiffies", - .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = TICK_NSEC, - .read = wdtest_jiffies_read, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_MUST_VERIFY, - .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, - .max_cycles = 10, -}; + if (wdtest_state < WDTEST_INJECT_PERCPU) + return wdtest_test_count & 0x1 ? 0 : wdtest_offset >> SHIFT_4000PPM; -static int wdtest_ktime_read_ndelays; -static bool wdtest_ktime_read_fuzz; + /* Only affect the readout of the "remote" CPU */ + return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC; +} static u64 wdtest_ktime_read(struct clocksource *cs) { - int wkrn = READ_ONCE(wdtest_ktime_read_ndelays); - static int sign = 1; - u64 ret; + ktime_t now = ktime_get_raw_fast_ns(); + ktime_t intv = now - wdtest_last_ts; - if (wkrn) { - udelay(cs->uncertainty_margin / 250); - WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1); - } - ret = ktime_get_real_fast_ns(); - if (READ_ONCE(wdtest_ktime_read_fuzz)) { - sign = -sign; - ret = ret + sign * 100 * NSEC_PER_MSEC; + /* + * Only increment the test counter once per watchdog interval and + * store the interval for the offset calculation of this step. This + * guarantees a consistent behaviour even if the other side needs + * to repeat due to a watchdog read timeout. 
+ */ + if (intv > (NSEC_PER_SEC / 4)) { + WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1); + wdtest_last_ts = now; + wdtest_offset = intv; } - return ret; -} -static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs) -{ - pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name); + switch (wdtest_state & ~WDTEST_INJECT_PERCPU) { + case WDTEST_INJECT_POSITIVE: + return now + wdtest_get_offset(cs); + case WDTEST_INJECT_NEGATIVE: + return now - wdtest_get_offset(cs); + case WDTEST_INJECT_DELAY: + udelay(500); + return now; + default: + return now; + } } -#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ - CLOCK_SOURCE_VALID_FOR_HRES | \ - CLOCK_SOURCE_MUST_VERIFY | \ - CLOCK_SOURCE_VERIFY_PERCPU) +#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ + CLOCK_SOURCE_CALIBRATED | \ + CLOCK_SOURCE_MUST_VERIFY | \ + CLOCK_SOURCE_WDTEST) static struct clocksource clocksource_wdtest_ktime = { .name = "wdtest-ktime", - .rating = 300, + .rating = 10, .read = wdtest_ktime_read, .mask = CLOCKSOURCE_MASK(64), .flags = KTIME_FLAGS, - .mark_unstable = wdtest_ktime_cs_mark_unstable, .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list), }; -/* Reset the clocksource if needed. 
*/ -static void wdtest_ktime_clocksource_reset(void) +static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu) +{ + clocksource_unregister(&clocksource_wdtest_ktime); + + pr_info("Test: State %d percpu %d\n", which, percpu); + + wdtest_state = which; + if (percpu) + wdtest_state |= WDTEST_INJECT_PERCPU; + wdtest_test_count = 0; + wdtest_last_ts = 0; + + clocksource_wdtest_ktime.rating = 10; + clocksource_wdtest_ktime.flags = KTIME_FLAGS; + if (percpu) + clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU; + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); +} + +static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect, + unsigned long calls) { - if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) { - clocksource_unregister(&clocksource_wdtest_ktime); - clocksource_wdtest_ktime.flags = KTIME_FLAGS; - schedule_timeout_uninterruptible(HZ / 10); - clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + wdtest_clocksource_reset(which, percpu); + + for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) { + unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags); + + if (kthread_should_stop()) + return false; + + if (flags & CLOCK_SOURCE_UNSTABLE) { + if (expect & CLOCK_SOURCE_UNSTABLE) + return true; + pr_warn("Fail: Unexpected unstable\n"); + return false; + } + if (flags & CLOCK_SOURCE_VALID_FOR_HRES) { + if (expect & CLOCK_SOURCE_VALID_FOR_HRES) + return true; + pr_warn("Fail: Unexpected valid for highres\n"); + return false; + } } + + if (!expect) + return true; + + pr_warn("Fail: Timed out\n"); + return false; } -/* Run the specified series of watchdog tests. 
*/ -static int wdtest_func(void *arg) +static bool wdtest_run(bool percpu) { - unsigned long j1, j2; - int i, max_retries; - char *s; + if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8)) + return false; - schedule_timeout_uninterruptible(holdoff * HZ); + if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4)) + return false; - /* - * Verify that jiffies-like clocksources get the manually - * specified uncertainty margin. - */ - pr_info("--- Verify jiffies-like uncertainty margin.\n"); - __clocksource_register(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC); + if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - schedule_timeout_uninterruptible(HZ); - j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(j1 == j2); + if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - clocksource_unregister(&clocksource_wdtest_jiffies); + return true; +} - /* - * Verify that tsc-like clocksources are assigned a reasonable - * uncertainty margin. - */ - pr_info("--- Verify tsc-like uncertainty margin.\n"); +static int wdtest_func(void *arg) +{ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); - WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC); - - j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - udelay(1); - j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), - "Expected at least 1000ns, got %lu.\n", j2 - j1); - - /* Verify tsc-like stability with various numbers of errors injected. 
*/ - max_retries = clocksource_get_max_watchdog_retry(); - for (i = 0; i <= max_retries + 1; i++) { - if (i <= 1 && i < max_retries) - s = ""; - else if (i <= max_retries) - s = ", expect message"; - else - s = ", expect clock skew"; - pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s); - WRITE_ONCE(wdtest_ktime_read_ndelays, i); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); - WARN_ON_ONCE((i <= max_retries) != - !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - wdtest_ktime_clocksource_reset(); + if (wdtest_run(false)) { + if (wdtest_run(true)) + pr_info("Success: All tests passed\n"); } - - /* Verify tsc-like stability with clock-value-fuzz error injection. */ - pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n"); - WRITE_ONCE(wdtest_ktime_read_fuzz, true); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - clocksource_verify_percpu(&clocksource_wdtest_ktime); - WRITE_ONCE(wdtest_ktime_read_fuzz, false); - clocksource_unregister(&clocksource_wdtest_ktime); - pr_info("--- Done with test.\n"); - return 0; -} + if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG)) + return 0; -static void wdtest_print_module_parms(void) -{ - pr_alert("--- holdoff=%d\n", holdoff); + while (!kthread_should_stop()) + schedule_timeout_interruptible(3600 * HZ); + return 0; } -/* Cleanup function. */ -static void clocksource_wdtest_cleanup(void) -{ -} +static struct task_struct *wdtest_thread; static int __init clocksource_wdtest_init(void) { - int ret = 0; - - wdtest_print_module_parms(); + struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest"); - /* Create watchdog-test task. 
*/ - wdtest_task = kthread_run(wdtest_func, NULL, "wdtest"); - if (IS_ERR(wdtest_task)) { - ret = PTR_ERR(wdtest_task); - pr_warn("%s: Failed to create wdtest kthread.\n", __func__); - wdtest_task = NULL; - return ret; + if (IS_ERR(t)) { + pr_warn("Failed to create wdtest kthread.\n"); + return PTR_ERR(t); } - + wdtest_thread = t; return 0; } - module_init(clocksource_wdtest_init); + +static void clocksource_wdtest_cleanup(void) +{ + if (wdtest_thread) + kthread_stop(wdtest_thread); +} module_exit(clocksource_wdtest_cleanup); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index df7194961658..baee13a1f87f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -7,15 +7,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/device.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/device.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ -#include <linux/tick.h> #include <linux/kthread.h> +#include <linux/module.h> #include <linux/prandom.h> -#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/topology.h> #include "tick-internal.h" #include "timekeeping_internal.h" @@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) - -/* - * Threshold: 0.0312s, when doubled: 0.0625s. - */ -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) - -/* - * Maximum permissible delay between two readouts of the watchdog - * clocksource surrounding a read of the clocksource being validated. - * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as - * a lower bound for cs->uncertainty_margin values when registering clocks. 
- * - * The default of 500 parts per million is based on NTP's limits. - * If a clocksource is good enough for NTP, it is good enough for us! - * - * In other words, by default, even if a clocksource is extremely - * precise (for example, with a sub-nanosecond period), the maximum - * permissible skew between the clocksource watchdog and the clocksource - * under test is not permitted to go below the 500ppm minimum defined - * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the - * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. - */ -#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#else -#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) -#endif - -/* - * Default for maximum permissible skew when cs->uncertainty_margin is - * not specified, and the lower bound even when cs->uncertainty_margin - * is specified. This is also the default that is used when registering - * clocks with unspecified cs->uncertainty_margin, so this macro is used - * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. - */ -#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); static void clocksource_select(void); @@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static int64_t watchdog_max_interval; + +/* Watchdog interval: 0.5sec. */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ)) + +/* Maximum time between two reference watchdog readouts */ +#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC) + +/* + * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems + * the timeout is calculated from the numa distance. 
+ */ +#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC) + +/* + * Remote timeout NUMA distance multiplier. The local distance is 10. The + * default remote distance is 20. ACPI tables provide more accurate numbers + * which are guaranteed to be greater than the local distance. + * + * This results in a 5us base value, which is equivalent to the above !NUMA + * default. + */ +#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE)) + +/* Limit the NUMA timeout in case the distance values are insanely big */ +#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC)) + +/* Shift values to calculate the approximate $N ppm of a given delta. */ +#define SHIFT_500PPM 11 +#define SHIFT_4000PPM 8 + +/* Number of attempts to read the watchdog */ +#define WATCHDOG_FREQ_RETRIES 3 + +/* Five reads local and remote for inter CPU skew detection */ +#define WATCHDOG_REMOTE_MAX_SEQ 10 static inline void clocksource_watchdog_lock(unsigned long *flags) { @@ -241,204 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static int verify_n_cpus = 8; -module_param(verify_n_cpus, int, 0644); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; -enum wd_read_status { - WD_READ_SUCCESS, - WD_READ_UNSTABLE, - WD_READ_SKIP + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +enum wd_result { + WD_SUCCESS, + WD_FREQ_NO_WATCHDOG, + WD_FREQ_TIMEOUT, + WD_FREQ_RESET, + WD_FREQ_SKEWED, + WD_CPU_TIMEOUT, + WD_CPU_SKEWED, +}; + +struct watchdog_cpu_data { + /* Keep first as it is 32 byte aligned */ + call_single_data_t csd; + atomic_t remote_inprogress; + enum wd_result result; + u64 cpu_ts[2]; + struct clocksource *cs; + /* Ensure that the sequence is in a separate cache line */ + atomic_t seq ____cacheline_aligned; + /* Set by the control CPU according to NUMA distance */ + u64 timeout_ns; }; -static enum 
wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) -{ - int64_t md = watchdog->uncertainty_margin; - unsigned int nretries, max_retries; - int64_t wd_delay, wd_seq_delay; - u64 wd_end, wd_end2; - - max_retries = clocksource_get_max_watchdog_retry(); - for (nretries = 0; nretries <= max_retries; nretries++) { - local_irq_disable(); - *wdnow = watchdog->read(watchdog); - *csnow = cs->read(cs); - wd_end = watchdog->read(watchdog); - wd_end2 = watchdog->read(watchdog); - local_irq_enable(); - - wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); - if (wd_delay <= md + cs->uncertainty_margin) { - if (nretries > 1 && nretries >= max_retries) { - pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", - smp_processor_id(), watchdog->name, nretries); +struct watchdog_data { + raw_spinlock_t lock; + enum wd_result result; + + u64 wd_seq; + u64 wd_delta; + u64 cs_delta; + u64 cpu_ts[2]; + + unsigned int curr_cpu; +} ____cacheline_aligned_in_smp; + +static void watchdog_check_skew_remote(void *unused); + +static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = { + .csd = CSD_INIT(watchdog_check_skew_remote, NULL), +}; + +static struct watchdog_data watchdog_data = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock), +}; + +static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result) +{ + guard(raw_spinlock)(&watchdog_data.lock); + if (!wd->result) { + atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ); + WRITE_ONCE(wd->result, result); + } +} + +/* Wait for the sequence number to hand over control. */ +static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq) +{ + for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) { + /* Bail if the other side set an error result */ + if (READ_ONCE(wd->result) != WD_SUCCESS) + return false; + + /* Prevent endless loops if the other CPU does not react. 
*/ + if (cnt == 5000) { + u64 nsecs = ktime_get_raw_fast_ns(); + + if (nsecs - start >=wd->timeout_ns) { + watchdog_set_result(wd, WD_CPU_TIMEOUT); + return false; } - return WD_READ_SUCCESS; + cnt = 0; } + cpu_relax(); + } + return seq < WATCHDOG_REMOTE_MAX_SEQ; +} - /* - * Now compute delay in consecutive watchdog read to see if - * there is too much external interferences that cause - * significant delay in reading both clocksource and watchdog. - * - * If consecutive WD read-back delay > md, report - * system busy, reinit the watchdog and skip the current - * watchdog test. - */ - wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); - if (wd_seq_delay > md) - goto skip_test; +static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index) +{ + u64 prev, now, delta, start = ktime_get_raw_fast_ns(); + int local = index, remote = (index + 1) & 0x1; + struct clocksource *cs = wd->cs; + + /* Set the local timestamp so that the first iteration works correctly */ + wd->cpu_ts[local] = cs->read(cs); + + /* Signal arrival */ + atomic_inc(&wd->seq); + + for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) { + if (!watchdog_wait_seq(wd, start, seq)) + return; + + /* Capture local timestamp before possible non-local coherency overhead */ + now = cs->read(cs); + + /* Store local timestamp before reading remote to limit coherency stalls */ + wd->cpu_ts[local] = now; + + prev = wd->cpu_ts[remote]; + delta = (now - prev) & cs->mask; + + if (delta > cs->max_raw_delta) { + watchdog_set_result(wd, WD_CPU_SKEWED); + return; + } + + /* Hand over to the remote CPU */ + atomic_inc(&wd->seq); } +} - pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. 
limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", - smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); - return WD_READ_UNSTABLE; +static void watchdog_check_skew_remote(void *unused) +{ + struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data); -skip_test: - pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", - smp_processor_id(), watchdog->name, wd_seq_delay); - pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", - cs->name, wd_delay); - return WD_READ_SKIP; + atomic_inc(&wd->remote_inprogress); + watchdog_check_skew(wd, 1); + atomic_dec(&wd->remote_inprogress); } -static u64 csnow_mid; -static cpumask_t cpus_ahead; -static cpumask_t cpus_behind; -static cpumask_t cpus_chosen; +static inline bool wd_csd_locked(struct watchdog_cpu_data *wd) +{ + return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK; +} + +/* + * This is only invoked for remote CPUs. See watchdog_check_cpu_skew(). + */ +static inline u64 wd_get_remote_timeout(unsigned int remote_cpu) +{ + unsigned int n1, n2; + u64 ns; + + if (nr_node_ids == 1) + return WATCHDOG_DEFAULT_TIMEOUT_NS; + + n1 = cpu_to_node(smp_processor_id()); + n2 = cpu_to_node(remote_cpu); + ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2); + return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS); +} -static void clocksource_verify_choose_cpus(void) +static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu) { - int cpu, i, n = verify_n_cpus; + struct watchdog_cpu_data *wd; - if (n < 0 || n >= num_online_cpus()) { - /* Check all of the CPUs. */ - cpumask_copy(&cpus_chosen, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + wd = per_cpu_ptr(&watchdog_cpu_data, cpu); + if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) { + watchdog_data.result = WD_CPU_TIMEOUT; return; } - /* If no checking desired, or no other CPU to check, leave. 
*/ - cpumask_clear(&cpus_chosen); - if (n == 0 || num_online_cpus() <= 1) + atomic_set(&wd->seq, 0); + wd->result = WD_SUCCESS; + wd->cs = cs; + /* Store the current CPU ID for the watchdog test unit */ + cs->wd_cpu = smp_processor_id(); + + wd->timeout_ns = wd_get_remote_timeout(cpu); + + /* Kick the remote CPU into the watchdog function */ + if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) { + watchdog_data.result = WD_CPU_TIMEOUT; + return; + } + + scoped_guard(irq) + watchdog_check_skew(wd, 0); + + scoped_guard(raw_spinlock_irq, &watchdog_data.lock) { + watchdog_data.result = wd->result; + memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts)); + } +} + +static void watchdog_check_cpu_skew(struct clocksource *cs) +{ + unsigned int cpu = watchdog_data.curr_cpu; + + cpu = cpumask_next_wrap(cpu, cpu_online_mask); + watchdog_data.curr_cpu = cpu; + + /* Skip the current CPU. Handles num_online_cpus() == 1 as well */ + if (cpu == smp_processor_id()) return; - /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); - if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + /* Don't interfere with the test mechanics */ + if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)) return; - cpumask_set_cpu(cpu, &cpus_chosen); - /* Force a sane value for the boot parameter. */ - if (n > nr_cpu_ids) - n = nr_cpu_ids; + __watchdog_check_cpu_skew(cs, cpu); +} + +static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending) +{ + unsigned int ppm_shift = SHIFT_4000PPM; + u64 wd_ts0, wd_ts1, cs_ts; + + watchdog_data.result = WD_SUCCESS; + if (!watchdog) { + watchdog_data.result = WD_FREQ_NO_WATCHDOG; + return false; + } + + if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU) + return true; /* - * Randomly select the specified number of CPUs. If the same - * CPU is selected multiple times, that CPU is checked only once, - * and no replacement CPU is selected. 
This gracefully handles - * situations where verify_n_cpus is greater than the number of - * CPUs that are currently online. + * If both the clocksource and the watchdog claim they are + * calibrated use 500ppm limit. Uncalibrated clocksources need a + * larger allowance because the firmware supplied frequencies can be + * way off. */ - for (i = 1; i < n; i++) { - cpu = cpumask_random(cpu_online_mask); - if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) - cpumask_set_cpu(cpu, &cpus_chosen); + if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED) + ppm_shift = SHIFT_500PPM; + + for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) { + s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta; + + scoped_guard(irq) { + wd_ts0 = watchdog->read(watchdog); + cs_ts = cs->read(cs); + wd_ts1 = watchdog->read(watchdog); + } + + wd_last = cs->wd_last; + cs_last = cs->cs_last; + + /* Validate the watchdog readout window */ + wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1); + if (wd_seq > WATCHDOG_READOUT_MAX_NS) { + /* Store for printout in case all retries fail */ + watchdog_data.wd_seq = wd_seq; + continue; + } + + /* Store for subsequent processing */ + cs->wd_last = wd_ts0; + cs->cs_last = cs_ts; + + /* First round or reset pending? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending) + goto reset; + + /* Calculate the nanosecond deltas from the last invocation */ + wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0); + cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts); + + watchdog_data.wd_delta = wd_delta; + watchdog_data.cs_delta = cs_delta; + + /* + * Ensure that the deltas are within the readout limits of + * the clocksource and the watchdog. Long delays can cause + * clocksources to overflow.
+ */ + max_delta = max(wd_delta, cs_delta); + if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns) + goto reset; + + /* + * Calculate and validate the skew against the allowed PPM + * value of the maximum delta plus the watchdog readout + * time. + */ + if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq) + return true; + + watchdog_data.result = WD_FREQ_SKEWED; + return false; } - /* Don't verify ourselves. */ - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + watchdog_data.result = WD_FREQ_TIMEOUT; + return false; + +reset: + cs->flags |= CLOCK_SOURCE_WATCHDOG; + watchdog_data.result = WD_FREQ_RESET; + return false; } -static void clocksource_verify_one_cpu(void *csin) +/* Synchronization for sched clock */ +static void clocksource_tick_stable(struct clocksource *cs) { - struct clocksource *cs = (struct clocksource *)csin; - - csnow_mid = cs->read(cs); + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); } -void clocksource_verify_percpu(struct clocksource *cs) +/* Conditionally enable high resolution mode */ +static void clocksource_enable_highres(struct clocksource *cs) { - int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; - u64 csnow_begin, csnow_end; - int cpu, testcpu; - s64 delta; + if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) || + !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) || + !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + return; + + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - if (verify_n_cpus == 0) + /* + * Can't schedule work before finished_booting is + * true. clocksource_done_booting will take care of it.
+ */ + if (!finished_booting) return; - cpumask_clear(&cpus_ahead); - cpumask_clear(&cpus_behind); - cpus_read_lock(); - migrate_disable(); - clocksource_verify_choose_cpus(); - if (cpumask_empty(&cpus_chosen)) { - migrate_enable(); - cpus_read_unlock(); - pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + + if (cs->flags & CLOCK_SOURCE_WDTEST) return; + + /* + * If this is not the current clocksource let the watchdog thread + * reselect it. Due to the change to high res this clocksource + * might be preferred now. If it is the current clocksource let the + * tick code know about that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); } - testcpu = smp_processor_id(); - pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", - cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); - preempt_disable(); - for_each_cpu(cpu, &cpus_chosen) { - if (cpu == testcpu) - continue; - csnow_begin = cs->read(cs); - smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); - csnow_end = cs->read(cs); - delta = (s64)((csnow_mid - csnow_begin) & cs->mask); - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_behind); - delta = (csnow_end - csnow_mid) & cs->mask; - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_ahead); - cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); - if (cs_nsec > cs_nsec_max) - cs_nsec_max = cs_nsec; - if (cs_nsec < cs_nsec_min) - cs_nsec_min = cs_nsec; +} + +static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2); + +static void watchdog_print_freq_timeout(struct clocksource *cs) +{ + if (!__ratelimit(&ratelimit_state)) + return; + pr_info("Watchdog %s read timed out. 
Readout sequence took: %lluns\n", + watchdog->name, watchdog_data.wd_seq); +} + +static void watchdog_print_freq_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name); + pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta); + pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta); +} + +static void watchdog_handle_remote_timeout(struct clocksource *cs) +{ + pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu); +} + +static void watchdog_print_remote_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name); + if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(), + watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]); + } else { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu, + watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]); } - preempt_enable(); - migrate_enable(); - cpus_read_unlock(); - if (!cpumask_empty(&cpus_ahead)) - pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_ahead), testcpu, cs->name); - if (!cpumask_empty(&cpus_behind)) - pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_behind), testcpu, cs->name); - pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); -} -EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +} -static inline void clocksource_reset_watchdog(void) +static void watchdog_check_result(struct clocksource *cs) { - struct clocksource *cs; + switch (watchdog_data.result) { + case WD_SUCCESS: + clocksource_tick_stable(cs); + clocksource_enable_highres(cs); + return; - list_for_each_entry(cs, &watchdog_list, wd_list) + case WD_FREQ_TIMEOUT: + watchdog_print_freq_timeout(cs); + 
/* Try again later and invalidate the reference timestamps. */ cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} + return; + case WD_FREQ_NO_WATCHDOG: + case WD_FREQ_RESET: + /* + * Nothing to do when the reference timestamps were reset + * or no watchdog clocksource registered. + */ + return; + + case WD_FREQ_SKEWED: + watchdog_print_freq_skew(cs); + break; + + case WD_CPU_TIMEOUT: + /* Remote check timed out. Try again next cycle. */ + watchdog_handle_remote_timeout(cs); + return; + + case WD_CPU_SKEWED: + watchdog_print_remote_skew(cs); + break; + } + __clocksource_unstable(cs); +} static void clocksource_watchdog(struct timer_list *unused) { - int64_t wd_nsec, cs_nsec, interval; - u64 csnow, wdnow, cslast, wdlast; - int next_cpu, reset_pending; struct clocksource *cs; - enum wd_read_status read_ret; - unsigned long extra_wait = 0; - u32 md; + bool reset_pending; - spin_lock(&watchdog_lock); + guard(spinlock)(&watchdog_lock); if (!watchdog_running) - goto out; + return; reset_pending = atomic_read(&watchdog_reset_pending); list_for_each_entry(cs, &watchdog_list, wd_list) { - /* Clocksource already marked unstable? */ if (cs->flags & CLOCK_SOURCE_UNSTABLE) { if (finished_booting) @@ -446,170 +659,40 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - - if (read_ret == WD_READ_UNSTABLE) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); - continue; - } - - /* - * When WD_READ_SKIP is returned, it means the system is likely - * under very heavy load, where the latency of reading - * watchdog/clocksource is very big, and affect the accuracy of - * watchdog check. So give system some space and suspend the - * watchdog check for 5 minutes. - */ - if (read_ret == WD_READ_SKIP) { - /* - * As the watchdog timer will be suspended, and - * cs->last could keep unchanged for 5 minutes, reset - * the counters. 
- */ - clocksource_reset_watchdog(); - extra_wait = HZ * 300; - break; - } - - /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || - atomic_read(&watchdog_reset_pending)) { - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = wdnow; - cs->cs_last = csnow; - continue; + /* Compare against watchdog clocksource if available */ + if (watchdog_check_freq(cs, reset_pending)) { + /* Check for inter CPU skew */ + watchdog_check_cpu_skew(cs); } - wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); - cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); - wdlast = cs->wd_last; /* save these in case we print them */ - cslast = cs->cs_last; - cs->cs_last = csnow; - cs->wd_last = wdnow; - - if (atomic_read(&watchdog_reset_pending)) - continue; - - /* - * The processing of timer softirqs can get delayed (usually - * on account of ksoftirqd not getting to run in a timely - * manner), which causes the watchdog interval to stretch. - * Skew detection may fail for longer watchdog intervals - * on account of fixed margins being used. - * Some clocksources, e.g. acpi_pm, cannot tolerate - * watchdog intervals longer than a few seconds. - */ - interval = max(cs_nsec, wd_nsec); - if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { - if (system_state > SYSTEM_SCHEDULING && - interval > 2 * watchdog_max_interval) { - watchdog_max_interval = interval; - pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", - cs_nsec, wd_nsec); - } - watchdog_timer.expires = jiffies; - continue; - } - - /* Check the deviation from the watchdog clocksource. 
*/ - md = cs->uncertainty_margin + watchdog->uncertainty_margin; - if (abs(cs_nsec - wd_nsec) > md) { - s64 cs_wd_msec; - s64 wd_msec; - u32 wd_rem; - - pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", - smp_processor_id(), cs->name); - pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", - watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", - cs->name, cs_nsec, csnow, cslast, cs->mask); - cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); - wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); - pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", - cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); - if (curr_clocksource == cs) - pr_warn(" '%s' is current clocksource.\n", cs->name); - else if (curr_clocksource) - pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); - else - pr_warn(" No current clocksource.\n"); - __clocksource_unstable(cs); - continue; - } - - if (cs == curr_clocksource && cs->tick_stable) - cs->tick_stable(cs); - - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - /* Mark it valid for high-res. */ - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - - /* - * clocksource_done_booting() will sort it if - * finished_booting is not set yet. - */ - if (!finished_booting) - continue; - - /* - * If this is not the current clocksource let - * the watchdog thread reselect it. Due to the - * change to high res this clocksource might - * be preferred now. If it is the current - * clocksource let the tick code know about - * that change. 
- */ - if (cs != curr_clocksource) { - cs->flags |= CLOCK_SOURCE_RESELECT; - schedule_work(&watchdog_work); - } else { - tick_clock_notify(); - } - } + watchdog_check_result(cs); } - /* - * We only clear the watchdog_reset_pending, when we did a - * full cycle through all clocksources. - */ + /* Clear after the full clocksource walk */ if (reset_pending) atomic_dec(&watchdog_reset_pending); - /* - * Cycle through CPUs to check if the CPUs stay synchronized - * to each other. - */ - next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); - - /* - * Arm timer if not already pending: could race with concurrent - * pair clocksource_stop_watchdog() clocksource_start_watchdog(). - */ + /* Could have been rearmed by a stop/start cycle */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; - add_timer_on(&watchdog_timer, next_cpu); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_local(&watchdog_timer); } -out: - spin_unlock(&watchdog_lock); } static inline void clocksource_start_watchdog(void) { - if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + if (watchdog_running || list_empty(&watchdog_list)) return; - timer_setup(&watchdog_timer, clocksource_watchdog, 0); + timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + + add_timer_on(&watchdog_timer, get_boot_cpu_id()); watchdog_running = 1; } static inline void clocksource_stop_watchdog(void) { - if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + if (!watchdog_running || !list_empty(&watchdog_list)) return; timer_delete(&watchdog_timer); watchdog_running = 0; @@ -651,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback) if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) continue; + /* + * If it's not continuous, don't put the fox in charge of + * the henhouse. 
+ */ + if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + continue; + /* Skip current if we were requested for a fallback. */ if (fallback && cs == old_wd) continue; @@ -690,12 +780,6 @@ static int __clocksource_watchdog_kthread(void) unsigned long flags; int select = 0; - /* Do any required per-CPU skew verification. */ - if (curr_clocksource && - curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && - curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) - clocksource_verify_percpu(curr_clocksource); - spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { @@ -1016,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; return cs; } return NULL; @@ -1040,6 +1126,8 @@ static void __clocksource_select(bool skipcur) continue; if (strcmp(cs->name, override_name) != 0) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; /* * Check to make sure we don't switch to a non-highres * capable clocksource if the tick code is in oneshot @@ -1169,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); - } - /* - * If the uncertainty margin is not specified, calculate it. If - * both scale and freq are non-zero, calculate the clock period, but - * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. - * However, if either of scale or freq is zero, be very conservative - * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value - * for the uncertainty margin. Allow stupidly small uncertainty - * margins to be specified by the caller for testing purposes, - * but warn to discourage production use of this capability. 
- * - * Bottom line: The sum of the uncertainty margins of the - * watchdog clocksource and the clocksource under test will be at - * least 500ppm by default. For more information, please see the - * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. - */ - if (scale && freq && !cs->uncertainty_margin) { - cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); - if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) - cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; - } else if (!cs->uncertainty_margin) { - cs->uncertainty_margin = WATCHDOG_THRESHOLD; + /* Update cs::freq_khz */ + cs->freq_khz = div_u64((u64)freq * scale, 1000); } - WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); /* * Ensure clocksources that have large 'mult' values don't overflow @@ -1241,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) cs->id = CSID_GENERIC; + + if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT; + if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 860af7a58428..5bd6efe598f0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -50,6 +50,28 @@ #include "tick-internal.h" /* + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) + * + * The callback state is kept separate in the CPU base because having it in + * the timer would require touching the timer after the callback, which + * makes it impossible to free the timer from the callback function. + * + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status.
It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE false +#define HRTIMER_STATE_ENQUEUED true + +/* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an * idea of the (in)accuracy of timers. Timer values are rounded up to @@ -77,43 +99,22 @@ static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ + +#define BASE_INIT(idx, cid) \ + [idx] = { .index = idx, .clockid = cid } + DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .clock_base = - { - { - .index = HRTIMER_BASE_MONOTONIC, - .clockid = CLOCK_MONOTONIC, - }, - { - .index = HRTIMER_BASE_REALTIME, - .clockid = CLOCK_REALTIME, - }, - { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - }, - { - .index = HRTIMER_BASE_TAI, - .clockid = CLOCK_TAI, - }, - { - .index = HRTIMER_BASE_MONOTONIC_SOFT, - .clockid = CLOCK_MONOTONIC, - }, - { - .index = HRTIMER_BASE_REALTIME_SOFT, - .clockid = CLOCK_REALTIME, - }, - { - .index = HRTIMER_BASE_BOOTTIME_SOFT, - .clockid = CLOCK_BOOTTIME, - }, - { - .index = HRTIMER_BASE_TAI_SOFT, - .clockid = CLOCK_TAI, - }, + .clock_base = { + BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC), + BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME), + BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME), + BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI), + BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC), + BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME), + BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME), 
+ BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI), }, .csd = CSD_INIT(retrigger_next_event, NULL) }; @@ -126,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) return likely(base->online); } +#ifdef CONFIG_HIGH_RES_TIMERS +DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); + +static void hrtimer_hres_workfn(struct work_struct *work) +{ + static_branch_enable(&hrtimer_highres_enabled_key); +} + +static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); + +static inline void hrtimer_schedule_hres_work(void) +{ + if (!hrtimer_highres_enabled()) + schedule_work(&hrtimer_hres_work); +} +#else +static inline void hrtimer_schedule_hres_work(void) { } +#endif + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP - /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { - .cpu_base = &migration_cpu_base, - .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, - &migration_cpu_base.lock), - }, }, + .clock_base = { + [0] = { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, + }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -159,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = { * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. 
*/ -static -struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) __acquires(&timer->base->lock) { - struct hrtimer_clock_base *base; - for (;;) { - base = READ_ONCE(timer->base); + struct hrtimer_clock_base *base = READ_ONCE(timer->base); + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -220,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ return expires >= new_base->cpu_base->expires_next; } -static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned) { if (!hrtimer_base_is_online(base)) { int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); @@ -248,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base * * the timer callback is currently running. */ static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, - int pinned) +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; @@ -262,13 +280,12 @@ again: if (base != new_base) { /* - * We are trying to move timer to new_base. - * However we can't change timer's base while it is running, - * so we keep it on the same CPU. No hassle vs. reprogramming - * the event source in the high resolution case. The softirq - * code will take care of this when the timer function has - * completed. There is no conflict as we hold the lock until - * the timer is enqueued. + * We are trying to move timer to new_base. 
However we can't + * change timer's base while it is running, so we keep it on + * the same CPU. No hassle vs. reprogramming the event source + * in the high resolution case. The remote CPU will take care + * of this when the timer function has completed. There is no + * conflict as we hold the lock until the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; @@ -278,8 +295,7 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, - this_cpu_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -298,14 +314,13 @@ again: #else /* CONFIG_SMP */ -static inline struct hrtimer_clock_base * -lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - return base; } @@ -340,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); -#endif /* BITS_PER_LONG >= 64 */ +#endif /* BITS_PER_LONG < 64 */ /* * Add two ktime values and do a safety check for overflow: @@ -422,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } +/* Stub timer callback for improperly used timers. 
*/ +static enum hrtimer_restart stub_timer(struct hrtimer *unused) +{ + WARN_ON_ONCE(1); + return HRTIMER_NORESTART; +} + +/* + * hrtimer_fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0); + return true; + default: + return false; + } +} + static const struct debug_obj_descr hrtimer_debug_descr = { - .name = "hrtimer", - .debug_hint = hrtimer_debug_hint, - .fixup_init = hrtimer_fixup_init, - .fixup_activate = hrtimer_fixup_activate, - .fixup_free = hrtimer_fixup_free, + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, + .fixup_assert_init = hrtimer_fixup_assert_init, }; static inline void debug_hrtimer_init(struct hrtimer *timer) @@ -440,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) debug_object_init_on_stack(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } @@ -451,6 +490,11 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } +static inline void debug_hrtimer_assert_init(struct hrtimer *timer) +{ + debug_object_assert_init(timer, &hrtimer_debug_descr); +} + void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -461,9 +505,9 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } -static 
inline void debug_hrtimer_activate(struct hrtimer *timer, - enum hrtimer_mode mode) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { } #endif static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) @@ -479,80 +523,80 @@ static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) { debug_hrtimer_activate(timer, mode); - trace_hrtimer_start(timer, mode); + trace_hrtimer_start(timer, mode, was_armed); } -static inline void debug_deactivate(struct hrtimer *timer) -{ - debug_hrtimer_deactivate(timer); - trace_hrtimer_cancel(timer); -} +#define for_each_active_base(base, cpu_base, active) \ + for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \ + for (bool done = false; !done; active &= ~(1U << idx)) \ + for (base = &cpu_base->clock_base[idx]; !done; done = true) -static struct hrtimer_clock_base * -__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) +#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node) + +#if defined(CONFIG_NO_HZ_COMMON) +/* + * Same as hrtimer_bases_next_event() below, but skips the excluded timer and + * does not update cpu_base->next_timer/expires. 
+ */ +static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, + unsigned int active, ktime_t expires_next) { - unsigned int idx; + struct hrtimer_clock_base *base; + ktime_t expires; - if (!*active) - return NULL; + lockdep_assert_held(&cpu_base->lock); - idx = __ffs(*active); - *active &= ~(1U << idx); + for_each_active_base(base, cpu_base, active) { + expires = ktime_sub(base->expires_next, base->offset); + if (expires >= expires_next) + continue; + + /* + * If the excluded timer is the first on this base evaluate the + * next timer. + */ + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); - return &cpu_base->clock_base[idx]; + if (unlikely(&exclude->node == node)) { + node = timerqueue_linked_next(node); + if (!node) + continue; + expires = ktime_sub(node->expires, base->offset); + if (expires >= expires_next) + continue; + } + expires_next = expires; + } + /* If base->offset changed, the result might be negative */ + return max(expires_next, 0); } +#endif -#define for_each_active_base(base, cpu_base, active) \ - while ((base = __next_base((cpu_base), &(active)))) +static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); -static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, - const struct hrtimer *exclude, - unsigned int active, - ktime_t expires_next) + return hrtimer_from_timerqueue_node(next); +} + +/* Find the base with the earliest expiry */ +static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active, + ktime_t *expires_next, struct hrtimer **next_timer) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; - struct hrtimer *timer; - - next = timerqueue_getnext(&base->active); - timer = container_of(next, struct hrtimer, node); 
- if (timer == exclude) { - /* Get to the next timer in the queue. */ - next = timerqueue_iterate_next(next); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - } - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires < expires_next) { - expires_next = expires; - - /* Skip cpu_base update if a timer is being excluded. */ - if (exclude) - continue; - - if (timer->is_soft) - cpu_base->softirq_next_timer = timer; - else - cpu_base->next_timer = timer; + expires = ktime_sub(base->expires_next, base->offset); + if (expires < *expires_next) { + *expires_next = expires; + *next_timer = clock_base_next_timer(base); } } - /* - * clock_was_set() might have changed base->offset of any of - * the clock bases so the result might be negative. Fix it up - * to prevent a false positive in clockevents_program_event(). - */ - if (expires_next < 0) - expires_next = 0; - return expires_next; } /* @@ -575,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. 
*/ -static ktime_t -__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { - unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; + unsigned int active; + + lockdep_assert_held(&cpu_base->lock); if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; - cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, NULL, - active, KTIME_MAX); - - next_timer = cpu_base->softirq_next_timer; + if (active) + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); + cpu_base->softirq_next_timer = next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + if (active) + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, - expires_next); } - - return expires_next; + return max(expires_next, 0); } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) @@ -638,8 +680,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_boot, offs_tai); + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, + offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; @@ -649,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) } /* - * Is the high resolution mode active ? + * Is the high resolution mode active in the CPU base. 
This cannot use the + * static key as the CPUs are switched to high resolution mode + * asynchronously. */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { @@ -657,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) cpu_base->hres_active : 0; } -static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *next_timer, +static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred) +{ + trace_hrtimer_rearm(expires_next, deferred); + tick_program_event(expires_next, 1); +} + +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; @@ -683,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(expires_next, 1); + hrtimer_rearm_event(expires_next, false); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +/* Reprogram the event source with a evaluation of all clock bases */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal) { - ktime_t expires_next; - - expires_next = hrtimer_update_next_event(cpu_base); + ktime_t expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -707,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS -/* - * High resolution timer enabled ? - */ +/* High resolution timer enabled ? 
*/ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); -/* - * Enable / Disable high resolution mode - */ +/* Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } - __setup("highres=", setup_hrtimer_hres); -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) +/* hrtimer_high_res_enabled - query, if the highres mode is enabled */ +static inline bool hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } -/* - * Switch to high resolution mode - */ +/* Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - pr_warn("Could not switch to high resolution mode on CPU %u\n", - base->cpu); + pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } - base->hres_active = 1; + base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); + hrtimer_schedule_hres_work(); } #else -static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline bool hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level @@ -792,13 +826,12 @@ static void retrigger_next_event(void *arg) * In periodic low resolution mode, the next softirq expiration * must also be updated. 
*/ - raw_spin_lock(&base->lock); + guard(raw_spinlock)(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) - hrtimer_force_reprogram(base, 0); + hrtimer_force_reprogram(base, /* skip_equal */ false); else hrtimer_update_next_event(base); - raw_spin_unlock(&base->lock); } /* @@ -812,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + ktime_t expires = hrtimer_get_expires(timer); - WARN_ON_ONCE(hrtimer_get_expires(timer) < 0); + WARN_ON_ONCE(expires < 0); + expires = ktime_sub(expires, base->offset); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. @@ -842,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; - if (!ktime_before(expires, timer_cpu_base->expires_next) || - !reprogram) + if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } @@ -857,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (expires >= cpu_base->expires_next) return; - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. 
- */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; @@ -869,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) __hrtimer_reprogram(cpu_base, timer, expires); } -static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, - unsigned int active) +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; @@ -896,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, if (seq == cpu_base->clock_was_set_seq) return false; - /* - * If the remote CPU is currently handling an hrtimer interrupt, it - * will reevaluate the first expiring timer of all clock bases - * before reprogramming. Nothing to do here. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending the remote CPU will take care of it */ + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } /* * Walk the affected clock bases and check whether the first expiring @@ -913,9 +940,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; + struct timerqueue_linked_node *next; - next = timerqueue_getnext(&base->active); + next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; @@ -947,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; - int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -960,23 +985,19 @@ 
void clock_was_set(unsigned int bases) } /* Avoid interrupting CPUs if possible */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - unsigned long flags; - - cpu_base = &per_cpu(hrtimer_bases, cpu); - raw_spin_lock_irqsave(&cpu_base->lock, flags); + scoped_guard(cpus_read_lock) { + int cpu; - if (update_needs_ipi(cpu_base, bases)) - cpumask_set_cpu(cpu, mask); + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + guard(raw_spinlock_irqsave)(&cpu_base->lock); + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + } + scoped_guard(preempt) + smp_call_function_many(mask, retrigger_next_event, NULL, 1); } - - preempt_disable(); - smp_call_function_many(mask, retrigger_next_event, NULL, 1); - preempt_enable(); - cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: @@ -1011,11 +1032,8 @@ void hrtimers_resume_local(void) retrigger_next_event(NULL); } -/* - * Counterpart to lock_hrtimer_base above: - */ -static inline -void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +/* Counterpart to lock_hrtimer_base above */ +static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); @@ -1032,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * .. note:: * This only updates the timer expiry value and does not requeue the timer. * - * There is also a variant of the function hrtimer_forward_now(). + * There is also a variant of this function: hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. 
If called * from other contexts @timer must neither be enqueued nor running the @@ -1042,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { - u64 orun = 1; ktime_t delta; + u64 orun = 1; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) @@ -1079,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the - * red black tree is O(log(n)). Must hold the base lock. + * red black tree is O(log(n)). * * Returns true when the new timer is the leftmost timer in the tree. */ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - enum hrtimer_mode mode) + enum hrtimer_mode mode, bool was_armed) { - debug_activate(timer, mode); + lockdep_assert_held(&base->cpu_base->lock); + + debug_activate(timer, mode, was_armed); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); + + if (!timerqueue_linked_add(&base->active, &timer->node)) + return false; + + base->expires_next = hrtimer_get_expires(timer); + return true; +} - return timerqueue_add(&base->active, &timer->node); +static inline void base_update_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); + + base->expires_next = next ? next->expires : KTIME_MAX; } /* * __remove_hrtimer - internal function to remove a timer * - * Caller must hold the base lock. 
- * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ -static void __remove_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - u8 newstate, int reprogram) +static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; + bool was_first; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) + lockdep_assert_held(&cpu_base->lock); + + if (!timer->is_queued) return; - if (!timerqueue_del(&base->active, &timer->node)) + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, newstate); + + was_first = !timerqueue_linked_prev(&timer->node); + + if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); + /* Nothing to update if this was not the first timer in the base */ + if (!was_first) + return; + + base_update_next_timer(base); + /* - * Note: If reprogram is false we do not update - * cpu_base->next_timer. This happens when we remove the first - * timer on a remote cpu. No harm as we never dereference - * cpu_base->next_timer. So the worst thing what can happen is - * an superfluous call to hrtimer_force_reprogram() on the - * remote cpu later on if the same timer gets enqueued again. + * If reprogram is false don't update cpu_base->next_timer and do not + * touch the clock event device. + * + * This happens when removing the first timer on a remote CPU, which + * will be handled by the remote CPU's interrupt. It also happens when + * a local timer is removed to be immediately restarted. That's handled + * at the call site. 
*/ - if (reprogram && timer == cpu_base->next_timer) - hrtimer_force_reprogram(cpu_base, 1); + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) + return; + + if (cpu_base->deferred_rearm) + cpu_base->deferred_needs_update = true; + else + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } -/* - * remove hrtimer, called with base lock held - */ -static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - bool restart, bool keep_local) +static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool newstate) { - u8 state = timer->state; + lockdep_assert_held(&base->cpu_base->lock); - if (state & HRTIMER_STATE_ENQUEUED) { + if (timer->is_queued) { bool reprogram; + debug_hrtimer_deactivate(timer); + /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current @@ -1154,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * If the timer is not restarted then reprogramming is - * required if the timer is local. If it is local and about - * to be restarted, avoid programming it twice (on removal - * and a moment later when it's requeued). - */ - if (!restart) - state = HRTIMER_STATE_INACTIVE; - else - reprogram &= !keep_local; + __remove_hrtimer(timer, base, newstate, reprogram); + return true; + } + return false; +} + +/* + * Update in place has to retrieve the expiry times of the neighbour nodes + * if they exist. That is cache line neutral because the dequeue/enqueue + * operation is going to need the same cache lines. But there is a big win + * when the dequeue/enqueue can be avoided because the RB tree does not + * have to be rebalanced twice. 
+ */ +static inline bool +hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires) +{ + struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node); + struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node); + + /* If the new expiry goes behind the next timer, requeue is required */ + if (next && expires > next->expires) + return false; + + /* If this is the first timer, update in place */ + if (!prev) + return true; + + /* Update in place when it does not go ahead of the previous one */ + return expires >= prev->expires; +} + +static inline bool +remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) +{ + bool was_first = false; + + /* Remove it from the timer queue if active */ + if (timer->is_queued) { + was_first = !timerqueue_linked_prev(&timer->node); + + /* Try to update in place to avoid the de/enqueue dance */ + if (hrtimer_can_update_in_place(timer, base, expires)) { + hrtimer_set_expires_range_ns(timer, expires, delta_ns); + trace_hrtimer_start(timer, mode, true); + if (was_first) + base->expires_next = expires; + return was_first; + } - __remove_hrtimer(timer, base, state, reprogram); - return 1; + debug_hrtimer_deactivate(timer); + timerqueue_linked_del(&base->active, &timer->node); } - return 0; + + /* Set the new expiry time */ + hrtimer_set_expires_range_ns(timer, expires, delta_ns); + + debug_activate(timer, mode, timer->is_queued); + base->cpu_base->active_bases |= 1 << base->index; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); + + /* If it's the first expiring timer now or again, update base */ + if (timerqueue_linked_add(&base->active, &timer->node)) { + base->expires_next = expires; + return true; + } + + if (was_first) + base_update_next_timer(base); + + return false; } static inline ktime_t 
hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, @@ -1190,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -static void -hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) +static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { - ktime_t expires; - - /* - * Find the next SOFT expiration. - */ - expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* - * reprogramming needs to be triggered, even if the next soft - * hrtimer expires at the same time than the next hard + * Reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time as the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* - * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() - * cpu_base->*expires_next is only set by hrtimer_reprogram() + * cpu_base->next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } -static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode, - struct hrtimer_clock_base *base) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) +{ + if (static_branch_likely(&timers_migration_enabled)) { + /* + * If it is local and the first expiring timer keep it on the local + * CPU to optimize reprogramming of the clockevent device. Also + * avoid switch_hrtimer_base() overhead when local and pinned. 
+ */ + if (!is_local) + return false; + if (is_first || is_pinned) + return true; + + /* Honour the NOHZ full restrictions */ + if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE)) + return false; + + /* + * If the tick is not stopped or need_resched() is set, then + * there is no point in moving the timer somewhere else. + */ + return !tick_nohz_tick_stopped() || need_resched(); + } + return is_local; +} +#else +static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) +{ + return is_local; +} +#endif + +static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first, + bool is_pinned) +{ + /* If the timer is running the callback it has to stay on its CPU base. */ + if (unlikely(timer->base->running == timer)) + return true; + + return hrtimer_prefer_local(is_local, is_first, is_pinned); +} + +static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *new_base; - bool force_local, first; + bool is_pinned, first, was_first, keep_base = false; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; - /* - * If the timer is on the local cpu base and is the first expiring - * timer then this might end up reprogramming the hardware twice - * (on removal and on enqueue). To avoid that by prevent the - * reprogram on removal, keep the timer local to the current CPU - * and enforce reprogramming after it is queued no matter whether - * it is the new first expiring timer again or not. - */ - force_local = base->cpu_base == this_cpu_base; - force_local &= base->cpu_base->next_timer == timer; + was_first = cpu_base->next_timer == timer; + is_pinned = !!(mode & HRTIMER_MODE_PINNED); /* - * Don't force local queuing if this enqueue happens on a unplugged - * CPU after hrtimer_cpu_dying() has been invoked. 
+ * Don't keep it local if this enqueue happens on a unplugged CPU + * after hrtimer_cpu_dying() has been invoked. */ - force_local &= this_cpu_base->online; + if (likely(this_cpu_base->online)) { + bool is_local = cpu_base == this_cpu_base; + + keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned); + } + + /* Calculate absolute expiry time for relative timers */ + if (mode & HRTIMER_MODE_REL) + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); + /* Compensate for low resolution granularity */ + tim = hrtimer_update_lowres(timer, tim, mode); /* * Remove an active timer from the queue. In case it is not queued @@ -1250,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). + * + * @keep_base is also true if the timer callback is running on a + * remote CPU and for local pinned timers. 
*/ - remove_hrtimer(timer, base, true, force_local); + if (likely(keep_base)) { + first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns); + } else { + /* Keep the ENQUEUED state in case it is queued */ + bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED); - if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); + hrtimer_set_expires_range_ns(timer, tim, delta_ns); - tim = hrtimer_update_lowres(timer, tim, mode); + /* Switch the timer base, if necessary: */ + base = switch_hrtimer_base(timer, base, is_pinned); + cpu_base = base->cpu_base; - hrtimer_set_expires_range_ns(timer, tim, delta_ns); + first = enqueue_hrtimer(timer, base, mode, was_armed); + } - /* Switch the timer base, if necessary: */ - if (!force_local) { - new_base = switch_hrtimer_base(timer, base, - mode & HRTIMER_MODE_PINNED); - } else { - new_base = base; + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; + return false; } - first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) { + if (!was_first || cpu_base != this_cpu_base) { /* - * If the current CPU base is online, then the timer is - * never queued on a remote CPU if it would be the first - * expiring timer there. + * If the current CPU base is online, then the timer is never + * queued on a remote CPU if it would be the first expiring + * timer there unless the timer callback is currently executed + * on the remote CPU. In the latter case the remote CPU will + * re-evaluate the first expiring timer after completing the + * callbacks. */ - if (hrtimer_base_is_online(this_cpu_base)) + if (likely(hrtimer_base_is_online(this_cpu_base))) return first; /* @@ -1283,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * already offline. If the timer is the first to expire, * kick the remote CPU to reprogram the clock event. 
*/ - if (first) { - struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + if (first) + smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); + return false; + } - smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); - } - return 0; + /* + * Special case for the HRTICK timer. It is frequently rearmed and most + * of the time moves the expiry into the future. That's expensive in + * virtual machines and it's better to take the pointless already armed + * interrupt than reprogramming the hardware on every context switch. + * + * If the new expiry is before the armed time, then reprogramming is + * required. + */ + if (timer->is_lazy) { + if (cpu_base->expires_next <= hrtimer_get_expires(timer)) + return false; } /* - * Timer was forced to stay on the current CPU to avoid - * reprogramming on removal and enqueue. Force reprogram the - * hardware by evaluating the new first expiring timer. + * Timer was the first expiring timer and forced to stay on the + * current CPU to avoid reprogramming on removal and enqueue. Force + * reprogram the hardware by evaluating the new first expiring + * timer. */ - hrtimer_force_reprogram(new_base->cpu_base, 1); - return 0; + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); + return false; } /** @@ -1309,12 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; + debug_hrtimer_assert_init(timer); + /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. 
With PREEMPT_RT check the hard @@ -1362,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); - if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false, false); + if (!hrtimer_callback_running(timer)) { + ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE); + if (ret) + trace_hrtimer_cancel(timer); + } unlock_hrtimer_base(timer, &flags); @@ -1397,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ -static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, - unsigned long flags) +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1463,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer) spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else -static inline void -hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } -static inline void -hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } -static inline void -hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, - unsigned long flags) { } +static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { } #endif /** @@ -1526,15 +1685,11 @@ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; - unsigned long flags; - 
- raw_spin_lock_irqsave(&cpu_base->lock, flags); + guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - return expires; } @@ -1549,26 +1704,20 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_base->lock, flags); - - if (hrtimer_hres_active(cpu_base)) { - unsigned int active; + unsigned int active; - if (!cpu_base->softirq_activated) { - active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; - expires = __hrtimer_next_event_base(cpu_base, exclude, - active, KTIME_MAX); - } - active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; - expires = __hrtimer_next_event_base(cpu_base, exclude, active, - expires); - } + guard(raw_spinlock_irqsave)(&cpu_base->lock); + if (!hrtimer_hres_active(cpu_base)) + return expires; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + if (active && !cpu_base->softirq_activated) + expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX); - return expires; + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + if (!active) + return expires; + return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires); } #endif @@ -1612,8 +1761,7 @@ ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), +static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); @@ -1645,13 +1793,14 @@ static void __hrtimer_setup(struct hrtimer *timer, base += 
hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); + timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); + timerqueue_linked_init(&timer->node); - if (WARN_ON_ONCE(!function)) + if (WARN_ON_ONCE(!fn)) ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - ACCESS_PRIVATE(timer, function) = function; + ACCESS_PRIVATE(timer, function) = fn; } /** @@ -1710,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer) base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - base->running == timer) + if (timer->is_queued || base->running == timer) return true; - } while (read_seqcount_retry(&base->seq, seq) || - base != READ_ONCE(timer->base)); + } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } @@ -1729,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * - * On the read side we ensure we observe timer->state and cpu_base->running + * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. @@ -1738,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. 
*/ - -static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now, - unsigned long flags) __must_hold(&cpu_base->lock) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t now, unsigned long flags) + __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; @@ -1754,15 +1899,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, base->running = timer; /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false); fn = ACCESS_PRIVATE(timer, function); /* @@ -1797,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ - if (restart != HRTIMER_NORESTART && - !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); + if (restart == HRTIMER_RESTART && !timer->is_queued) + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. 
*/ raw_write_seqcount_barrier(&base->seq); @@ -1814,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, base->running = NULL; } +static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); + + return next ? hrtimer_from_timerqueue_node(next) : NULL; +} + static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { - struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; + struct hrtimer_clock_base *base; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *node; - ktime_t basenow; - - basenow = ktime_add(now, base->offset); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); + ktime_t basenow = ktime_add(now, base->offset); + struct hrtimer *timer; + while ((timer = clock_base_next_timer(base))) { /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the @@ -1846,7 +1991,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, if (basenow < hrtimer_get_softexpires(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow, flags); + __run_hrtimer(cpu_base, base, timer, basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } @@ -1865,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void) now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); - cpu_base->softirq_activated = 0; + cpu_base->softirq_activated = false; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1875,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void) #ifdef CONFIG_HIGH_RES_TIMERS /* + * Very similar 
to hrtimer_force_reprogram(), except it deals with + * deferred_rearm and hang_detected. + */ +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) +{ + cpu_base->expires_next = expires_next; + cpu_base->deferred_rearm = false; + + if (unlikely(cpu_base->hang_detected)) { + /* + * Give the system a chance to do something else than looping + * on hrtimer interrupts. + */ + expires_next = ktime_add_ns(ktime_get(), + min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time)); + } + hrtimer_rearm_event(expires_next, deferred); +} + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +void __hrtimer_rearm_deferred(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next; + + if (!cpu_base->deferred_rearm) + return; + + guard(raw_spinlock)(&cpu_base->lock); + if (cpu_base->deferred_needs_update) { + hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + } else { + /* No timer added/removed. Use the cached value */ + expires_next = cpu_base->deferred_expires_next; + } + hrtimer_rearm(cpu_base, expires_next, true); +} + +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) +{ + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ + cpu_base->deferred_needs_update = false; + /* Cache the expiry time */ + cpu_base->deferred_expires_next = expires_next; + set_thread_flag(TIF_HRTIMER_REARM); +} +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) +{ + hrtimer_rearm(cpu_base, expires_next, false); +} +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +/* * High resolution timer interrupt * Called with interrupts disabled */ @@ -1888,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev) BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; 
raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = 1; + cpu_base->deferred_rearm = true; /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. + * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue + * timers while __hrtimer_run_queues() is expiring the clock bases. + * Timers which are re/enqueued on the local CPU are not affected by + * this. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); - /* Reevaluate the clock bases for the [soft] next expiry */ - expires_next = hrtimer_update_next_event(cpu_base); - /* - * Store the new expiry value so the migration code can verify - * against it. - */ - cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = 0; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - - /* Reprogramming necessary ? */ - if (!tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; - return; - } - /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * - * We need to prevent that we loop forever in the hrtimer - * interrupt routine. We give it 3 attempts to avoid - * overreacting on some spurious event. - * - * Acquire base lock for updating the offsets and retrieving - * the current time. + * We need to prevent that we loop forever in the hrtimer interrupt + * routine. We give it 3 attempts to avoid overreacting on some + * spurious event.
*/ - raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - cpu_base->nr_retries++; - if (++retries < 3) - goto retry; - /* - * Give the system a chance to do something else than looping - * here. We stored the entry time, so we know exactly how long - * we spent here. We schedule the next event this amount of - * time away. - */ - cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + expires_next = hrtimer_update_next_event(cpu_base); + cpu_base->hang_detected = false; + if (expires_next < now) { + if (++retries < 3) + goto retry; + + delta = ktime_sub(now, entry_time); + cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta); + cpu_base->nr_hangs++; + cpu_base->hang_detected = true; + } - delta = ktime_sub(now, entry_time); - if ((unsigned int)delta > cpu_base->max_hang_time) - cpu_base->max_hang_time = (unsigned int) delta; - /* - * Limit it to a sensible value as we enforce a longer - * delay. Give the CPU at least 100ms to catch up. 
- */ - if (delta > 100 * NSEC_PER_MSEC) - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); - else - expires_next = ktime_add(now, delta); - tick_program_event(expires_next, 1); - pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); + hrtimer_interrupt_rearm(cpu_base, expires_next); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } + #endif /* !CONFIG_HIGH_RES_TIMERS */ /* @@ -1999,7 +2170,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -2012,8 +2183,7 @@ void hrtimer_run_queues(void) */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { - struct hrtimer_sleeper *t = - container_of(timer, struct hrtimer_sleeper, timer); + struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; @@ -2031,8 +2201,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ -void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, - enum hrtimer_mode mode) +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. 
If the sleeper @@ -2048,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2085,8 +2254,8 @@ static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, * @clock_id: the clock to be used * @mode: timer mode abs/rel */ -void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) { debug_setup_on_stack(&sl->timer, clock_id, mode); __hrtimer_setup_sleeper(sl, clock_id, mode); @@ -2159,12 +2328,11 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; - int ret = 0; + int ret; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); @@ -2203,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -2212,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; @@ -2225,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -2236,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_linked_init_head(&clock_b->active); } cpu_base->cpu = cpu; @@ -2257,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu) /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; - cpu_base->hres_active = 0; - cpu_base->hang_detected = 0; + cpu_base->hres_active = false; + cpu_base->hang_detected = false; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; 
cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->online = 1; + cpu_base->softirq_activated = false; + cpu_base->online = true; return 0; } @@ -2272,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { + struct timerqueue_linked_node *node; struct hrtimer *timer; - struct timerqueue_node *node; - while ((node = timerqueue_getnext(&old_base->active))) { - timer = container_of(node, struct hrtimer, node); + while ((node = timerqueue_linked_first(&old_base->active))) { + timer = hrtimer_from_timerqueue_node(node); BUG_ON(hrtimer_callback_running(timer)); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -2295,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. 
*/ - enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { - int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); @@ -2314,16 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); - } + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); - old_base->online = 0; + old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 9daf8c5d9687..1c954f330dfe 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs) static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = jiffies_read, .mask = CLOCKSOURCE_MASK(32), .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 652744e00eb4..4bca3f78c8ea 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -18,8 +18,9 @@ #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> +#include <linux/cleanup.h> -#include <vdso/datapage.h> +#include "namespace_internal.h" ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct 
timens_offsets *ns_offsets) @@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!ns->vvar_page) + err = timens_vdso_alloc_vvar_page(ns); + if (err) goto fail_free; err = ns_common_init(ns); @@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, return ns; fail_free_page: - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); fail_free: kfree(ns); fail_dec: @@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags, return clone_time_ns(user_ns, old_ns); } -static struct timens_offset offset_from_ts(struct timespec64 off) -{ - struct timens_offset ret; - - ret.sec = off.tv_sec; - ret.nsec = off.tv_nsec; - - return ret; -} - -/* - * A time namespace VVAR page has the same layout as the VVAR page which - * contains the system wide VDSO data. - * - * For a normal task the VVAR pages are installed in the normal ordering: - * VVAR - * PVCLOCK - * HVCLOCK - * TIMENS <- Not really required - * - * Now for a timens task the pages are installed in the following order: - * TIMENS - * PVCLOCK - * HVCLOCK - * VVAR - * - * The check for vdso_clock->clock_mode is in the unlikely path of - * the seq begin magic. So for the non-timens case most of the time - * 'seq' is even, so the branch is not taken. - * - * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the - * update to finish and for 'seq' to become even anyway. - * - * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which - * enforces the time namespace handling path. 
- */ -static void timens_setup_vdso_clock_data(struct vdso_clock *vc, - struct time_namespace *ns) -{ - struct timens_offset *offset = vc->offset; - struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); - struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - - vc->seq = 1; - vc->clock_mode = VDSO_CLOCKMODE_TIMENS; - offset[CLOCK_MONOTONIC] = monotonic; - offset[CLOCK_MONOTONIC_RAW] = monotonic; - offset[CLOCK_MONOTONIC_COARSE] = monotonic; - offset[CLOCK_BOOTTIME] = boottime; - offset[CLOCK_BOOTTIME_ALARM] = boottime; -} - -struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - -/* - * Protects possibly multiple offsets writers racing each other - * and tasks entering the namespace. - */ -static DEFINE_MUTEX(offset_lock); - -static void timens_set_vvar_page(struct task_struct *task, - struct time_namespace *ns) -{ - struct vdso_time_data *vdata; - struct vdso_clock *vc; - unsigned int i; - - if (ns == &init_time_ns) - return; - - /* Fast-path, taken by every task in namespace except the first. */ - if (likely(ns->frozen_offsets)) - return; - - mutex_lock(&offset_lock); - /* Nothing to-do: vvar_page has been already initialized. 
*/ - if (ns->frozen_offsets) - goto out; - - ns->frozen_offsets = true; - vdata = page_address(ns->vvar_page); - vc = vdata->clock_data; - - for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_clock_data(&vc[i], ns); - - if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { - for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) - timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); - } - -out: - mutex_unlock(&offset_lock); -} +DEFINE_MUTEX(timens_offset_lock); void free_time_ns(struct time_namespace *ns) { @@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns) dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? &ns->ns : NULL; + ns = nsproxy->time_ns; + get_time_ns(ns); + return &ns->ns; } static struct ns_common *timens_for_children_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns_for_children; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? 
&ns->ns : NULL; + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + return &ns->ns; } static void timens_put(struct ns_common *ns) @@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -void timens_commit(struct task_struct *tsk, struct time_namespace *ns) -{ - timens_set_vvar_page(tsk, ns); - vdso_join_timens(tsk, ns); -} - static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); - ns = timens_for_children_get(p); if (!ns) return; + time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); - put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); struct timespec64 tp; - int i, err; + int i; - ns = timens_for_children_get(p); if (!ns) return -ESRCH; + time_ns = to_time_ns(ns); - if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { - put_time_ns(time_ns); + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) return -EPERM; - } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, ktime_get_boottime_ts64(&tp); break; default: - err = -EINVAL; - goto out; + return -EINVAL; } - err = -ERANGE; - if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < 
-KTIME_SEC_MAX) - goto out; + return -ERANGE; tp = timespec64_add(tp, off->val); /* @@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, * still unreachable. */ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) - goto out; + return -ERANGE; } - mutex_lock(&offset_lock); - if (time_ns->frozen_offsets) { - err = -EACCES; - goto out_unlock; - } + guard(mutex)(&timens_offset_lock); + if (time_ns->frozen_offsets) + return -EACCES; - err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, *offset = off->val; } -out_unlock: - mutex_unlock(&offset_lock); -out: - put_time_ns(time_ns); - - return err; + return 0; } const struct proc_ns_operations timens_operations = { diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h new file mode 100644 index 000000000000..b37ba179f43b --- /dev/null +++ b/kernel/time/namespace_internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TIME_NAMESPACE_INTERNAL_H +#define _TIME_NAMESPACE_INTERNAL_H + +#include <linux/mutex.h> + +struct time_namespace; + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. 
+ */ +extern struct mutex timens_offset_lock; + +#ifdef CONFIG_TIME_NS_VDSO +int timens_vdso_alloc_vvar_page(struct time_namespace *ns); +void timens_vdso_free_vvar_page(struct time_namespace *ns); +#else /* !CONFIG_TIME_NS_VDSO */ +static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + return 0; +} +static inline void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ +} +#endif /* CONFIG_TIME_NS_VDSO */ + +#endif /* _TIME_NAMESPACE_INTERNAL_H */ diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c new file mode 100644 index 000000000000..0d74d160eec9 --- /dev/null +++ b/kernel/time/namespace_vdso.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/cleanup.h> +#include <linux/mm.h> +#include <linux/time_namespace.h> +#include <linux/time.h> +#include <linux/vdso_datastore.h> + +#include <vdso/clocksource.h> +#include <vdso/datapage.h> + +#include "namespace_internal.h" + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_clock->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_clock->clock_mode is a non-issue. 
The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. + */ +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) +{ + struct timens_offset *offset = vc->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_time_data *vdata; + struct vdso_clock *vc; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + guard(mutex)(&timens_offset_lock); + /* Nothing to-do: vvar_page has been already initialized. 
*/ + if (ns->frozen_offsets) + return; + + ns->frozen_offsets = true; + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_clock_data(&vc[i], ns); + + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } +} + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + guard(mmap_read_lock)(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma(vma); + } + return 0; +} + +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + +int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!ns->vvar_page) + return -ENOMEM; + + return 0; +} + +void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ + __free_page(ns->vvar_page); +} diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 413e2389f0a5..9331e1614124 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1092,7 +1092,7 @@ void exit_itimers(struct task_struct *tsk) } /* - * There should be no timers on the ignored list. itimer_delete() has + * There should be no timers on the ignored list. posix_timer_delete() has * mopped them up. 
*/ if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a88b72b0f35e..51f6a1032c83 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = { .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | - CLOCK_EVT_FEAT_KTIME | CLOCK_EVT_FEAT_HRTIMER, .rating = 0, .bound_on = -1, diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f63c65881364..115e0bf01276 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc) + if (bc) { + bc->next_event_forced = 0; tick_setup_periodic(bc, 1); + } } /* @@ -106,6 +108,7 @@ static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) { + wd->next_event_forced = 0; /* * If we woke up early and the tick was reprogrammed in the * meantime then this may be spurious but harmless. 
@@ -403,6 +406,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); + tick_broadcast_device.evtdev->next_event_forced = 0; /* Handle spurious interrupts gracefully */ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { @@ -696,6 +700,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; + tick_broadcast_device.evtdev->next_event_forced = 0; next_event = KTIME_MAX; cpumask_clear(tmpmask); now = ktime_get(); @@ -1063,6 +1068,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event_forced = 0; bc->next_event = KTIME_MAX; /* @@ -1175,6 +1181,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* This moves the broadcast assignment to this CPU: */ + bc->next_event_forced = 0; clockevents_program_event(bc, bc->next_event, 1); } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d305d8521896..6a9198a4279b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev) int cpu = smp_processor_id(); ktime_t next = dev->next_event; + dev->next_event_forced = 0; tick_periodic(cpu); /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f7907fadd63f..cbbb87a0c6e7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -345,7 +345,7 @@ static bool check_tick_dependency(atomic_t *dep) int val = atomic_read(dep); if (likely(!tracepoint_enabled(tick_stop))) - return !val; + return !!val; if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); @@ -864,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +/* Simplified 
variant of hrtimer_forward_now() */ +static ktime_t tick_forward_now(ktime_t expires, ktime_t now) +{ + ktime_t delta = now - expires; + + if (likely(delta < TICK_NSEC)) + return expires + TICK_NSEC; + + expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC); + if (expires > now) + return expires; + return expires + TICK_NSEC; +} + static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + ktime_t expires = ts->last_tick; - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + if (now >= expires) + expires = tick_forward_now(expires, now); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + hrtimer_set_expires(&ts->sched_timer, expires); + tick_program_event(expires, 1); } /* @@ -1513,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); diff --git a/kernel/time/time.c b/kernel/time/time.c index 0d832317d576..771cef87ad3b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) + if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c07e562ee4c1..c493a4010305 100644 --- a/kernel/time/timekeeping.c +++ 
b/kernel/time/timekeeping.c @@ -3,34 +3,30 @@ * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ -#include <linux/timekeeper_internal.h> -#include <linux/module.h> -#include <linux/interrupt.h> +#include <linux/audit.h> +#include <linux/clocksource.h> +#include <linux/compiler.h> +#include <linux/jiffies.h> #include <linux/kobject.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/mm.h> +#include <linux/module.h> #include <linux/nmi.h> -#include <linux/sched.h> -#include <linux/sched/loadavg.h> +#include <linux/pvclock_gtod.h> +#include <linux/random.h> #include <linux/sched/clock.h> +#include <linux/sched/loadavg.h> +#include <linux/static_key.h> +#include <linux/stop_machine.h> #include <linux/syscore_ops.h> -#include <linux/clocksource.h> -#include <linux/jiffies.h> +#include <linux/tick.h> #include <linux/time.h> #include <linux/timex.h> -#include <linux/tick.h> -#include <linux/stop_machine.h> -#include <linux/pvclock_gtod.h> -#include <linux/compiler.h> -#include <linux/audit.h> -#include <linux/random.h> +#include <linux/timekeeper_internal.h> #include <vdso/auxclock.h> #include "tick-internal.h" -#include "ntp_internal.h" #include "timekeeping_internal.h" +#include "ntp_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) @@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } +#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE +#include <asm/clock_inlined.h> + +static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); + /* * tk_clock_read - atomic clocksource read() helper * @@ -288,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). 
*/ -static inline u64 tk_clock_read(const struct tk_read_base *tkr) +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (static_branch_likely(&clocksource_read_inlined)) + return arch_inlined_clocksource_read(clock); + + return clock->read(clock); +} + +static inline void clocksource_disable_inline_read(void) +{ + static_branch_disable(&clocksource_read_inlined); +} + +static inline void clocksource_enable_inline_read(void) +{ + static_branch_enable(&clocksource_read_inlined); +} +#else +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } +static inline void clocksource_disable_inline_read(void) { } +static inline void clocksource_enable_inline_read(void) { } +#endif /** * tk_setup_internals - Set up internals to use clocksource clock. @@ -367,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; + + tk->cs_id = clock->id; + + /* Coupled clockevent data */ + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && + clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { + /* + * Aim for a one hour maximum delta and use KHz to handle + * clocksources with a frequency above 4GHz correctly as + * the frequency argument of clocks_calc_mult_shift() is u32. + */ + clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, + NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); + /* + * Initialize the conversion limit as the previous clocksource + * might have the same shift/mult pair so the quick check in + * tk_update_ns_to_cyc() fails to update it after a clocksource + * change leaving it effectively zero. + */ + tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult); + } } /* Timekeeper helper functions. 
*/ @@ -375,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; @@ -696,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) +{ + struct tk_read_base *tkrs = &tks->tkr_mono; + struct tk_read_base *tkrc = &tkc->tkr_mono; + unsigned int shift; + + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + return; + + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) + return; + /* + * The conversion math is simple: + * + * CS::MULT (1 << NS_TO_CYC_SHIFT) + * --------------- = ---------------------- + * (1 << CS:SHIFT) NS_TO_CYC_MULT + * + * Ergo: + * + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT + * + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() + */ + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; + tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); + tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); +} + /* * Restore the shadow timekeeper from the real timekeeper. 
*/ @@ -730,6 +805,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { + tk_update_ns_to_cyc(tk, &tkd->timekeeper); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); @@ -784,6 +860,71 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_update_coarse_nsecs(tk); } +/* + * ktime_expiry_to_cycles - Convert an expiry time to clocksource cycles + * @id: Clocksource ID which is required for validity + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted + * @cycles: Pointer to storage for corresponding absolute cycles value + * + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value + * based on the correlated clocksource of the clockevent device by using + * the base nanoseconds and cycles values of the last timekeeper update and + * converting the delta between @expires_ns and base nanoseconds to cycles. + * + * This only works for clockevent devices which are using a less than or + * equal comparator against the clocksource. + * + * Utilizing this avoids two clocksource reads for such devices, the + * ktime_get() in clockevents_program_event() to calculate the delta expiry + * value and the readout in the device::set_next_event() callback to + * convert the delta back to an absolute comparator value. + * + * Returns: True if @id matches the current clocksource ID, false otherwise + */ +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct tk_read_base *tkrm = &tk->tkr_mono; + ktime_t base_ns, delta_ns, max_ns; + u64 base_cycles, delta_cycles; + unsigned int seq; + u32 mult, shift; + + /* + * Racy check to avoid the seqcount overhead when ID does not match. 
If + * the relevant clocksource is installed concurrently, then this will + * just delay the switch over to this mechanism until the next event is + * programmed. If the ID is not matching the clock events code will use + * the regular relative set_next_event() callback as before. + */ + if (data_race(tk->cs_id) != id) + return false; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + if (tk->cs_id != id) + return false; + + base_cycles = tkrm->cycle_last; + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); + + mult = tk->cs_ns_to_cyc_mult; + shift = tk->cs_ns_to_cyc_shift; + max_ns = tk->cs_ns_to_cyc_maxns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + /* Prevent negative deltas and multiplication overflows */ + delta_ns = min(expires_ns - base_ns, max_ns); + delta_ns = max(delta_ns, 0); + + /* Convert to cycles */ + delta_cycles = ((u64)delta_ns * mult) >> shift; + *cycles = base_cycles + delta_cycles; + return true; +} + /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. 
* @ts: pointer to the timespec to be set @@ -848,7 +989,7 @@ u32 ktime_get_resolution_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); -static ktime_t *offsets[TK_OFFS_MAX] = { +static const ktime_t *const offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, @@ -857,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = { ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; + const ktime_t *offset = offsets[offs]; unsigned int seq; - ktime_t base, *offset = offsets[offs]; + ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -878,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t base, *offset = offsets[offs]; + const ktime_t *offset = offsets[offs]; unsigned int seq; + ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -902,7 +1045,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { - ktime_t *offset = offsets[offs]; + const ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; @@ -1631,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock) if (tk->tkr_mono.clock == clock) return 0; + + /* Disable inlined reads across the clocksource switch */ + clocksource_disable_inline_read(); + stop_machine(change_clocksource, clock, NULL); + + /* + * If the clocksource has been selected and supports inlined reads + * enable the branch. + */ + if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) + clocksource_enable_inline_read(); + tick_clock_notify(); return tk->tkr_mono.clock == clock ? 
0 : -1; } @@ -2834,7 +2989,7 @@ static void tk_aux_update_clocksource(void) continue; timekeeping_forward_now(tks); - tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); } } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 543beba096c7..198d0608db74 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_boot, ktime_t *offs_tai); +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles); + extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 7e1e3bde6b8b..04d928c21aba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2319,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) */ void timer_clear_idle(void) { + int this_cpu = smp_processor_id(); /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just @@ -2327,9 +2328,9 @@ void timer_clear_idle(void) * path. Required for BASE_LOCAL only. 
*/ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); - if (tick_nohz_full_cpu(smp_processor_id())) + if (tick_nohz_full_cpu(this_cpu)) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); - trace_timer_base_idle(false, smp_processor_id()); + trace_timer_base_idle(false, this_cpu); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 488e47e96e93..427d7ddea3af 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); - SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, ", S:%02x", timer->is_queued); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), @@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) +static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + struct timerqueue_linked_node *curr; struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct timerqueue_node *curr; unsigned long flags; next_one: @@ -72,13 +70,13 @@ next_one: raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = timerqueue_getnext(&base->active); + curr = timerqueue_linked_first(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = timerqueue_iterate_next(curr); + curr = timerqueue_linked_next(curr); i++; } @@ -103,8 +101,8 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " 
.resolution: %u nsecs\n", hrtimer_resolution); #ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Lu nsecs\n", - (unsigned long long) ktime_to_ns(base->offset)); + SEQ_printf(m, " .offset: %Ld nsecs\n", + (long long) base->offset); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now + ktime_to_ns(base->offset)); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 155eeaea4113..52c15affdbff 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -978,8 +978,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, /* Drop the lock to allow the remote CPU to exit idle */ raw_spin_unlock_irq(&tmc->lock); - if (cpu != smp_processor_id()) - timer_expire_remote(cpu); + /* + * This can't exclude the local CPU because jiffies might have advanced + * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the + * point where the jiffies snapshot @jif was taken in tmigr_handle_remote(). + */ + timer_expire_remote(cpu); /* * Lock ordering needs to be preserved - timer_base locks before tmigr @@ -1860,19 +1864,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, * child to the new parents. So tmigr_active_up() activates the * new parents while walking up from the old root to the new. * - * * It is ensured that @start is active, as this setup path is - * executed in hotplug prepare callback. This is executed by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. + * * It is ensured that @start is active, (or on the way to be activated + * by another CPU that woke up before the current one) as this setup path + * is executed in hotplug prepare callback. 
This is executed by an already + * connected and !idle CPU in the hierarchy. + * + * * The below RmW atomic operation ensures that: + * + * 1) If the old root has been completely activated, the latest state is + * acquired (the below implicit acquire pairs with the implicit release + * from cmpxchg() in tmigr_active_up()). + * + * 2) If the old root is still on the way to be activated, the lagging behind + * CPU performing the activation will acquire the links up to the new root. + * (The below implicit release pairs with the implicit acquire from cmpxchg() + * in tmigr_active_up()). + * + * 3) Every subsequent CPU below the old root will acquire the new links while + * walking through the old root (The below implicit release pairs with the + * implicit acquire from cmpxchg() in either tmigr_active_up() or + * tmigr_inactive_up()). */ - state.state = atomic_read(&start->migr_state); - WARN_ON_ONCE(!state.active); + state.state = atomic_fetch_or(0, &start->migr_state); WARN_ON_ONCE(!start->parent); - data.childmask = start->groupmask; - __walk_groups_from(tmigr_active_up, &data, start, start->parent); + /* + * If the state of the old root is inactive, another CPU is on its way to activate + * it and propagate to the new root. + */ + if (state.active) { + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } } /* Root update */ |
