From 5d51bee725cc1497352d6b0b604e42a90c680540 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2020 13:38:55 +0100 Subject: clocksource: Add common vdso clock mode storage All architectures which use the generic VDSO code have their own storage for the VDSO clock mode. That's pointless and just requires duplicate code. Provide generic storage for it. The new Kconfig symbol is intermediate and will be removed once all architectures are converted over. Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Reviewed-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/20200207124403.028046322@linutronix.de --- kernel/time/clocksource.c | 9 +++++++++ kernel/time/vsyscall.c | 10 ++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 428beb69426a..7cb09c4cf21c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -928,6 +928,15 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_arch_init(cs); +#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE + if (cs->vdso_clock_mode < 0 || + cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { + pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", + cs->name, cs->vdso_clock_mode); + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; + } +#endif + /* Initialize mult/shift and max_idle_ns */ __clocksource_update_freq_scale(cs, scale, freq); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 9577c89179cd..f9a5178c69bb 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -71,13 +71,19 @@ void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; + s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); - vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); - vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); +#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; +#else + clock_mode = __arch_get_clock_mode(tk); +#endif + vdata[CS_HRES_COARSE].clock_mode = clock_mode; + vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; -- cgit v1.2.3 From f86fd32db706613fe8d0104057efa6e83e0d7e8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2020 13:38:59 +0100 Subject: lib/vdso: Cleanup clock mode storage leftovers Now that all architectures are converted to use the generic storage the helpers and conditionals can be removed. Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Reviewed-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/20200207124403.470699892@linutronix.de --- kernel/time/vsyscall.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index f9a5178c69bb..d31a5ef4ade5 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -77,11 +77,7 @@ void update_vsyscall(struct timekeeper *tk) /* copy vsyscall data */ vdso_write_begin(vdata); -#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE clock_mode = tk->tkr_mono.clock->vdso_clock_mode; -#else - clock_mode = __arch_get_clock_mode(tk); -#endif vdata[CS_HRES_COARSE].clock_mode = clock_mode; vdata[CS_RAW].clock_mode = clock_mode; -- cgit v1.2.3 From c7a18100bdffdff440c7291db6e80863fab0461e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2020 13:39:00 +0100 Subject: lib/vdso: Avoid highres update if clocksource is not VDSO capable If the current clocksource is not VDSO capable there is no point in updating the high resolution parts of the VDSO data. Replace the architecture specific check with a check for a VDSO capable clocksource and skip the update if there is none. Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino Reviewed-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/20200207124403.563379423@linutronix.de --- kernel/time/vsyscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index d31a5ef4ade5..54ce6eb2ca36 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -105,10 +105,10 @@ void update_vsyscall(struct timekeeper *tk) WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* - * Architectures can opt out of updating the high resolution part - * of the VDSO. + * If the current clocksource is not VDSO capable, then spare the + * update of the high reolution parts. */ - if (__arch_update_vdso_data()) + if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); -- cgit v1.2.3 From 2d6b01bd88ccabba06d342ef80eaab6b39d12497 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2020 13:39:01 +0100 Subject: lib/vdso: Move VCLOCK_TIMENS to vdso_clock_modes Move the time namespace indicator clock mode to the other ones for consistency sake. Signed-off-by: Thomas Gleixner Reviewed-by: Vincenzo Frascino Link: https://lkml.kernel.org/r/20200207124403.656097274@linutronix.de --- kernel/time/namespace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12858507d75a..e6ba064ce773 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -172,8 +173,8 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces - * the time namespace handling path. + * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. */ static void timens_setup_vdso_data(struct vdso_data *vdata, struct time_namespace *ns) @@ -183,7 +184,7 @@ static void timens_setup_vdso_data(struct vdso_data *vdata, struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); vdata->seq = 1; - vdata->clock_mode = VCLOCK_TIMENS; + vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; -- cgit v1.2.3 From 6e317c32fd39a13e4854a27958d5e35d15d196be Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Sat, 18 Jan 2020 01:59:00 +0300 Subject: timer: Improve the comment describing schedule_timeout() When working commit 6dcd5d7a7a29c1e, a mistake was noticed by Linus: schedule_timeout() was called without setting the task state to anything particular. It calls the scheduler, but doesn't delay anything, because the task stays runnable. That happens because sched_submit_work() does nothing for tasks in TASK_RUNNING state. That turned out to be the intended behavior. Adding a WARN() is not useful as the task could be woken up right after setting the state and before reaching schedule_timeout(). Improve the comment about schedule_timeout() and describe that more explicitly. Signed-off-by: Alexander Popov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200117225900.16340-1-alex.popov@linux.com --- kernel/time/timer.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4820823515e9..cb34fac9d9f7 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1828,21 +1828,23 @@ static void process_timeout(struct timer_list *t) * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). + * Make the current task sleep until @timeout jiffies have elapsed. + * The function behavior depends on the current task state + * (see also set_current_state() description): * - * You can set the task state as follows - + * %TASK_RUNNING - the scheduler is called, but the task does not sleep + * at all. That happens because sched_submit_work() does nothing for + * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly - * woken up, (e.g. by wake_up_process())". + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * - * The current task state is guaranteed to be TASK_RUNNING when this + * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule @@ -1850,7 +1852,7 @@ static void process_timeout(struct timer_list *t) * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns 0 when the timer has expired otherwise the remaining time in - * jiffies will be returned. In all cases the return value is guaranteed + * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) -- cgit v1.2.3 From 5fb1c2a5bbf79ccca8d17cf97f66085be5808027 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Sun, 16 Feb 2020 13:13:30 +0530 Subject: posix-timers: Pass lockdep expression to RCU lists head is traversed using hlist_for_each_entry_rcu outside an RCU read-side critical section but under the protection of hash_lock. Hence, add corresponding lockdep expression to silence false-positive lockdep warnings, and harden RCU lists. [ tglx: Removed the macro and put the condition right where it's used ] Signed-off-by: Amol Grover Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200216074330.GA14025@workstation-portable --- kernel/time/posix-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ff0eb30de346..07709ac30439 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -121,7 +121,8 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, { struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash) { + hlist_for_each_entry_rcu(timer, head, t_hash, + lockdep_is_held(&hash_lock)) { if ((timer->it_signal == sig) && (timer->it_id == id)) return timer; } -- cgit v1.2.3 From 90c018942c2babab73814648b37808fc3bf2ed1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 11:37:38 -0800 Subject: timer: Use hlist_unhashed_lockless() in timer_pending() The timer_pending() function is mostly used in lockless contexts, so Without proper annotations, KCSAN might detect a data-race [1]. Using hlist_unhashed_lockless() instead of hand-coding it seems appropriate (as suggested by Paul E. McKenney). [1] BUG: KCSAN: data-race in del_timer / detach_if_pending write to 0xffff88808697d870 of 8 bytes by task 10 on cpu 0: __hlist_del include/linux/list.h:764 [inline] detach_timer kernel/time/timer.c:815 [inline] detach_if_pending+0xcd/0x2d0 kernel/time/timer.c:832 try_to_del_timer_sync+0x60/0xb0 kernel/time/timer.c:1226 del_timer_sync+0x6b/0xa0 kernel/time/timer.c:1365 schedule_timeout+0x2d2/0x6e0 kernel/time/timer.c:1896 rcu_gp_fqs_loop+0x37c/0x580 kernel/rcu/tree.c:1639 rcu_gp_kthread+0x143/0x230 kernel/rcu/tree.c:1799 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff88808697d870 of 8 bytes by task 12060 on cpu 1: del_timer+0x3b/0xb0 kernel/time/timer.c:1198 sk_stop_timer+0x25/0x60 net/core/sock.c:2845 inet_csk_clear_xmit_timers+0x69/0xa0 net/ipv4/inet_connection_sock.c:523 tcp_clear_xmit_timers include/net/tcp.h:606 [inline] tcp_v4_destroy_sock+0xa3/0x3f0 net/ipv4/tcp_ipv4.c:2096 inet_csk_destroy_sock+0xf4/0x250 net/ipv4/inet_connection_sock.c:836 tcp_close+0x6f3/0x970 net/ipv4/tcp.c:2497 inet_release+0x86/0x100 net/ipv4/af_inet.c:427 __sock_release+0x85/0x160 net/socket.c:590 sock_close+0x24/0x30 net/socket.c:1268 __fput+0x1e1/0x520 fs/file_table.c:280 ____fput+0x1f/0x30 fs/file_table.c:313 task_work_run+0xf6/0x130 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x2b4/0x2c0 arch/x86/entry/common.c:163 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 12060 Comm: syz-executor.5 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, Signed-off-by: Eric Dumazet Cc: Thomas Gleixner [ paulmck: Pulled in Eric's later amendments. ] Signed-off-by: Paul E. McKenney --- kernel/time/timer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 4820823515e9..568564ae3597 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -944,6 +944,7 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 +#define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) @@ -960,7 +961,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ - if (timer_pending(timer)) { + if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new @@ -1133,7 +1134,7 @@ EXPORT_SYMBOL(timer_reduce); void add_timer(struct timer_list *timer) { BUG_ON(timer_pending(timer)); - mod_timer(timer, timer->expires); + __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); @@ -1891,7 +1892,7 @@ signed long __sched schedule_timeout(signed long timeout) timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_singleshot_timer_sync(&timer.timer); -- cgit v1.2.3 From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:54 -0800 Subject: y2038: remove unused time32 interfaces No users remain, so kill these off before we grow new ones. Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/time.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); -- cgit v1.2.3 From a2efdbf4fcb38ec1ae99c9d54d52c34fa867dc71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2020 11:08:45 -0600 Subject: posix-cpu-timers: cpu_clock_sample_group() no longer needs siglock As of e78c3496790e ("time, signal: Protect resource use statistics with seqlock") cpu_clock_sample_group() no longer needs siglock protection so remove the stale comment. Signed-off-by: "Eric W. Biederman" Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/87eeuevduq.fsf@x220.int.ebiederm.org --- kernel/time/posix-cpu-timers.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8ff6da77a01f..46cc188bf5ab 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -336,9 +336,7 @@ static void __thread_group_cputime(struct task_struct *tsk, u64 *samples) /* * Sample a process (thread group) clock for the given task clkid. If the * group's cputime accounting is already enabled, read the atomic - * store. Otherwise a full update is required. Task's sighand lock must be - * held to protect the task traversal on a full update. clkid is already - * validated. + * store. Otherwise a full update is required. clkid is already validated. */ static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p, bool start) -- cgit v1.2.3 From 60f2ceaa8111d9ec51af87bde4a6809f2b3235e4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2020 11:09:19 -0600 Subject: posix-cpu-timers: Remove unnecessary locking around cpu_clock_sample_group As of e78c3496790e ("time, signal: Protect resource use statistics with seqlock") cpu_clock_sample_group no longers needs siglock protection. Unfortunately no one realized it at the time. Remove the extra locking that is for cpu_clock_sample_group and not for cpu_clock_sample. This significantly simplifies the code. Signed-off-by: "Eric W. Biederman" Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/878skmvdts.fsf@x220.int.ebiederm.org --- kernel/time/posix-cpu-timers.c | 66 ++++++++---------------------------------- 1 file changed, 12 insertions(+), 54 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 46cc188bf5ab..40c2d8396bb9 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -718,31 +718,10 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp /* * Sample the clock to take the difference with the expiry time. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + if (CPUCLOCK_PERTHREAD(timer->it_clock)) now = cpu_clock_sample(clkid, p); - } else { - struct sighand_struct *sighand; - unsigned long flags; - - /* - * Protect against sighand release/switch in exit/exec and - * also make timer sampling safe if it ends up calling - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - * Disarm the timer, nothing else to do. - */ - cpu_timer_setexpires(ctmr, 0); - return; - } else { - now = cpu_clock_sample_group(clkid, p, false); - unlock_task_sighand(p, &flags); - } - } + else + now = cpu_clock_sample_group(clkid, p, false); if (now < expires) { itp->it_value = ns_to_timespec64(expires - now); @@ -986,43 +965,22 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Fetch the current sample and update the timer's expiry time. */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + if (CPUCLOCK_PERTHREAD(timer->it_clock)) now = cpu_clock_sample(clkid, p); - bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) - return; - - /* Protect timer list r/w in arm_timer() */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - return; - } else { - /* - * Protect arm_timer() and timer sampling in case of call to - * thread_group_cputime(). - */ - sighand = lock_task_sighand(p, &flags); - if (unlikely(sighand == NULL)) { - /* - * The process has been reaped. - * We can't even collect a sample any more. - */ - cpu_timer_setexpires(ctmr, 0); - return; - } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* If the process is dying, no need to rearm */ - goto unlock; - } + else now = cpu_clock_sample_group(clkid, p, true); - bump_cpu_timer(timer, now); - /* Leave the sighand locked for the call below. */ - } + + bump_cpu_timer(timer, now); + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) + return; /* * Now re-arm for the new expiry time. */ arm_timer(timer); -unlock: unlock_task_sighand(p, &flags); } -- cgit v1.2.3 From beb41d9cbe4179058634e05d60235b6155c7b6c6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2020 11:09:46 -0600 Subject: posix-cpu-timers: Pass the task into arm_timer() The task has been already computed to take siglock before calling arm_timer. So pass the benefit of that labor into arm_timer(). Signed-off-by: "Eric W. Biederman" Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/8736auvdt1.fsf@x220.int.ebiederm.org --- kernel/time/posix-cpu-timers.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 40c2d8396bb9..ef936c5a910b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -482,12 +482,11 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the sighand lock held. */ -static void arm_timer(struct k_itimer *timer) +static void arm_timer(struct k_itimer *timer, struct task_struct *p) { int clkidx = CPUCLOCK_WHICH(timer->it_clock); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct task_struct *p = ctmr->task; struct posix_cputimer_base *base; if (CPUCLOCK_PERTHREAD(timer->it_clock)) @@ -660,7 +659,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ cpu_timer_setexpires(ctmr, new_expires); if (new_expires != 0 && val < new_expires) { - arm_timer(timer); + arm_timer(timer, p); } unlock_task_sighand(p, &flags); @@ -980,7 +979,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - arm_timer(timer); + arm_timer(timer, p); unlock_task_sighand(p, &flags); } -- cgit v1.2.3 From 55e8c8eb2c7b6bf30e99423ccfe7ca032f498f59 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2020 11:11:06 -0600 Subject: posix-cpu-timers: Store a reference to a pid not a task posix cpu timers do not handle the death of a process well. This is most clearly seen when a multi-threaded process calls exec from a thread that is not the leader of the thread group. The posix cpu timer code continues to pin the old thread group leader and is unable to find the siglock from there. This results in posix_cpu_timer_del being unable to delete a timer, posix_cpu_timer_set being unable to set a timer. Further to compensate for the problems in posix_cpu_timer_del on a multi-threaded exec all timers that point at the multi-threaded task are stopped. The code for the timers fundamentally needs to check if the target process/thread is alive. This needs an extra level of indirection. This level of indirection is already available in struct pid. So replace cpu.task with cpu.pid to get the needed extra layer of indirection. In addition to handling things more cleanly this reduces the amount of memory a timer can pin when a process exits and then is reaped from a task_struct to the vastly smaller struct pid. Fixes: e0a70217107e ("posix-cpu-timers: workaround to suppress the problems with mt exec") Signed-off-by: "Eric W. Biederman" Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/87wo86tz6d.fsf@x220.int.ebiederm.org --- kernel/time/posix-cpu-timers.c | 73 +++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 18 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ef936c5a910b..6df468a622fe 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -118,6 +118,16 @@ static inline int validate_clock_permissions(const clockid_t clock) return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL; } +static inline enum pid_type cpu_timer_pid_type(struct k_itimer *timer) +{ + return CPUCLOCK_PERTHREAD(timer->it_clock) ? PIDTYPE_PID : PIDTYPE_TGID; +} + +static inline struct task_struct *cpu_timer_task_rcu(struct k_itimer *timer) +{ + return pid_task(timer->it.cpu.pid, cpu_timer_pid_type(timer)); +} + /* * Update expiry time from increment, and increase overrun count, * given the current clock sample. @@ -391,7 +401,12 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) new_timer->kclock = &clock_posix_cpu; timerqueue_init(&new_timer->it.cpu.node); - new_timer->it.cpu.task = p; + new_timer->it.cpu.pid = get_task_pid(p, cpu_timer_pid_type(new_timer)); + /* + * get_task_for_clock() took a reference on @p. Drop it as the timer + * holds a reference on the pid of @p. + */ + put_task_struct(p); return 0; } @@ -404,13 +419,15 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) static int posix_cpu_timer_del(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; int ret = 0; - if (WARN_ON_ONCE(!p)) - return -EINVAL; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Protect against sighand release/switch in exit/exec and process/ @@ -432,8 +449,10 @@ static int posix_cpu_timer_del(struct k_itimer *timer) unlock_task_sighand(p, &flags); } +out: + rcu_read_unlock(); if (!ret) - put_task_struct(p); + put_pid(ctmr->pid); return ret; } @@ -561,13 +580,21 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); u64 old_expires, new_expires, old_incr, val; struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; int ret = 0; - if (WARN_ON_ONCE(!p)) - return -EINVAL; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) { + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + rcu_read_unlock(); + return -ESRCH; + } /* * Use the to_ktime conversion because that clamps the maximum @@ -584,8 +611,10 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(sighand == NULL)) + if (unlikely(sighand == NULL)) { + rcu_read_unlock(); return -ESRCH; + } /* * Disarm any old timer after extracting its expiry time. @@ -690,6 +719,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: + rcu_read_unlock(); if (old) old->it_interval = ns_to_timespec64(old_incr); @@ -701,10 +731,12 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); struct cpu_timer *ctmr = &timer->it.cpu; u64 now, expires = cpu_timer_getexpires(ctmr); - struct task_struct *p = ctmr->task; + struct task_struct *p; - if (WARN_ON_ONCE(!p)) - return; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Easy part: convert the reload time. @@ -712,7 +744,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp itp->it_interval = ktime_to_timespec64(timer->it_interval); if (!expires) - return; + goto out; /* * Sample the clock to take the difference with the expiry time. @@ -732,6 +764,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp itp->it_value.tv_nsec = 1; itp->it_value.tv_sec = 0; } +out: + rcu_read_unlock(); } #define MAX_COLLECTED 20 @@ -952,14 +986,15 @@ static void check_process_timers(struct task_struct *tsk, static void posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct cpu_timer *ctmr = &timer->it.cpu; - struct task_struct *p = ctmr->task; + struct task_struct *p; struct sighand_struct *sighand; unsigned long flags; u64 now; - if (WARN_ON_ONCE(!p)) - return; + rcu_read_lock(); + p = cpu_timer_task_rcu(timer); + if (!p) + goto out; /* * Fetch the current sample and update the timer's expiry time. @@ -974,13 +1009,15 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) - return; + goto out; /* * Now re-arm for the new expiry time. */ arm_timer(timer, p); unlock_task_sighand(p, &flags); +out: + rcu_read_unlock(); } /** -- cgit v1.2.3 From 4cbbc3a0eeed675449b1a4d080008927121f3da3 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 20 Jan 2020 18:05:23 +0800 Subject: timekeeping: Prevent 32bit truncation in scale64_check_overflow() While unlikely the divisor in scale64_check_overflow() could be >= 32bit in scale64_check_overflow(). do_div() truncates the divisor to 32bit at least on 32bit platforms. Use div64_u64() instead to avoid the truncation to 32-bit. [ tglx: Massaged changelog ] Signed-off-by: Wen Yang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200120100523.45656-1-wenyang@linux.alibaba.com --- kernel/time/timekeeping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca69290bee2a..4fc2af4367a7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1005,9 +1005,8 @@ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; - rem *= mult; - do_div(rem, div); + rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } -- cgit v1.2.3 From 38f7b0b1316d435f38ec3f2bb078897b7a1cfdea Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Thu, 30 Jan 2020 21:08:51 +0800 Subject: hrtimer: Cast explicitely to u32t in __ktime_divns() do_div() does a 64-by-32 division at least on 32bit platforms, while the divisor 'div' is explicitly casted to unsigned long, thus 64-bit on 64-bit platforms. The code already ensures that the divisor is less than 2^32. Hence the proper cast type is u32. Signed-off-by: Wen Yang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200130130851.29204-1-wenyang@linux.alibaba.com --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3a609e7344f3..d74fdcd3ced4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -311,7 +311,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) div >>= 1; } tmp >>= sft; - do_div(tmp, (unsigned long) div); + do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); -- cgit v1.2.3 From d441dceb5dce71150f28add80d36d91bbfccba99 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 7 Feb 2020 14:39:29 -0500 Subject: tick/common: Make tick_periodic() check for missing ticks The tick_periodic() function is used at the beginning part of the bootup process for time keeping while the other clock sources are being initialized. The current code assumes that all the timer interrupts are handled in a timely manner with no missing ticks. That is not actually true. Some ticks are missed and there are some discrepancies between the tick time (jiffies) and the timestamp reported in the kernel log. Some systems, however, are more prone to missing ticks than the others. In the extreme case, the discrepancy can actually cause a soft lockup message to be printed by the watchdog kthread. For example, on a Cavium ThunderX2 Sabre arm64 system: [ 25.496379] watchdog: BUG: soft lockup - CPU#14 stuck for 22s! On that system, the missing ticks are especially prevalent during the smp_init() phase of the boot process. With an instrumented kernel, it was found that it took about 24s as reported by the timestamp for the tick to accumulate 4s of time. Investigation and bisection done by others seemed to point to the commit 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof") as the culprit. It could also be a firmware issue as new firmware was promised that would fix the issue. To properly address this problem, stop assuming that there will be no missing tick in tick_periodic(). Modify it to follow the example of tick_do_update_jiffies64() by using another reference clock to check for missing ticks. Since the watchdog timer uses running_clock(), it is used here as the reference. With this applied, the soft lockup problem in the affected arm64 system is gone and tick time tracks much more closely to the timestamp time. Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200207193929.27308-1-longman@redhat.com --- kernel/time/tick-common.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e5d3524e924..cce4ed1515c7 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -84,12 +85,41 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { + /* + * Use running_clock() as reference to check for missing ticks. + */ + static ktime_t last_update; + ktime_t now; + int ticks = 1; + + now = ns_to_ktime(running_clock()); write_seqlock(&jiffies_lock); - /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, tick_period); + if (last_update) { + u64 delta = ktime_sub(now, last_update); - do_timer(1); + /* + * Check for eventually missed ticks + * + * There is likely a persistent delta between + * last_update and tick_next_period. So they are + * updated separately. + */ + if (delta >= 2 * tick_period) { + s64 period = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, period); + } + last_update = ktime_add(last_update, + ticks * tick_period); + } else { + last_update = now; + } + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, + ticks * tick_period); + do_timer(ticks); write_sequnlock(&jiffies_lock); update_wall_time(); } -- cgit v1.2.3 From 2c8bd58812ee3dbf0d78b566822f7eacd34bdd7b Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Mon, 9 Mar 2020 18:15:29 +0000 Subject: time/sched_clock: Expire timer in hardirq context To minimize latency, PREEMPT_RT kernels expires hrtimers in preemptible softirq context by default. This can be overriden by marking the timer's expiry with HRTIMER_MODE_HARD. sched_clock_timer is missing this annotation: if its callback is preempted and the duration of the preemption exceeds the wrap around time of the underlying clocksource, sched clock will get out of sync. Mark the sched_clock_timer for expiry in hard interrupt context. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200309181529.26558-1-a.darwish@linutronix.de --- kernel/time/sched_clock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index e4332e3e2d56..fa3f800d7d76 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -208,7 +208,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (sched_clock_timer.function != NULL) { /* update timeout for clock wrap */ - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, + HRTIMER_MODE_REL_HARD); } r = rate; @@ -254,9 +255,9 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); sched_clock_timer.function = sched_clock_poll; - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } /* @@ -293,7 +294,7 @@ void sched_clock_resume(void) struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); - hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); rd->read_sched_clock = cd.actual_read_sched_clock; } -- cgit v1.2.3 From 52da479a9aee630d2cdf37d05edfe5bcfff3e17f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Mar 2020 19:47:06 +0100 Subject: Revert "tick/common: Make tick_periodic() check for missing ticks" This reverts commit d441dceb5dce71150f28add80d36d91bbfccba99 due to boot failures. Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Cc: Waiman Long --- kernel/time/tick-common.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index cce4ed1515c7..7e5d3524e924 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -85,41 +84,12 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - /* - * Use running_clock() as reference to check for missing ticks. - */ - static ktime_t last_update; - ktime_t now; - int ticks = 1; - - now = ns_to_ktime(running_clock()); write_seqlock(&jiffies_lock); - if (last_update) { - u64 delta = ktime_sub(now, last_update); - - /* - * Check for eventually missed ticks - * - * There is likely a persistent delta between - * last_update and tick_next_period. So they are - * updated separately. - */ - if (delta >= 2 * tick_period) { - s64 period = ktime_to_ns(tick_period); - - ticks = ktime_divns(delta, period); - } - last_update = ktime_add(last_update, - ticks * tick_period); - } else { - last_update = now; - } - /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, - ticks * tick_period); - do_timer(ticks); + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); write_sequnlock(&jiffies_lock); update_wall_time(); } -- cgit v1.2.3 From e5d4d1756b07d9490a0269a9e68c1e05ee1feb9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 21 Mar 2020 12:25:58 +0100 Subject: timekeeping: Split jiffies seqlock seqlock consists of a sequence counter and a spinlock_t which is used to serialize the writers. spinlock_t is substituted by a "sleeping" spinlock on PREEMPT_RT enabled kernels which breaks the usage in the timekeeping code as the writers are executed in hard interrupt and therefore non-preemptible context even on PREEMPT_RT. The spinlock in seqlock cannot be unconditionally replaced by a raw_spinlock_t as many seqlock users have nesting spinlock sections or other code which is not suitable to run in truly atomic context on RT. Instead of providing a raw_seqlock API for a single use case, open code the seqlock for the jiffies use case and implement it with a raw_spinlock_t and a sequence counter. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200321113242.120587764@linutronix.de --- kernel/time/jiffies.c | 7 ++++--- kernel/time/tick-common.c | 10 ++++++---- kernel/time/tick-sched.c | 19 ++++++++++++------- kernel/time/timekeeping.c | 6 ++++-- kernel/time/timekeeping.h | 3 ++- 5 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index d23b434c2ca7..eddcf4970444 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = { .max_cycles = 10, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -67,9 +68,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e5d3524e924..6c9c342dd0e5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -84,13 +84,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -162,9 +164,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a792d21cac64..4be756b88a48 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevaluate with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta >= tick_period) { @@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return; } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -676,10 +681,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca69290bee2a..856280d2cbd4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2397,8 +2397,10 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 141ab3ab0354..099737f6f10c 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { } extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #define CS_NAME_LEN 32 -- cgit v1.2.3 From 40db173965c05a1d803451240ed41707d5bd978d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 21 Mar 2020 12:26:02 +0100 Subject: lockdep: Add hrtimer context tracing bits Set current->irq_config = 1 for hrtimers which are not marked to expire in hard interrupt context during hrtimer_init(). These timers will expire in softirq context on PREEMPT_RT. Setting this allows lockdep to differentiate these timers. If a timer is marked to expire in hard interrupt context then the timer callback is not supposed to acquire a regular spinlock instead of a raw_spinlock in the expiry callback. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200321113242.534508206@linutronix.de --- kernel/time/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3a609e7344f3..8cce72501aea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1404,7 +1404,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; - timer->is_hard = !softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1514,7 +1514,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); + lockdep_hrtimer_enter(timer); + restart = fn(timer); + + lockdep_hrtimer_exit(timer); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); -- cgit v1.2.3 From 49915ac35ca7b07c54295a72d905be5064afb89e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 21 Mar 2020 12:26:03 +0100 Subject: lockdep: Annotate irq_work Mark irq_work items with IRQ_WORK_HARD_IRQ which should be invoked in hardirq context even on PREEMPT_RT. IRQ_WORK without this flag will be invoked in softirq context on PREEMPT_RT. Set ->irq_config to 1 for the IRQ_WORK items which are invoked in softirq context so lockdep knows that these can safely acquire a spinlock_t. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200321113242.643576700@linutronix.de --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4be756b88a48..3e2dc9b8858c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -245,6 +245,7 @@ static void nohz_full_kick_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_func, + .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), }; /* -- cgit v1.2.3 From d53f2b62fcb63f6547c10d8c62bca19e957b0eef Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 21 Mar 2020 12:26:04 +0100 Subject: lockdep: Add posixtimer context tracing bits Splitting run_posix_cpu_timers() into two parts is work in progress which is stuck on other entry code related problems. The heavy lifting which involves locking of sighand lock will be moved into task context so the necessary execution time is burdened on the task and not on interrupt context. Until this work completes lockdep with the spinlock nesting rules enabled would emit warnings for this known context. Prevent it by setting "->irq_config = 1" for the invocation of run_posix_cpu_timers() so lockdep does not complain when sighand lock is acquried. This will be removed once the split is completed. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200321113242.751182723@linutronix.de --- kernel/time/posix-cpu-timers.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/time') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8ff6da77a01f..2c48a7233b19 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1126,8 +1126,11 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - if (!lock_task_sighand(tsk, &flags)) + lockdep_posixtimer_enter(); + if (!lock_task_sighand(tsk, &flags)) { + lockdep_posixtimer_exit(); return; + } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1169,6 +1172,7 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + lockdep_posixtimer_exit(); } /* -- cgit v1.2.3 From 73d20564e0dcae003e0d79977f044d5e57496304 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 31 Mar 2020 22:18:49 +0200 Subject: hrtimer: Don't dereference the hrtimer pointer after the callback A hrtimer can be released in its callback, but lockdep_hrtimer_exit() dereferences the pointer after the callback returns, i.e. a potential use after free. Retrieve the context in which the hrtimer expires before the callback is invoked and use it in lockdep_hrtimer_exit(). Fixes: 40db173965c0 ("lockdep: Add hrtimer context tracing bits") Reported-by: syzbot+62c155c276e580cfb606@syzkaller.appspotmail.com Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200331201849.fkp2siy3vcdqvqlz@linutronix.de --- kernel/time/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d0a5ba37aff4..d89da1c7e005 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1480,6 +1480,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); + bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); @@ -1514,11 +1515,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); - lockdep_hrtimer_enter(timer); + expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); - lockdep_hrtimer_exit(timer); + lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); -- cgit v1.2.3 From b801f1e22c23c259d6a2c955efddd20370de19a6 Mon Sep 17 00:00:00 2001 From: "Michael Kerrisk (man-pages)" Date: Fri, 3 Apr 2020 14:11:39 +0200 Subject: time/namespace: Fix time_for_children symlink Looking at the contents of the /proc/PID/ns/time_for_children symlink shows an anomaly: $ ls -l /proc/self/ns/* |awk '{print $9, $10, $11}' ... /proc/self/ns/pid -> pid:[4026531836] /proc/self/ns/pid_for_children -> pid:[4026531836] /proc/self/ns/time -> time:[4026531834] /proc/self/ns/time_for_children -> time_for_children:[4026531834] /proc/self/ns/user -> user:[4026531837] ... The reference for 'time_for_children' should be a 'time' namespace, just as the reference for 'pid_for_children' is a 'pid' namespace. In other words, the above time_for_children link should read: /proc/self/ns/time_for_children -> time:[4026531834] Fixes: 769071ac9f20 ("ns: Introduce Time Namespace") Signed-off-by: Michael Kerrisk Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov Acked-by: Christian Brauner Acked-by: Andrei Vagin Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/a2418c48-ed80-3afe-116e-6611cb799557@gmail.com --- kernel/time/namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/time') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e6ba064ce773..3b30288793fe 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -447,6 +447,7 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", + .real_ns_name = "time", .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, -- cgit v1.2.3