diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cgroup/cgroup.c | 88 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 29 | ||||
| -rw-r--r-- | kernel/futex/core.c | 2 | ||||
| -rw-r--r-- | kernel/futex/pi.c | 3 | ||||
| -rw-r--r-- | kernel/futex/syscalls.c | 8 | ||||
| -rw-r--r-- | kernel/power/main.c | 2 | ||||
| -rw-r--r-- | kernel/power/snapshot.c | 11 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 95 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 2 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 3 | ||||
| -rw-r--r-- | kernel/sysctl.c | 2 | ||||
| -rw-r--r-- | kernel/time/alarmtimer.c | 2 | ||||
| -rw-r--r-- | kernel/trace/trace_events_trigger.c | 79 | ||||
| -rw-r--r-- | kernel/trace/trace_osnoise.c | 10 | ||||
| -rw-r--r-- | kernel/workqueue.c | 25 |
15 files changed, 297 insertions, 64 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 01fc2a93f3ef..4ca3cb993da2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); + init_waitqueue_head(&cgrp->dying_populated_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -6224,6 +6225,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; +/** + * cgroup_drain_dying - wait for dying tasks to leave before rmdir + * @cgrp: the cgroup being removed + * + * cgroup.procs and cgroup.threads use css_task_iter which filters out + * PF_EXITING tasks so that userspace doesn't see tasks that have already been + * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the + * cgroup has non-empty css_sets - is only updated when dying tasks pass through + * cgroup_task_dead() in finish_task_switch(). This creates a window where + * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir + * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no + * tasks. + * + * This function aligns cgroup_has_tasks() with what userspace can observe. If + * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are + * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the + * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. + * + * This function only concerns itself with this cgroup's own dying tasks. + * Whether the cgroup has children is cgroup_destroy_locked()'s problem. + * + * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we + * retry the full check from scratch. + * + * Must be called with cgroup_mutex held. + */ +static int cgroup_drain_dying(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct css_task_iter it; + struct task_struct *task; + DEFINE_WAIT(wait); + + lockdep_assert_held(&cgroup_mutex); +retry: + if (!cgroup_has_tasks(cgrp)) + return 0; + + /* Same iterator as cgroup.threads - if any task is visible, it's busy */ + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + + if (task) + return -EBUSY; + + /* + * All remaining tasks are PF_EXITING and will pass through + * cgroup_task_dead() shortly. Wait for a kick and retry. + * + * cgroup_has_tasks() can't transition from false to true while we're + * holding cgroup_mutex, but the true to false transition happens + * under css_set_lock (via cgroup_task_dead()). We must retest and + * prepare_to_wait() under css_set_lock. Otherwise, the transition + * can happen between our first test and prepare_to_wait(), and we + * sleep with no one to wake us. + */ + spin_lock_irq(&css_set_lock); + if (!cgroup_has_tasks(cgrp)) { + spin_unlock_irq(&css_set_lock); + return 0; + } + prepare_to_wait(&cgrp->dying_populated_waitq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgrp->dying_populated_waitq, &wait); + mutex_lock(&cgroup_mutex); + goto retry; +} + int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -6233,9 +6306,12 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); + ret = cgroup_drain_dying(cgrp); + if (!ret) { + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); + } cgroup_kn_unlock(kn); return ret; @@ -6995,6 +7071,7 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { + struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7008,6 +7085,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); + /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + if (waitqueue_active(&link->cgrp->dying_populated_waitq)) + wake_up(&link->cgrp->dying_populated_waitq); + if (dl_task(tsk)) dec_dl_tasks_cs(tsk); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d21868455341..1335e437098e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2988,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; - bool cpus_updated, mems_updated; + bool setsched_check; int ret; /* used later by cpuset_attach() */ @@ -3003,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; - cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); - mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + /* + * Skip rights over task setsched check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). + */ + setsched_check = !cpuset_v2() || + !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) || + !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * A v1 cpuset with tasks will have no CPU left only when CPU hotplug + * brings the last online CPU offline as users are not allowed to empty + * cpuset.cpus when there are active tasks inside. When that happens, + * we should allow tasks to migrate out without security check to make + * sure they will be able to run after migration. + */ + if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus)) + setsched_check = false; cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - /* - * Skip rights over task check in v2 when nothing changes, - * migration permission derives from hierarchy ownership in - * cgroup_procs_write_permission()). - */ - if (!cpuset_v2() || (cpus_updated || mems_updated)) { + if (setsched_check) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; diff --git a/kernel/futex/core.c b/kernel/futex/core.c index cf7e610eac42..31e83a09789e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) if (!vma) return FUTEX_NO_NODE; - mpol = vma_policy(vma); + mpol = READ_ONCE(vma->vm_policy); if (!mpol) return FUTEX_NO_NODE; diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index bc1f7e83a37e..7808068fa59e 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -918,7 +918,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct task_struct *exiting = NULL; + struct task_struct *exiting; struct rt_mutex_waiter rt_waiter; struct futex_q q = futex_q_init; DEFINE_WAKE_Q(wake_q); @@ -933,6 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl to = futex_setup_timer(time, &timeout, flags, 0); retry: + exiting = NULL; ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 743c7a728237..77ad9691f6a6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue, if (ret) return ret; + /* + * For now mandate both flags are identical, like the sys_futex() + * interface has. If/when we merge the variable sized futex support, + * that patch can modify this test to allow a difference in size. + */ + if (futexes[0].w.flags != futexes[1].w.flags) + return -EINVAL; + cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, diff --git a/kernel/power/main.c b/kernel/power/main.c index 5f8c9e12eaec..5429e9f19b65 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -40,7 +40,7 @@ void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (WARN_ON(!saved_gfp_count) || --saved_gfp_count) + if (!saved_gfp_count || --saved_gfp_count) return; gfp_allowed_mask = saved_gfp_mask; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6e1321837c66..a564650734dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2855,6 +2855,17 @@ int snapshot_write_finalize(struct snapshot_handle *handle) { int error; + /* + * Call snapshot_write_next() to drain any trailing zero pages, + * but make sure we're in the data page region first. + * This function can return PAGE_SIZE if the kernel was expecting + * another copy page. Return -ENODATA in that situation. + */ + if (handle->cur > nr_meta_pages + 1) { + error = snapshot_write_next(handle); + if (error) + return error > 0 ? -ENODATA : error; + } copy_last_highmem_page(); error = hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26a6ac2f8826..d5bdcdb3f700 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2404,7 +2404,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, { struct scx_sched *sch = scx_root; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); update_curr_scx(rq); @@ -2447,6 +2447,48 @@ switch_class: switch_class(rq, next); } +static void kick_sync_wait_bal_cb(struct rq *rq) +{ + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; + bool waited; + s32 cpu; + + /* + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled + * — a target CPU may be waiting for us to process an IPI (e.g. TLB + * flush) while we wait for its kick_sync to advance. + * + * Also, keep advancing our own kick_sync so that new kick_sync waits + * targeting us, which can start after we drop the lock, cannot form + * cyclic dependencies. + */ +retry: + waited = false; + for_each_cpu(cpu, rq->scx.cpus_to_sync) { + /* + * smp_load_acquire() pairs with smp_store_release() on + * kick_sync updates on the target CPUs. + */ + if (cpu == cpu_of(rq) || + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); + continue; + } + + raw_spin_rq_unlock_irq(rq); + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + cpu_relax(); + } + raw_spin_rq_lock_irq(rq); + waited = true; + } + + if (waited) + goto retry; +} + static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, @@ -2460,7 +2502,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) bool keep_prev; struct task_struct *p; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); rq_modified_begin(rq, &ext_sched_class); @@ -2471,6 +2513,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) maybe_queue_balance_callback(rq); /* + * Defer to a balance callback which can drop rq lock and enable + * IRQs. Waiting directly in the pick path would deadlock against + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. + */ + if (unlikely(rq->scx.kick_sync_pending)) { + rq->scx.kick_sync_pending = false; + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, + kick_sync_wait_bal_cb); + } + + /* * If any higher-priority sched class enqueued a runnable task on * this rq during balance_one(), abort and return RETRY_TASK, so * that the scheduler loop can restart. @@ -4713,6 +4766,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (!cpumask_empty(rq->scx.cpus_to_wait)) dump_line(&ns, " cpus_to_wait : %*pb", cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { @@ -5610,11 +5666,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { if (cur_class == &ext_sched_class) { + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); ksyncs[cpu] = rq->scx.kick_sync; should_wait = true; - } else { - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } resched_curr(rq); @@ -5669,27 +5725,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } - if (!should_wait) - return; - - for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - - /* - * Busy-wait until the task running at the time of kicking is no - * longer running. This can be used to implement e.g. core - * scheduling. - * - * smp_cond_load_acquire() pairs with store_releases in - * pick_task_scx() and put_prev_task_scx(). The former breaks - * the wait if SCX's scheduling path is entered even if the same - * task is picked subsequently. The latter is necessary to break - * the wait when $cpu is taken by a higher sched class. - */ - if (cpu != cpu_of(this_rq)) - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); - - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + /* + * Can't wait in hardirq — kick_sync can't advance, deadlocking if + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). + */ + if (should_wait) { + raw_spin_rq_lock(this_rq); + this_scx->kick_sync_pending = true; + resched_curr(this_rq); + raw_spin_rq_unlock(this_rq); } } @@ -5794,6 +5838,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index ba298ac3ce6c..0ae93cd64004 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -543,7 +543,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * piled up on it even if there is an idle core elsewhere on * the system. */ - waker_node = cpu_to_node(cpu); + waker_node = scx_cpu_node_if_enabled(cpu); if (!(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 43bbf0693cca..1ef9ba480f51 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -805,9 +805,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; + cpumask_var_t cpus_to_sync; + bool kick_sync_pending; unsigned long kick_sync; local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; + struct balance_callback kick_sync_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; struct scx_dispatch_q bypass_dsq; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d3a666ffde1..c9efb17cc255 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1118,7 +1118,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int dir, unsigned long bitmap_len = table->maxlen; unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; - char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c = 0; if (!bitmap || !bitmap_len || !left || (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 069d93bfb0c7..b64db405ba5c 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -540,7 +540,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, now, timr->it_interval); } /** diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d5230b759a2d..655db2e82513 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread; static struct llist_head trigger_data_free_list; static DEFINE_MUTEX(trigger_data_kthread_mutex); +static int trigger_kthread_fn(void *ignore); + +static void trigger_create_kthread_locked(void) +{ + lockdep_assert_held(&trigger_data_kthread_mutex); + + if (!trigger_kthread) { + struct task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } +} + +static void trigger_data_free_queued_locked(void) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + lockdep_assert_held(&trigger_data_kthread_mutex); + + llnodes = llist_del_all(&trigger_data_free_list); + if (!llnodes) + return; + + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); +} + /* Bulk garbage collection of event_trigger_data elements */ static int trigger_kthread_fn(void *ignore) { @@ -56,30 +89,50 @@ void trigger_data_free(struct event_trigger_data *data) if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); + /* + * Boot-time trigger registration can fail before kthread creation + * works. Keep the deferred-free semantics during boot and let late + * init start the kthread to drain the list. + */ + if (system_state == SYSTEM_BOOTING && !trigger_kthread) { + llist_add(&data->llist, &trigger_data_free_list); + return; + } + if (unlikely(!trigger_kthread)) { guard(mutex)(&trigger_data_kthread_mutex); + + trigger_create_kthread_locked(); /* Check again after taking mutex */ if (!trigger_kthread) { - struct task_struct *kthread; - - kthread = kthread_create(trigger_kthread_fn, NULL, - "trigger_data_free"); - if (!IS_ERR(kthread)) - WRITE_ONCE(trigger_kthread, kthread); + llist_add(&data->llist, &trigger_data_free_list); + /* Drain the queued frees synchronously if creation failed. */ + trigger_data_free_queued_locked(); + return; } } - if (!trigger_kthread) { - /* Do it the slow way */ - tracepoint_synchronize_unregister(); - kfree(data); - return; - } - llist_add(&data->llist, &trigger_data_free_list); wake_up_process(trigger_kthread); } +static int __init trigger_data_free_init(void) +{ + guard(mutex)(&trigger_data_kthread_mutex); + + if (llist_empty(&trigger_data_free_list)) + return 0; + + trigger_create_kthread_locked(); + if (trigger_kthread) + wake_up_process(trigger_kthread); + else + trigger_data_free_queued_locked(); + + return 0; +} +late_initcall(trigger_data_free_init); + static inline void data_ops_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index dee610e465b9..be6cf0bb3c03 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2073,8 +2073,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) if (!osnoise_has_registered_instances()) return; - guard(mutex)(&interface_lock); guard(cpus_read_lock)(); + guard(mutex)(&interface_lock); if (!cpu_online(cpu)) return; @@ -2237,11 +2237,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * avoid CPU hotplug operations that might read options. */ cpus_read_lock(); + mutex_lock(&interface_lock); retval = cnt; @@ -2257,8 +2257,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, clear_bit(option, &osnoise_options); } - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2345,16 +2345,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * osnoise_cpumask is read by CPU hotplug operations. */ cpus_read_lock(); + mutex_lock(&interface_lock); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b77119d71641..eda756556341 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7699,8 +7699,29 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) else ts = touched; - /* did we stall? */ + /* + * Did we stall? + * + * Do a lockless check first to do not disturb the system. + * + * Prevent false positives by double checking the timestamp + * under pool->lock. The lock makes sure that the check reads + * an updated pool->last_progress_ts when this CPU saw + * an already updated pool->worklist above. It seems better + * than adding another barrier into __queue_work() which + * is a hotter path. + */ if (time_after(now, ts + thresh)) { + scoped_guard(raw_spinlock_irqsave, &pool->lock) { + pool_ts = pool->last_progress_ts; + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + } + if (!time_after(now, ts + thresh)) + continue; + lockup_detected = true; stall_time = jiffies_to_msecs(now - pool_ts) / 1000; max_stall_time = max(max_stall_time, stall_time); @@ -7712,8 +7733,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", stall_time); } - - } if (lockup_detected) |
