From bd14406b78e6daa1ea3c1673bda1ffc9efdeead0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 27 Aug 2018 11:12:25 +0200 Subject: perf/hw_breakpoint: Modify breakpoint even if the new attr has disabled set We need to change the breakpoint even if the attr with new fields has disabled set to true. Current code prevents following user code to change the breakpoint address: ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[0]), addr_1) ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[0]), addr_2) ptrace(PTRACE_POKEUSER, child, offsetof(struct user, u_debugreg[7]), dr7) The first PTRACE_POKEUSER creates the breakpoint with attr.disabled set to true: ptrace_set_breakpoint_addr(nr = 0) struct perf_event *bp = t->ptrace_bps[nr]; ptrace_register_breakpoint(..., disabled = true) ptrace_fill_bp_fields(..., disabled) register_user_hw_breakpoint So the second PTRACE_POKEUSER will be omitted: ptrace_set_breakpoint_addr(nr = 0) struct perf_event *bp = t->ptrace_bps[nr]; struct perf_event_attr attr = bp->attr; modify_user_hw_breakpoint(bp, &attr) if (!attr->disabled) modify_user_hw_breakpoint_check Reported-by: Milind Chabbi Signed-off-by: Jiri Olsa Acked-by: Frederic Weisbecker Acked-by: Oleg Nesterov Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180827091228.2878-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/hw_breakpoint.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b3814fce5ecb..fb229d9c7f3c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -509,6 +509,8 @@ modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *a */ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { + int err; + /* * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it * will not be possible to raise IPIs that invoke __perf_event_disable. @@ -520,11 +522,11 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att else perf_event_disable(bp); - if (!attr->disabled) { - int err = modify_user_hw_breakpoint_check(bp, attr, false); + err = modify_user_hw_breakpoint_check(bp, attr, false); + if (err) + return err; - if (err) - return err; + if (!attr->disabled) { perf_event_enable(bp); bp->attr.disabled = 0; } -- cgit v1.2.3 From cb45302d7c5e20f0c0598cdbd7753fa44daceb2a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 27 Aug 2018 11:12:26 +0200 Subject: perf/hw_breakpoint: Remove superfluous bp->attr.disabled = 0 Once the breakpoint was succesfully modified, the attr->disabled value is in bp->attr.disabled. So there's no reason to set it again, removing that. Signed-off-by: Jiri Olsa Acked-by: Frederic Weisbecker Acked-by: Oleg Nesterov Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Milind Chabbi Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180827091228.2878-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/hw_breakpoint.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fb229d9c7f3c..3e560d7609fd 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -526,10 +526,9 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (err) return err; - if (!attr->disabled) { + if (!attr->disabled) perf_event_enable(bp); - bp->attr.disabled = 0; - } + return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.3 From 969558371bf926258241727ebb994f516f2e6f61 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 27 Aug 2018 11:12:27 +0200 Subject: perf/hw_breakpoint: Enable breakpoint in modify_user_hw_breakpoint Currently we enable the breakpoint back only if the breakpoint modification was successful. If it fails we can leave the breakpoint in disabled state with attr->disabled == 0. We can safely enable the breakpoint back for both the fail and success paths by checking the bp->attr.disabled, which either holds the new 'requested' disabled state or the original breakpoint state. Suggested-by: Oleg Nesterov Signed-off-by: Jiri Olsa Acked-by: Frederic Weisbecker Acked-by: Oleg Nesterov Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Milind Chabbi Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180827091228.2878-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/hw_breakpoint.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3e560d7609fd..d6b56180827c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -523,13 +523,11 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, false); - if (err) - return err; - if (!attr->disabled) + if (!bp->attr.disabled) perf_event_enable(bp); - return 0; + return err; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); -- cgit v1.2.3 From bf06278c3fdf8909c3a9283e2c270b0fc170fa90 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 27 Aug 2018 11:12:28 +0200 Subject: perf/hw_breakpoint: Simplify breakpoint enable in perf_event_modify_breakpoint We can safely enable the breakpoint back for both the fail and success paths by checking only the bp->attr.disabled, which either holds the new 'requested' disabled state or the original breakpoint state. Committer testing: At the end of the series, the 'perf test' entry introduced as the first patch now runs to completion without finding the fixed issues: # perf test "bp modify" 62: x86 bp modify : Ok # In verbose mode: # perf test -v "bp modify" 62: x86 bp modify : --- start --- test child forked, pid 5161 rip 5950a0, bp_1 0x5950a0 in bp_1 rip 5950a0, bp_1 0x5950a0 in bp_1 test child finished with 0 ---- end ---- x86 bp modify: Ok Suggested-by: Oleg Nesterov Acked-by: Oleg Nesterov Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Milind Chabbi Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180827091228.2878-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f6ea33a9f904..22ede28ec07d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2867,16 +2867,11 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); - if (err) { - if (!bp->attr.disabled) - _perf_event_enable(bp); - return err; - } - - if (!attr->disabled) + if (!bp->attr.disabled) _perf_event_enable(bp); - return 0; + + return err; } static int perf_event_modify_attr(struct perf_event *event, -- cgit v1.2.3 From e13e2366d8415e029fe96a62502955083e272cef Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Mon, 3 Sep 2018 16:07:08 +0200 Subject: locking/mutex: Fix mutex debug call and ww_mutex documentation The following commit: 08295b3b5bee ("Implement an algorithm choice for Wound-Wait mutexes") introduced a reference in the documentation to a function that was removed in an earlier commit. It also forgot to remove a call to debug_mutex_add_waiter() which is now unconditionally called by __mutex_add_waiter(). Fix those bugs. Signed-off-by: Thomas Hellstrom Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dri-devel@lists.freedesktop.org Fixes: 08295b3b5bee ("Implement an algorithm choice for Wound-Wait mutexes") Link: http://lkml.kernel.org/r/20180903140708.2401-1-thellstrom@vmware.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 1a81a1257b3f..3f8a35104285 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -389,7 +389,7 @@ static bool __ww_mutex_wound(struct mutex *lock, /* * wake_up_process() paired with set_current_state() * inserts sufficient barriers to make sure @owner either sees - * it's wounded in __ww_mutex_lock_check_stamp() or has a + * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ if (owner != current) @@ -946,7 +946,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, current); lock_contended(&lock->dep_map, ip); -- cgit v1.2.3 From e73e81975f2447e6f556100cada64a18ec631cbb Mon Sep 17 00:00:00 2001 From: Jiada Wang Date: Tue, 31 Jul 2018 21:12:22 +0900 Subject: sched/debug: Fix potential deadlock when writing to sched_features The following lockdep report can be triggered by writing to /sys/kernel/debug/sched_features: ====================================================== WARNING: possible circular locking dependency detected 4.18.0-rc6-00152-gcd3f77d74ac3-dirty #18 Not tainted ------------------------------------------------------ sh/3358 is trying to acquire lock: 000000004ad3989d (cpu_hotplug_lock.rw_sem){++++}, at: static_key_enable+0x14/0x30 but task is already holding lock: 00000000c1b31a88 (&sb->s_type->i_mutex_key#3){+.+.}, at: sched_feat_write+0x160/0x428 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&sb->s_type->i_mutex_key#3){+.+.}: lock_acquire+0xb8/0x148 down_write+0xac/0x140 start_creating+0x5c/0x168 debugfs_create_dir+0x18/0x220 opp_debug_register+0x8c/0x120 _add_opp_dev+0x104/0x1f8 dev_pm_opp_get_opp_table+0x174/0x340 _of_add_opp_table_v2+0x110/0x760 dev_pm_opp_of_add_table+0x5c/0x240 dev_pm_opp_of_cpumask_add_table+0x5c/0x100 cpufreq_init+0x160/0x430 cpufreq_online+0x1cc/0xe30 cpufreq_add_dev+0x78/0x198 subsys_interface_register+0x168/0x270 cpufreq_register_driver+0x1c8/0x278 dt_cpufreq_probe+0xdc/0x1b8 platform_drv_probe+0xb4/0x168 driver_probe_device+0x318/0x4b0 __device_attach_driver+0xfc/0x1f0 bus_for_each_drv+0xf8/0x180 __device_attach+0x164/0x200 device_initial_probe+0x10/0x18 bus_probe_device+0x110/0x178 device_add+0x6d8/0x908 platform_device_add+0x138/0x3d8 platform_device_register_full+0x1cc/0x1f8 cpufreq_dt_platdev_init+0x174/0x1bc do_one_initcall+0xb8/0x310 kernel_init_freeable+0x4b8/0x56c kernel_init+0x10/0x138 ret_from_fork+0x10/0x18 -> #2 (opp_table_lock){+.+.}: lock_acquire+0xb8/0x148 __mutex_lock+0x104/0xf50 mutex_lock_nested+0x1c/0x28 _of_add_opp_table_v2+0xb4/0x760 dev_pm_opp_of_add_table+0x5c/0x240 dev_pm_opp_of_cpumask_add_table+0x5c/0x100 cpufreq_init+0x160/0x430 cpufreq_online+0x1cc/0xe30 cpufreq_add_dev+0x78/0x198 subsys_interface_register+0x168/0x270 cpufreq_register_driver+0x1c8/0x278 dt_cpufreq_probe+0xdc/0x1b8 platform_drv_probe+0xb4/0x168 driver_probe_device+0x318/0x4b0 __device_attach_driver+0xfc/0x1f0 bus_for_each_drv+0xf8/0x180 __device_attach+0x164/0x200 device_initial_probe+0x10/0x18 bus_probe_device+0x110/0x178 device_add+0x6d8/0x908 platform_device_add+0x138/0x3d8 platform_device_register_full+0x1cc/0x1f8 cpufreq_dt_platdev_init+0x174/0x1bc do_one_initcall+0xb8/0x310 kernel_init_freeable+0x4b8/0x56c kernel_init+0x10/0x138 ret_from_fork+0x10/0x18 -> #1 (subsys mutex#6){+.+.}: lock_acquire+0xb8/0x148 __mutex_lock+0x104/0xf50 mutex_lock_nested+0x1c/0x28 subsys_interface_register+0xd8/0x270 cpufreq_register_driver+0x1c8/0x278 dt_cpufreq_probe+0xdc/0x1b8 platform_drv_probe+0xb4/0x168 driver_probe_device+0x318/0x4b0 __device_attach_driver+0xfc/0x1f0 bus_for_each_drv+0xf8/0x180 __device_attach+0x164/0x200 device_initial_probe+0x10/0x18 bus_probe_device+0x110/0x178 device_add+0x6d8/0x908 platform_device_add+0x138/0x3d8 platform_device_register_full+0x1cc/0x1f8 cpufreq_dt_platdev_init+0x174/0x1bc do_one_initcall+0xb8/0x310 kernel_init_freeable+0x4b8/0x56c kernel_init+0x10/0x138 ret_from_fork+0x10/0x18 -> #0 (cpu_hotplug_lock.rw_sem){++++}: __lock_acquire+0x203c/0x21d0 lock_acquire+0xb8/0x148 cpus_read_lock+0x58/0x1c8 static_key_enable+0x14/0x30 sched_feat_write+0x314/0x428 full_proxy_write+0xa0/0x138 __vfs_write+0xd8/0x388 vfs_write+0xdc/0x318 ksys_write+0xb4/0x138 sys_write+0xc/0x18 __sys_trace_return+0x0/0x4 other info that might help us debug this: Chain exists of: cpu_hotplug_lock.rw_sem --> opp_table_lock --> &sb->s_type->i_mutex_key#3 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sb->s_type->i_mutex_key#3); lock(opp_table_lock); lock(&sb->s_type->i_mutex_key#3); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** 2 locks held by sh/3358: #0: 00000000a8c4b363 (sb_writers#10){.+.+}, at: vfs_write+0x238/0x318 #1: 00000000c1b31a88 (&sb->s_type->i_mutex_key#3){+.+.}, at: sched_feat_write+0x160/0x428 stack backtrace: CPU: 5 PID: 3358 Comm: sh Not tainted 4.18.0-rc6-00152-gcd3f77d74ac3-dirty #18 Hardware name: Renesas H3ULCB Kingfisher board based on r8a7795 ES2.0+ (DT) Call trace: dump_backtrace+0x0/0x288 show_stack+0x14/0x20 dump_stack+0x13c/0x1ac print_circular_bug.isra.10+0x270/0x438 check_prev_add.constprop.16+0x4dc/0xb98 __lock_acquire+0x203c/0x21d0 lock_acquire+0xb8/0x148 cpus_read_lock+0x58/0x1c8 static_key_enable+0x14/0x30 sched_feat_write+0x314/0x428 full_proxy_write+0xa0/0x138 __vfs_write+0xd8/0x388 vfs_write+0xdc/0x318 ksys_write+0xb4/0x138 sys_write+0xc/0x18 __sys_trace_return+0x0/0x4 This is because when loading the cpufreq_dt module we first acquire cpu_hotplug_lock.rw_sem lock, then in cpufreq_init(), we are taking the &sb->s_type->i_mutex_key lock. But when writing to /sys/kernel/debug/sched_features, the cpu_hotplug_lock.rw_sem lock depends on the &sb->s_type->i_mutex_key lock. To fix this bug, reverse the lock acquisition order when writing to sched_features, this way cpu_hotplug_lock.rw_sem no longer depends on &sb->s_type->i_mutex_key. Tested-by: Dietmar Eggemann Signed-off-by: Jiada Wang Signed-off-by: Peter Zijlstra (Intel) Cc: Eugeniu Rosca Cc: George G. Davis Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180731121222.26195-1-jiada_wang@mentor.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 60caf1fb94e0..6383aa6a60ca 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -89,12 +89,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - static_key_disable(&sched_feat_keys[i]); + static_key_disable_cpuslocked(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - static_key_enable(&sched_feat_keys[i]); + static_key_enable_cpuslocked(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -146,9 +146,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); + cpus_read_lock(); inode_lock(inode); ret = sched_feat_set(cmp); inode_unlock(inode); + cpus_read_unlock(); if (ret < 0) return ret; -- cgit v1.2.3 From e5e96fafd9028b1478b165db78c52d981c14f471 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 10 Aug 2018 22:30:18 +0530 Subject: sched/topology: Set correct NUMA topology type With the following commit: 051f3ca02e46 ("sched/topology: Introduce NUMA identity node sched domain") the scheduler introduced a new NUMA level. However this leads to the NUMA topology on 2 node systems to not be marked as NUMA_DIRECT anymore. After this commit, it gets reported as NUMA_BACKPLANE, because sched_domains_numa_level is now 2 on 2 node systems. Fix this by allowing setting systems that have up to 2 NUMA levels as NUMA_DIRECT. While here remove code that assumes that level can be 0. Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Andre Wild Cc: Heiko Carstens Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: linuxppc-dev Fixes: 051f3ca02e46 "Introduce NUMA identity node sched domain" Link: http://lkml.kernel.org/r/1533920419-17410-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 56a0fed30c0a..505a41c42b96 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1295,7 +1295,7 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (sched_domains_numa_levels <= 1) { + if (sched_domains_numa_levels <= 2) { sched_numa_topology_type = NUMA_DIRECT; return; } @@ -1380,9 +1380,6 @@ void sched_init_numa(void) break; } - if (!level) - return; - /* * 'level' contains the number of unique distances * -- cgit v1.2.3 From 12b04875d666e83d27511df25580de84505bc758 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 31 Aug 2018 17:22:55 +0200 Subject: sched/pelt: Fix update_blocked_averages() for RT and DL classes update_blocked_averages() is called to periodiccally decay the stalled load of idle CPUs and to sync all loads before running load balance. When cfs rq is idle, it trigs a load balance during pick_next_task_fair() in order to potentially pull tasks and to use this newly idle CPU. This load balance happens whereas prev task from another class has not been put and its utilization updated yet. This may lead to wrongly account running time as idle time for RT or DL classes. Test that no RT or DL task is running when updating their utilization in update_blocked_averages(). We still update RT and DL utilization instead of simply skipping them to make sure that all metrics are synced when used during load balance. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 371bf4273269 ("sched/rt: Add rt_rq utilization tracking") Fixes: 3727e0e16340 ("sched/dl: Add dl_rq utilization tracking") Link: http://lkml.kernel.org/r/1535728975-22799-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b39fb596f6c1..8cff8d55ee95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7263,6 +7263,7 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; + const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7299,8 +7300,10 @@ static void update_blocked_averages(int cpu) if (cfs_rq_has_blocked(cfs_rq)) done = false; } - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7365,13 +7368,16 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; + const struct sched_class *curr_class; struct rq_flags rf; rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; -- cgit v1.2.3 From d0cdb3ce8834332d918fc9c8ff74f8a169ec9abe Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 31 Aug 2018 15:42:17 -0700 Subject: sched/fair: Fix vruntime_normalized() for remote non-migration wakeup When a task which previously ran on a given CPU is remotely queued to wake up on that same CPU, there is a period where the task's state is TASK_WAKING and its vruntime is not normalized. This is not accounted for in vruntime_normalized() which will cause an error in the task's vruntime if it is switched from the fair class during this time. For example if it is boosted to RT priority via rt_mutex_setprio(), rq->min_vruntime will not be subtracted from the task's vruntime but it will be added again when the task returns to the fair class. The task's vruntime will have been erroneously doubled and the effective priority of the task will be reduced. Note this will also lead to inflation of all vruntimes since the doubled vruntime value will become the rq's min_vruntime when other tasks leave the rq. This leads to repeated doubling of the vruntime and priority penalty. Fix this by recognizing a WAKING task's vruntime as normalized only if sched_remote_wakeup is true. This indicates a migration, in which case the vruntime would have been normalized in migrate_task_rq_fair(). Based on a similar patch from John Dias . Suggested-by: Peter Zijlstra Tested-by: Dietmar Eggemann Signed-off-by: Steve Muckle Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Redpath Cc: John Dias Cc: Linus Torvalds Cc: Miguel de Dios Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Paul Turner Cc: Quentin Perret Cc: Thomas Gleixner Cc: Todd Kjos Cc: kernel-team@android.com Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") Link: http://lkml.kernel.org/r/20180831224217.169476-1-smuckle@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cff8d55ee95..c6b7d6daab20 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9644,7 +9644,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_remote_wakeup)) return true; return false; -- cgit v1.2.3 From 287cdaac5700c5b8970d739f73d742d863d3e2ca Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 4 Sep 2018 11:36:26 +0200 Subject: sched/fair: Fix scale_rt_capacity() for SMT Since commit: 523e979d3164 ("sched/core: Use PELT for scale_rt_capacity()") scale_rt_capacity() returns the remaining capacity and not a scale factor to apply on cpu_capacity_orig. arch_scale_cpu() is directly called by scale_rt_capacity() so we must take the sched_domain argument. Reported-by: Srikar Dronamraju Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 523e979d3164 ("sched/core: Use PELT for scale_rt_capacity()") Link: http://lkml.kernel.org/r/20180904093626.GA23936@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c6b7d6daab20..f12d004be6a1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7488,10 +7488,10 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long scale_rt_capacity(int cpu) +static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + unsigned long max = arch_scale_cpu_capacity(sd, cpu); unsigned long used, free; unsigned long irq; @@ -7513,7 +7513,7 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(cpu); + unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); -- cgit v1.2.3 From bb3485c8ace6475c269b1aa2da674490f455f412 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 7 Sep 2018 09:51:04 +0200 Subject: sched/fair: Fix load_balance redo for !imbalance It can happen that load_balance() finds a busiest group and then a busiest rq but the calculated imbalance is in fact 0. In such situation, detach_tasks() returns immediately and lets the flag LBF_ALL_PINNED set. The busiest CPU is then wrongly assumed to have pinned tasks and removed from the load balance mask. then, we redo a load balance without the busiest CPU. This creates wrong load balance situation and generates wrong task migration. If the calculated imbalance is 0, it's useless to try to find a busiest rq as no task will be migrated and we can return immediately. This situation can happen with heterogeneous system or smp system when RT tasks are decreasing the capacity of some CPUs. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: jhugo@codeaurora.org Link: http://lkml.kernel.org/r/1536306664-29827-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f12d004be6a1..fc9a484ef82b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8275,7 +8275,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); - return sds.busiest; + return env->imbalance ? sds.busiest : NULL; out_balanced: env->imbalance = 0; -- cgit v1.2.3 From da260fe12330be8b003c2ab07a112704163ea675 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 7 Sep 2018 12:35:21 +0200 Subject: jump_label: Fix typo in warning message There's no 'allocatote' - use the next best thing: 'allocate' :-) Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Jason Baron Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180907103521.31344-1-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01ebdf1f9f40..2e62503bea0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -678,7 +678,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { - WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); + WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; -- cgit v1.2.3 From 882a78a9f39f5535b209b4aa0a1741e35b8c67fb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Sep 2018 12:53:17 -0700 Subject: sched/fair: Fix kernel-doc notation warning Fix kernel-doc warning for missing 'flags' parameter description: ../kernel/sched/fair.c:3371: warning: Function parameter or member 'flags' not described in 'attach_entity_load_avg' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ea14b57e8a18 ("sched/cpufreq: Provide migration hint") Link: http://lkml.kernel.org/r/cdda0d42-880d-4229-a9f7-5899c977a063@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc9a484ef82b..f808ddf2a868 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3362,6 +3362,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach + * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. -- cgit v1.2.3 From dc5591a03f1d6dae6b11cdf1d74b023f7ac0fdbf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 28 Aug 2018 21:33:15 +0100 Subject: locking/lockdep: Delete unnecessary #include Commit: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints and unify their usage") added the inclusion of . liblockdep doesn't have a stub version of that header so now fails to build. However, commit: bff1b208a5d1 ("tracing: Partial revert of "tracing: Centralize preemptirq tracepoints and unify their usage"") removed the use of functions declared in that header. So delete the #include. Signed-off-by: Ben Hutchings Cc: Joel Fernandes Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Fixes: bff1b208a5d1 ("tracing: Partial revert of "tracing: Centralize ...") Fixes: c3bc8fd637a9 ("tracing: Centralize preemptirq tracepoints ...") Link: http://lkml.kernel.org/r/20180828203315.GD18030@decadent.org.uk Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e406c5fdb41e..dd13f865ad40 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,7 +55,6 @@ #include "lockdep_internals.h" -#include #define CREATE_TRACE_POINTS #include -- cgit v1.2.3 From 0b405c65ad459f5f4d3db1672246172bd19d946d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 24 Aug 2018 12:22:35 +0100 Subject: locking/ww_mutex: Fix spelling mistake "cylic" -> "cyclic" Trivial fix to spelling mistake in pr_err() error message Signed-off-by: Colin Ian King Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/20180824112235.8842-1-colin.king@canonical.com Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5b915b370d5a..0be047dbd897 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -324,7 +324,7 @@ static int __test_cycle(unsigned int nthreads) if (!cycle->result) continue; - pr_err("cylic deadlock not resolved, ret[%d/%d] = %d\n", + pr_err("cyclic deadlock not resolved, ret[%d/%d] = %d\n", n, nthreads, cycle->result); ret = -EINVAL; break; -- cgit v1.2.3 From 02e184476eff848273826c1d6617bb37e5bcc7ad Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Thu, 23 Aug 2018 15:59:35 -0700 Subject: perf/core: Force USER_DS when recording user stack data Perf can record user stack data in response to a synchronous request, such as a tracepoint firing. If this happens under set_fs(KERNEL_DS), then we end up reading user stack data using __copy_from_user_inatomic() under set_fs(KERNEL_DS). I think this conflicts with the intention of using set_fs(KERNEL_DS). And it is explicitly forbidden by hardware on ARM64 when both CONFIG_ARM64_UAO and CONFIG_ARM64_PAN are used. So fix this by forcing USER_DS when recording user stack data. Signed-off-by: Yabin Cui Acked-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 88b0193d9418 ("perf/callchain: Force USER_DS when invoking perf_callchain_user()") Link: http://lkml.kernel.org/r/20180823225935.27035-1-yabinc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index abaed4f8bb7f..c80549bf82c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5943,6 +5943,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; + mm_segment_t fs; /* * We dump: @@ -5960,7 +5961,10 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); + fs = get_fs(); + set_fs(USER_DS); rem = __output_copy_user(handle, (void *) sp, dump_size); + set_fs(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); -- cgit v1.2.3 From a6ae928c25835ca18deb4a527079f169b68ed292 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 10 Sep 2018 15:52:06 +0200 Subject: Revert "printk: make sure to print log on console." This reverts commit 375899cddcbb26881b03cb3fbdcfd600e4e67f4a. The visibility of early messages did not longer take into account "quiet", "debug", and "loglevel" early parameters. It would be possible to invalidate and recompute LOG_NOCONS flag for the affected messages. But it would be hairy. Instead this patch just reverts the problematic commit. We could come up with a better solution for the original problem. For example, we could simplify the logic and just mark messages that should always be visible or always invisible on the console. Also this patch reverts the related build fix commit ffaa619af1b06 ("printk: Fix warning about unused suppress_message_printing"). Finally, this patch does not put back the unused LOG_NOCONS flag. Link: http://lkml.kernel.org/r/20180910145747.emvfzv4mzlk5dfqk@pathway.suse.cz Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Cc: Steven Rostedt Cc: Maninder Singh Reported-by: Hans de Goede Acked-by: Hans de Goede Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a63aeeaaf5d..e30e5023511b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -349,7 +349,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; */ enum log_flags { - LOG_NOCONS = 1, /* suppress print, do not print to console */ LOG_NEWLINE = 2, /* text ended with a newline */ LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ @@ -1879,9 +1878,6 @@ int vprintk_store(int facility, int level, if (dict) lflags |= LOG_PREFIX|LOG_NEWLINE; - if (suppress_message_printing(level)) - lflags |= LOG_NOCONS; - return log_output(facility, level, lflags, dict, dictlen, text, text_len); } @@ -2030,6 +2026,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } +static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2365,10 +2362,11 @@ skip: break; msg = log_from_idx(console_idx); - if (msg->flags & LOG_NOCONS) { + if (suppress_message_printing(msg->level)) { /* - * Skip record if !ignore_loglevel, and - * record has level above the console loglevel. + * Skip record we have buffered and already printed + * directly to the console when we received it, and + * record that has level above the console loglevel. */ console_idx = log_next(console_idx); console_seq++; -- cgit v1.2.3 From 4b1c5d917d34f705096bb7dd8a2bd19b0881970e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 12 Sep 2018 10:29:11 -0700 Subject: bpf: btf: Fix end boundary calculation for type section The end boundary math for type section is incorrect in btf_check_all_metas(). It just happens that hdr->type_off is always 0 for now because there are only two sections (type and string) and string section must be at the end (ensured in btf_parse_str_sec). However, type_off may not be 0 if a new section would be added later. This patch fixes it. Fixes: f80442a4cd18 ("bpf: btf: Change how section is supported in btf_header") Reported-by: Dmitry Vyukov Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2590700237c1..138f0302692e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1844,7 +1844,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) hdr = &btf->hdr; cur = btf->nohdr_data + hdr->type_off; - end = btf->nohdr_data + hdr->type_len; + end = cur + hdr->type_len; env->log_type_id = 1; while (cur < end) { -- cgit v1.2.3 From dd066823db2ac4e22f721ec85190817b58059a54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 12 Sep 2018 14:06:10 -0700 Subject: bpf/verifier: disallow pointer subtraction Subtraction of pointers was accidentally allowed for unpriv programs by commit 82abbf8d2fc4. Revert that part of commit. Fixes: 82abbf8d2fc4 ("bpf: do not allow root to mangle valid pointers") Reported-by: Jann Horn Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..bb07e74b34a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3163,7 +3163,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. Disallow all math except * pointer subtraction */ - if (opcode == BPF_SUB){ + if (opcode == BPF_SUB && env->allow_ptr_leaks) { mark_reg_unknown(env, regs, insn->dst_reg); return 0; } -- cgit v1.2.3 From 83f365554e47997ec68dc4eca3f5dce525cd15c3 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 7 Sep 2018 15:31:29 -0700 Subject: ring-buffer: Allow for rescheduling when removing pages When reducing ring buffer size, pages are removed by scheduling a work item on each CPU for the corresponding CPU ring buffer. After the pages are removed from ring buffer linked list, the pages are free()d in a tight loop. The loop does not give up CPU until all pages are removed. In a worst case behavior, when lot of pages are to be freed, it can cause system stall. After the pages are removed from the list, the free() can happen while the work is rescheduled. Call cond_resched() in the loop to prevent the system hangup. Link: http://lkml.kernel.org/r/20180907223129.71994-1-vnagarnaik@google.com Cc: stable@vger.kernel.org Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Reported-by: Jason Behmer Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d92d4a982fd..65bd4616220d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1546,6 +1546,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) tmp_iter_page = first_page; do { + cond_resched(); + to_remove_page = tmp_iter_page; rb_inc_page(cpu_buffer, &tmp_iter_page); -- cgit v1.2.3 From f83606f5eb007adc33bc8541ede00590f477bdeb Mon Sep 17 00:00:00 2001 From: KJ Tsanaktsidis Date: Thu, 20 Sep 2018 12:22:25 -0700 Subject: fork: report pid exhaustion correctly Make the clone and fork syscalls return EAGAIN when the limit on the number of pids /proc/sys/kernel/pid_max is exceeded. Currently, when the pid_max limit is exceeded, the kernel will return ENOSPC from the fork and clone syscalls. This is contrary to the documented behaviour, which explicitly calls out the pid_max case as one where EAGAIN should be returned. It also leads to really confusing error messages in userspace programs which will complain about a lack of disk space when they fail to create processes/threads for this reason. This error is being returned because alloc_pid() uses the idr api to find a new pid; when there are none available, idr_alloc_cyclic() returns -ENOSPC, and this is being propagated back to userspace. This behaviour has been broken before, and was explicitly fixed in commit 35f71bc0a09a ("fork: report pid reservation failure properly"), so I think -EAGAIN is definitely the right thing to return in this case. The current behaviour change dates from commit 95846ecf9dac ("pid: replace pid bitmap implementation with IDR AIP") and was I believe unintentional. This patch has no impact on the case where allocating a pid fails because the child reaper for the namespace is dead; that case will still return -ENOMEM. Link: http://lkml.kernel.org/r/20180903111016.46461-1-ktsanaktsidis@zendesk.com Fixes: 95846ecf9dac ("pid: replace pid bitmap implementation with IDR AIP") Signed-off-by: KJ Tsanaktsidis Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Gargi Sharma Cc: Rik van Riel Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index de1cfc4f75a2..cdf63e53a014 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -195,7 +195,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) idr_preload_end(); if (nr < 0) { - retval = nr; + retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } -- cgit v1.2.3 From 3bf181bc5d8bc86f04ffd538d7fda9e69af1f2c2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 20 Sep 2018 12:22:43 -0700 Subject: kernel/sys.c: remove duplicated include Link: http://lkml.kernel.org/r/20180821133424.18716-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- kernel/sys.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index cf5c67533ff1..123bd73046ec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -71,9 +71,6 @@ #include #include -/* Hardening for Spectre-v1 */ -#include - #include "uid16.h" #ifndef SET_UNALIGN_CTL -- cgit v1.2.3 From 5607fff303636d48b88414c6be353d9fed700af2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:44 -0700 Subject: bpf: sockmap only allow ESTABLISHED sock state After this patch we only allow socks that are in ESTABLISHED state or are being added via a sock_ops event that is transitioning into an ESTABLISHED state. By allowing sock_ops events we allow users to manage sockmaps directly from sock ops programs. The two supported sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB. Similar to TLS ULP this ensures sk_user_data is correct. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..1f97b559892a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2453,6 +2457,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2543,10 +2557,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. + */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2565,6 +2591,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } -- cgit v1.2.3 From b05545e15e1ff1d6a6a8593971275f9cc3e6b92b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:49 -0700 Subject: bpf: sockmap, fix transition through disconnect without close It is possible (via shutdown()) for TCP socks to go trough TCP_CLOSE state via tcp_disconnect() without actually calling tcp_close which would then call our bpf_tcp_close() callback. Because of this a user could disconnect a socket then put it in a LISTEN state which would break our assumptions about sockets always being ESTABLISHED state. To resolve this rely on the unhash hook, which is called in the disconnect case, to remove the sock from the sockmap. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 60 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1f97b559892a..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); -- cgit v1.2.3 From 974c24c5bed75b53e229a6f68a0533b6d5f48feb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Sep 2018 11:00:49 +0200 Subject: dma-mapping: add the missing ARCH_HAS_SYNC_DMA_FOR_CPU_ALL declaration The patch adding the infrastructure failed to actually add the symbol declaration, oops.. Fixes: faef87723a ("dma-noncoherent: add a arch_sync_dma_for_cpu_all hook") Signed-off-by: Christoph Hellwig Reviewed-by: Paul Burton Acked-by: Florian Fainelli Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 9bd54304446f..1b1d63b3634b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -23,6 +23,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU bool select NEED_DMA_MAP_STATE +config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL + bool + config DMA_DIRECT_OPS bool depends on HAS_DMA -- cgit v1.2.3 From 4288ea006c73e37c2a4f60dfaef20dd167b8df31 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:33:21 +0100 Subject: bpf: harden flags check in cgroup_storage_update_elem() cgroup_storage_update_elem() shouldn't accept any flags argument values except BPF_ANY and BPF_EXIST to guarantee the backward compatibility, had a new flag value been added. Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Roman Gushchin Reported-by: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..94126cbffc88 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -129,7 +129,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags & BPF_NOEXIST) + if (flags != BPF_ANY && flags != BPF_EXIST) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, -- cgit v1.2.3 From befb1b3c2703897c5b8ffb0044dc5d0e5f27c5d7 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Wed, 19 Sep 2018 10:29:06 -0700 Subject: perf/core: Add sanity check to deal with pinned event failure It is possible that a failure can occur during the scheduling of a pinned event. The initial portion of perf_event_read_local() contains the various error checks an event should pass before it can be considered valid. Ensure that the potential scheduling failure of a pinned event is checked for and have a credible error. Suggested-by: Peter Zijlstra Signed-off-by: Reinette Chatre Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Cc: acme@kernel.org Cc: gavin.hindman@intel.com Cc: jithu.joseph@intel.com Cc: dave.hansen@intel.com Cc: hpa@zytor.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/6486385d1f30336e9973b24c8c65f5079543d3d3.1537377064.git.reinette.chatre@intel.com --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c80549bf82c6..dcb093e7b377 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3935,6 +3935,12 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* If this is a pinned event it must be running on this CPU */ + if (event->attr.pinned && event->oncpu != smp_processor_id()) { + ret = -EBUSY; + goto out; + } + /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise -- cgit v1.2.3 From a9f9772114c8b07ae75bcb3654bd017461248095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Sep 2018 17:58:35 +0200 Subject: perf/core: Fix perf_pmu_unregister() locking When we unregister a PMU, we fail to serialize the @pmu_idr properly. Fix that by doing the entire thing under pmu_lock. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 2e80a82a49c4 ("perf: Dynamic pmu types") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dcb093e7b377..dfb1d951789e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9431,9 +9431,7 @@ static void free_pmu_context(struct pmu *pmu) if (pmu->task_ctx_nr > perf_invalid_context) return; - mutex_lock(&pmus_lock); free_percpu(pmu->pmu_cpu_context); - mutex_unlock(&pmus_lock); } /* @@ -9689,12 +9687,8 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - int remove_device; - mutex_lock(&pmus_lock); - remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); - mutex_unlock(&pmus_lock); /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -9706,13 +9700,14 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (remove_device) { + if (pmu_bus_running) { if (pmu->nr_addr_filters) device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); } free_pmu_context(pmu); + mutex_unlock(&pmus_lock); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); -- cgit v1.2.3 From cd6fb677ce7e460c25bdd66f689734102ec7d642 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 23 Sep 2018 18:13:43 +0200 Subject: perf/ring_buffer: Prevent concurent ring buffer access Some of the scheduling tracepoints allow the perf_tp_event code to write to ring buffer under different cpu than the code is running on. This results in corrupted ring buffer data demonstrated in following perf commands: # perf record -e 'sched:sched_switch,sched:sched_wakeup' perf bench sched messaging # Running 'sched/messaging' benchmark: # 20 sender and receiver processes per group # 10 groups == 400 processes run Total time: 0.383 [sec] [ perf record: Woken up 8 times to write data ] 0x42b890 [0]: failed to process type: -1765585640 [ perf record: Captured and wrote 4.825 MB perf.data (29669 samples) ] # perf report --stdio 0x42b890 [0]: failed to process type: -1765585640 The reason for the corruption are some of the scheduling tracepoints, that have __perf_task dfined and thus allow to store data to another cpu ring buffer: sched_waking sched_wakeup sched_wakeup_new sched_stat_wait sched_stat_sleep sched_stat_iowait sched_stat_blocked The perf_tp_event function first store samples for current cpu related events defined for tracepoint: hlist_for_each_entry_rcu(event, head, hlist_entry) perf_swevent_event(event, count, &data, regs); And then iterates events of the 'task' and store the sample for any task's event that passes tracepoint checks: ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) continue; perf_swevent_event(event, count, &data, regs); } Above code can race with same code running on another cpu, ending up with 2 cpus trying to store under the same ring buffer, which is specifically not allowed. This patch prevents the problem, by allowing only events with the same current cpu to receive the event. NOTE: this requires the use of (per-task-)per-cpu buffers for this feature to work; perf-record does this. Signed-off-by: Jiri Olsa [peterz: small edits to Changelog] Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andrew Vagin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: e6dab5ffab59 ("perf/trace: Add ability to set a target task for events") Link: http://lkml.kernel.org/r/20180923161343.GB15054@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dfb1d951789e..5a97f34bc14c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8314,6 +8314,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->cpu != smp_processor_id()) + continue; if (event->attr.type != PERF_TYPE_TRACEPOINT) continue; if (event->attr.config != entry->type) -- cgit v1.2.3 From a4739eca4456e3d140cc656c5331d42b7465f91d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:18:56 +0530 Subject: sched/numa: Stop multiple tasks from moving to the CPU at the same time Task migration under NUMA balancing can happen in parallel. More than one task might choose to migrate to the same CPU at the same time. This can result in: - During task swap, choosing a task that was not part of the evaluation. - During task swap, task which just got moved into its preferred node, moving to a completely different node. - During task swap, task failing to move to the preferred node, will have to wait an extra interval for the next migrate opportunity. - During task movement, multiple task movements can cause load imbalance. This problem is more likely if there are more cores per node or more nodes in the system. Use a per run-queue variable to check if NUMA-balance is active on the run-queue. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 200194 203353 1.57797 1 311331 328205 5.41995 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 197654 214384 8.46429 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 192605 188553 -2.10379 1 213402 196273 -8.02664 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 52227.1 57581.2 10.2516 1 102529 103468 0.915838 There is a regression on power 9 box. If we look at the details, that box has a sudden jump in cache-misses with this patch. All other parameters seem to be pointing towards NUMA consolidation. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,345,784 13,941,377 migrations 1,127,820 1,157,323 faults 374,736 382,175 cache-misses 55,132,054,603 54,993,823,500 sched:sched_move_numa 1,923 2,005 sched:sched_stick_numa 52 14 sched:sched_swap_numa 595 529 migrate:mm_migrate_pages 1,932 1,573 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 60605 67099 numa_hint_faults_local 51804 58456 numa_hit 239945 240416 numa_huge_pte_updates 14 18 numa_interleave 60 65 numa_local 239865 240339 numa_other 80 77 numa_pages_migrated 1931 1574 numa_pte_updates 67823 77182 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,016,467 3,176,453 migrations 37,326 30,238 faults 115,342 87,869 cache-misses 11,692,155,554 12,544,479,391 sched:sched_move_numa 965 23 sched:sched_stick_numa 8 0 sched:sched_swap_numa 35 6 migrate:mm_migrate_pages 1,168 10 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 16286 236 numa_hint_faults_local 11863 201 numa_hit 112482 72293 numa_huge_pte_updates 33 0 numa_interleave 20 26 numa_local 112419 72233 numa_other 63 60 numa_pages_migrated 1144 8 numa_pte_updates 32859 0 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,629,724 8,478,820 migrations 221,052 171,323 faults 308,661 307,499 cache-misses 135,574,913 240,353,599 sched:sched_move_numa 147 214 sched:sched_stick_numa 0 0 sched:sched_swap_numa 2 4 migrate:mm_migrate_pages 64 89 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 11481 5301 numa_hint_faults_local 10968 4745 numa_hit 89773 92943 numa_huge_pte_updates 0 0 numa_interleave 1116 899 numa_local 89220 92345 numa_other 553 598 numa_pages_migrated 62 88 numa_pte_updates 11694 5505 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,272,887 2,066,172 migrations 12,206 11,076 faults 163,704 149,544 cache-misses 4,801,186 10,398,067 sched:sched_move_numa 44 43 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 17 6 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 2261 3552 numa_hint_faults_local 1993 3347 numa_hit 25726 25611 numa_huge_pte_updates 0 0 numa_interleave 239 213 numa_local 25498 25583 numa_other 228 28 numa_pages_migrated 17 6 numa_pte_updates 2266 3535 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 117,980,962 99,358,136 migrations 3,950,220 4,041,607 faults 736,979 749,653 cache-misses 224,976,072,879 225,562,543,251 sched:sched_move_numa 504 771 sched:sched_stick_numa 50 14 sched:sched_swap_numa 239 204 migrate:mm_migrate_pages 1,260 1,180 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 18293 27409 numa_hint_faults_local 11969 20677 numa_hit 240854 239988 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 240851 239983 numa_other 3 5 numa_pages_migrated 1190 1016 numa_pte_updates 18106 27916 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 61,053,158 60,899,307 migrations 551,586 544,668 faults 244,174 270,834 cache-misses 74,326,766,973 74,543,455,635 sched:sched_move_numa 344 735 sched:sched_stick_numa 24 25 sched:sched_swap_numa 140 174 migrate:mm_migrate_pages 568 816 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 6461 11059 numa_hint_faults_local 2283 4733 numa_hit 35661 41384 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 35661 41383 numa_other 0 1 numa_pages_migrated 568 815 numa_pte_updates 6518 11323 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 22 ++++++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..3b0b75de1141 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1514,6 +1514,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. + */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1569,6 +1584,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1710,6 +1728,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1811,14 +1830,17 @@ static int task_numa_migrate(struct task_struct *p) */ p->numa_scan_period = task_scan_start(p); + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4a2e8cae63c4..0b9161241bda 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -783,6 +783,7 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; + unsigned int numa_migrate_on; #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; -- cgit v1.2.3 From 1327237a5978b00bcc665c33046c9bae75da1154 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:18:57 +0530 Subject: sched/numa: Pass destination CPU as a parameter to migrate_task_rq This additional parameter (new_cpu) is used later for identifying if task migration is across nodes. No functional change. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 203353 200668 -1.32036 1 328205 321791 -1.95427 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 214384 204848 -4.44809 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 188553 188098 -0.241311 1 196273 200351 2.07772 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 57581.2 58145.9 0.980702 1 103468 103798 0.318939 Brings out the variance between different specjbb2005 runs. Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,941,377 13,912,183 migrations 1,157,323 1,155,931 faults 382,175 367,139 cache-misses 54,993,823,500 54,240,196,814 sched:sched_move_numa 2,005 1,571 sched:sched_stick_numa 14 9 sched:sched_swap_numa 529 463 migrate:mm_migrate_pages 1,573 703 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 67099 50155 numa_hint_faults_local 58456 45264 numa_hit 240416 239652 numa_huge_pte_updates 18 36 numa_interleave 65 68 numa_local 240339 239576 numa_other 77 76 numa_pages_migrated 1574 680 numa_pte_updates 77182 71146 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,176,453 3,156,720 migrations 30,238 30,354 faults 87,869 97,261 cache-misses 12,544,479,391 12,400,026,826 sched:sched_move_numa 23 4 sched:sched_stick_numa 0 0 sched:sched_swap_numa 6 1 migrate:mm_migrate_pages 10 20 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 236 272 numa_hint_faults_local 201 186 numa_hit 72293 71362 numa_huge_pte_updates 0 0 numa_interleave 26 23 numa_local 72233 71299 numa_other 60 63 numa_pages_migrated 8 2 numa_pte_updates 0 0 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,478,820 8,606,824 migrations 171,323 155,352 faults 307,499 301,409 cache-misses 240,353,599 157,759,224 sched:sched_move_numa 214 168 sched:sched_stick_numa 0 0 sched:sched_swap_numa 4 3 migrate:mm_migrate_pages 89 125 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 5301 4650 numa_hint_faults_local 4745 3946 numa_hit 92943 90489 numa_huge_pte_updates 0 0 numa_interleave 899 892 numa_local 92345 90034 numa_other 598 455 numa_pages_migrated 88 124 numa_pte_updates 5505 4818 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,066,172 2,113,167 migrations 11,076 10,533 faults 149,544 142,727 cache-misses 10,398,067 5,594,192 sched:sched_move_numa 43 10 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 6 6 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 3552 744 numa_hint_faults_local 3347 584 numa_hit 25611 25551 numa_huge_pte_updates 0 0 numa_interleave 213 263 numa_local 25583 25302 numa_other 28 249 numa_pages_migrated 6 6 numa_pte_updates 3535 744 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 99,358,136 101,227,352 migrations 4,041,607 4,151,829 faults 749,653 745,233 cache-misses 225,562,543,251 224,669,561,766 sched:sched_move_numa 771 617 sched:sched_stick_numa 14 2 sched:sched_swap_numa 204 187 migrate:mm_migrate_pages 1,180 316 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 27409 24195 numa_hint_faults_local 20677 21639 numa_hit 239988 238331 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 239983 238331 numa_other 5 0 numa_pages_migrated 1016 204 numa_pte_updates 27916 24561 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 60,899,307 62,738,978 migrations 544,668 562,702 faults 270,834 228,465 cache-misses 74,543,455,635 75,778,067,952 sched:sched_move_numa 735 648 sched:sched_stick_numa 25 13 sched:sched_swap_numa 174 137 migrate:mm_migrate_pages 816 733 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 11059 10281 numa_hint_faults_local 4733 3242 numa_hit 41384 36338 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 41383 36338 numa_other 1 0 numa_pages_migrated 815 706 numa_pte_updates 11323 10176 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..ad97f3ba5ec5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_cpu(p) != new_cpu) { if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p); + p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 997ea7b839fa..91e4202b0634 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1607,7 +1607,7 @@ out: return cpu; } -static void migrate_task_rq_dl(struct task_struct *p) +static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b0b75de1141..bc768156239f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6297,7 +6297,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unused) { /* * As blocked tasks retain absolute vruntime the migration needs to diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0b9161241bda..455fa330de04 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1524,7 +1524,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p); + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); -- cgit v1.2.3 From 3f9672baaa70fc62765857f13f007feb01f9ad33 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:18:58 +0530 Subject: sched/numa: Reset scan rate whenever task moves across nodes Currently task scan rate is reset when NUMA balancer migrates the task to a different node. If NUMA balancer initiates a swap, reset is only applicable to the task that initiates the swap. Similarly no scan rate reset is done if the task is migrated across nodes by traditional load balancer. Instead move the scan reset to the migrate_task_rq. This ensures the task moved out of its preferred node, either gets back to its preferred node quickly or finds a new preferred node. Doing so, would be fair to all tasks migrating across nodes. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 200668 203370 1.3465 1 321791 328431 2.06345 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 204848 206070 0.59654 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 188098 188386 0.153112 1 200351 201566 0.606436 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 58145.9 59157.4 1.73959 1 103798 105495 1.63491 Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,912,183 13,825,492 migrations 1,155,931 1,152,509 faults 367,139 371,948 cache-misses 54,240,196,814 55,654,206,041 sched:sched_move_numa 1,571 1,856 sched:sched_stick_numa 9 4 sched:sched_swap_numa 463 428 migrate:mm_migrate_pages 703 898 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 50155 57146 numa_hint_faults_local 45264 51612 numa_hit 239652 238164 numa_huge_pte_updates 36 16 numa_interleave 68 63 numa_local 239576 238085 numa_other 76 79 numa_pages_migrated 680 883 numa_pte_updates 71146 67540 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,156,720 3,288,525 migrations 30,354 38,652 faults 97,261 111,678 cache-misses 12,400,026,826 12,111,197,376 sched:sched_move_numa 4 900 sched:sched_stick_numa 0 0 sched:sched_swap_numa 1 5 migrate:mm_migrate_pages 20 714 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 272 18572 numa_hint_faults_local 186 14850 numa_hit 71362 73197 numa_huge_pte_updates 0 11 numa_interleave 23 25 numa_local 71299 73138 numa_other 63 59 numa_pages_migrated 2 712 numa_pte_updates 0 24021 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,606,824 8,451,543 migrations 155,352 202,804 faults 301,409 310,024 cache-misses 157,759,224 253,522,507 sched:sched_move_numa 168 213 sched:sched_stick_numa 0 0 sched:sched_swap_numa 3 2 migrate:mm_migrate_pages 125 88 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 4650 11830 numa_hint_faults_local 3946 11301 numa_hit 90489 90038 numa_huge_pte_updates 0 0 numa_interleave 892 855 numa_local 90034 89796 numa_other 455 242 numa_pages_migrated 124 88 numa_pte_updates 4818 12039 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,113,167 2,049,153 migrations 10,533 11,405 faults 142,727 162,309 cache-misses 5,594,192 7,203,343 sched:sched_move_numa 10 22 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 6 1 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 744 1693 numa_hint_faults_local 584 1669 numa_hit 25551 25177 numa_huge_pte_updates 0 0 numa_interleave 263 194 numa_local 25302 24993 numa_other 249 184 numa_pages_migrated 6 1 numa_pte_updates 744 1577 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 101,227,352 94,515,937 migrations 4,151,829 4,203,554 faults 745,233 832,697 cache-misses 224,669,561,766 226,248,698,331 sched:sched_move_numa 617 1,730 sched:sched_stick_numa 2 14 sched:sched_swap_numa 187 432 migrate:mm_migrate_pages 316 1,398 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 24195 80079 numa_hint_faults_local 21639 68620 numa_hit 238331 241187 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 238331 241186 numa_other 0 1 numa_pages_migrated 204 1347 numa_pte_updates 24561 80729 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 62,738,978 63,704,961 migrations 562,702 573,404 faults 228,465 230,878 cache-misses 75,778,067,952 76,568,222,781 sched:sched_move_numa 648 509 sched:sched_stick_numa 13 31 sched:sched_swap_numa 137 182 migrate:mm_migrate_pages 733 541 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 10281 8501 numa_hint_faults_local 3242 2960 numa_hit 36338 35526 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 36338 35526 numa_other 0 0 numa_pages_migrated 706 539 numa_pte_updates 10176 8433 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc768156239f..5cbfb3068bc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1824,12 +1824,6 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); @@ -2618,6 +2612,18 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid != dst_nid) + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2631,6 +2637,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -6297,7 +6307,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unused) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6350,6 +6360,8 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unus /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) -- cgit v1.2.3 From 05cbdf4f5c191ff378c47bbf66d7230beb725bdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 21 Sep 2018 23:18:59 +0530 Subject: sched/numa: Limit the conditions where scan period is reset migrate_task_rq_fair() resets the scan rate for NUMA balancing on every cross-node migration. In the event of excessive load balancing due to saturation, this may result in the scan rate being pegged at maximum and further overloading the machine. This patch only resets the scan if NUMA balancing is active, a preferred node has been selected and the task is being migrated from the preferred node as these are the most harmful. For example, a migration to the preferred node does not justify a faster scan rate. Similarly, a migration between two nodes that are not preferred is probably bouncing due to over-saturation of the machine. In that case, scanning faster and trapping more NUMA faults will further overload the machine. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 203370 205332 0.964744 1 328431 319785 -2.63252 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 1 206070 206585 0.249915 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 188386 189162 0.41192 1 201566 213760 6.04963 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 59157.4 58736.8 -0.710985 1 105495 105419 -0.0720413 Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,825,492 14,285,708 migrations 1,152,509 1,180,621 faults 371,948 339,114 cache-misses 55,654,206,041 55,205,631,894 sched:sched_move_numa 1,856 843 sched:sched_stick_numa 4 6 sched:sched_swap_numa 428 219 migrate:mm_migrate_pages 898 365 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 57146 26907 numa_hint_faults_local 51612 24279 numa_hit 238164 239771 numa_huge_pte_updates 16 0 numa_interleave 63 68 numa_local 238085 239688 numa_other 79 83 numa_pages_migrated 883 363 numa_pte_updates 67540 27415 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,288,525 3,202,779 migrations 38,652 37,186 faults 111,678 106,076 cache-misses 12,111,197,376 12,024,873,744 sched:sched_move_numa 900 931 sched:sched_stick_numa 0 0 sched:sched_swap_numa 5 1 migrate:mm_migrate_pages 714 637 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 18572 17409 numa_hint_faults_local 14850 14367 numa_hit 73197 73953 numa_huge_pte_updates 11 20 numa_interleave 25 25 numa_local 73138 73892 numa_other 59 61 numa_pages_migrated 712 668 numa_pte_updates 24021 27276 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,451,543 8,474,013 migrations 202,804 254,934 faults 310,024 320,506 cache-misses 253,522,507 110,580,458 sched:sched_move_numa 213 725 sched:sched_stick_numa 0 0 sched:sched_swap_numa 2 7 migrate:mm_migrate_pages 88 145 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 11830 22797 numa_hint_faults_local 11301 21539 numa_hit 90038 89308 numa_huge_pte_updates 0 0 numa_interleave 855 865 numa_local 89796 88955 numa_other 242 353 numa_pages_migrated 88 149 numa_pte_updates 12039 22930 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,049,153 2,195,628 migrations 11,405 11,179 faults 162,309 149,656 cache-misses 7,203,343 8,117,515 sched:sched_move_numa 22 49 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 1 5 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 1693 3577 numa_hint_faults_local 1669 3476 numa_hit 25177 26142 numa_huge_pte_updates 0 0 numa_interleave 194 358 numa_local 24993 26042 numa_other 184 100 numa_pages_migrated 1 5 numa_pte_updates 1577 3587 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 94,515,937 100,602,296 migrations 4,203,554 4,135,630 faults 832,697 789,256 cache-misses 226,248,698,331 226,160,621,058 sched:sched_move_numa 1,730 1,366 sched:sched_stick_numa 14 16 sched:sched_swap_numa 432 374 migrate:mm_migrate_pages 1,398 1,350 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 80079 47857 numa_hint_faults_local 68620 39768 numa_hit 241187 240165 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 241186 240165 numa_other 1 0 numa_pages_migrated 1347 1224 numa_pte_updates 80729 48354 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 63,704,961 58,515,496 migrations 573,404 564,845 faults 230,878 245,807 cache-misses 76,568,222,781 73,603,757,976 sched:sched_move_numa 509 996 sched:sched_stick_numa 31 10 sched:sched_swap_numa 182 193 migrate:mm_migrate_pages 541 646 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 8501 13422 numa_hint_faults_local 2960 5619 numa_hit 35526 36118 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 35526 36116 numa_other 0 2 numa_pages_migrated 539 616 numa_pte_updates 8433 13374 Signed-off-by: Mel Gorman Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-5-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cbfb3068bc6..3529bf61826b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2617,11 +2617,32 @@ static void update_scan_period(struct task_struct *p, int new_cpu) int src_nid = cpu_to_node(task_cpu(p)); int dst_nid = cpu_to_node(new_cpu); + if (!static_branch_likely(&sched_numa_balancing)) + return; + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) return; - if (src_nid != dst_nid) - p->numa_scan_period = task_scan_start(p); + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); } #else -- cgit v1.2.3 From 6fd98e775f24fd41520928d345f5db3ff52bb35d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 21 Sep 2018 23:19:01 +0530 Subject: sched/numa: Avoid task migration for small NUMA improvement If NUMA improvement from the task migration is going to be very minimal, then avoid task migration. Specjbb2005 results (8 warehouses) Higher bops are better 2 Socket - 2 Node Haswell - X86 JVMS Prev Current %Change 4 198512 205910 3.72673 1 313559 318491 1.57291 2 Socket - 4 Node Power8 - PowerNV JVMS Prev Current %Change 8 74761.9 74935.9 0.232739 1 214874 226796 5.54837 2 Socket - 2 Node Power9 - PowerNV JVMS Prev Current %Change 4 180536 189780 5.12031 1 210281 205695 -2.18089 4 Socket - 4 Node Power7 - PowerVM JVMS Prev Current %Change 8 56511.4 60370 6.828 1 104899 108100 3.05151 1/7 cases is regressing, if we look at events migrate_pages seem to vary the most especially in the regressing case. Also some amount of variance is expected between different runs of Specjbb2005. Some events stats before and after applying the patch. perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 13,818,546 13,801,554 migrations 1,149,960 1,151,541 faults 385,583 433,246 cache-misses 55,259,546,768 55,168,691,835 sched:sched_move_numa 2,257 2,551 sched:sched_stick_numa 9 24 sched:sched_swap_numa 512 904 migrate:mm_migrate_pages 2,225 1,571 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 72692 113682 numa_hint_faults_local 62270 102163 numa_hit 238762 240181 numa_huge_pte_updates 48 36 numa_interleave 75 64 numa_local 238676 240103 numa_other 86 78 numa_pages_migrated 2225 1564 numa_pte_updates 98557 134080 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After cs 3,173,490 3,079,150 migrations 36,966 31,455 faults 108,776 99,081 cache-misses 12,200,075,320 11,588,126,740 sched:sched_move_numa 1,264 1 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 0 migrate:mm_migrate_pages 899 36 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Haswell - X86 Event Before After numa_hint_faults 21109 430 numa_hint_faults_local 17120 77 numa_hit 72934 71277 numa_huge_pte_updates 42 0 numa_interleave 33 22 numa_local 72866 71218 numa_other 68 59 numa_pages_migrated 915 23 numa_pte_updates 42326 0 perf stats 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 8,312,022 8,707,565 migrations 231,705 171,342 faults 310,242 310,820 cache-misses 402,324,573 136,115,400 sched:sched_move_numa 193 215 sched:sched_stick_numa 0 6 sched:sched_swap_numa 3 24 migrate:mm_migrate_pages 93 162 vmstat 8th warehouse Multi JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 11838 8985 numa_hint_faults_local 11216 8154 numa_hit 90689 93819 numa_huge_pte_updates 0 0 numa_interleave 1579 882 numa_local 89634 93496 numa_other 1055 323 numa_pages_migrated 92 169 numa_pte_updates 12109 9217 perf stats 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After cs 2,170,481 2,152,072 migrations 10,126 10,704 faults 160,962 164,376 cache-misses 10,834,845 3,818,437 sched:sched_move_numa 10 16 sched:sched_stick_numa 0 0 sched:sched_swap_numa 0 7 migrate:mm_migrate_pages 2 199 vmstat 8th warehouse Single JVM 2 Socket - 2 Node Power9 - PowerNV Event Before After numa_hint_faults 403 2248 numa_hint_faults_local 358 1666 numa_hit 25898 25704 numa_huge_pte_updates 0 0 numa_interleave 207 200 numa_local 25860 25679 numa_other 38 25 numa_pages_migrated 2 197 numa_pte_updates 400 2234 perf stats 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 110,339,633 93,330,595 migrations 4,139,812 4,122,061 faults 863,622 865,979 cache-misses 231,838,045,660 225,395,083,479 sched:sched_move_numa 2,196 2,372 sched:sched_stick_numa 33 24 sched:sched_swap_numa 544 769 migrate:mm_migrate_pages 2,469 1,677 vmstat 8th warehouse Multi JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 85748 91638 numa_hint_faults_local 66831 78096 numa_hit 242213 242225 numa_huge_pte_updates 0 0 numa_interleave 0 2 numa_local 242211 242219 numa_other 2 6 numa_pages_migrated 2376 1515 numa_pte_updates 86233 92274 perf stats 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After cs 59,331,057 51,487,271 migrations 552,019 537,170 faults 266,586 256,921 cache-misses 73,796,312,990 70,073,831,187 sched:sched_move_numa 981 576 sched:sched_stick_numa 54 24 sched:sched_swap_numa 286 327 migrate:mm_migrate_pages 713 726 vmstat 8th warehouse Single JVM 4 Socket - 4 Node Power7 - PowerVM Event Before After numa_hint_faults 14807 12000 numa_hint_faults_local 5738 5024 numa_hit 36230 36470 numa_huge_pte_updates 0 0 numa_interleave 0 0 numa_local 36228 36465 numa_other 2 5 numa_pages_migrated 703 726 numa_pte_updates 14742 11930 Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Jirka Hladky Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537552141-27815-7-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3529bf61826b..25c7c7e09cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1567,6 +1567,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, return (imb > old_imb); } +/* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1600,7 +1607,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1643,15 +1650,21 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } + /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + /* * In the overloaded case, try and keep the load balanced. */ -- cgit v1.2.3 From 37355bdc5a129899f6b245900a8eb944a092f7fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 1 Oct 2018 11:05:25 +0100 Subject: sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task Automatic NUMA Balancing uses a multi-stage pass to decide whether a page should migrate to a local node. This filter avoids excessive ping-ponging if a page is shared or used by threads that migrate cross-node frequently. Threads inherit both page tables and the preferred node ID from the parent. This means that threads can trigger hinting faults earlier than a new task which delays scanning for a number of seconds. As it can be load balanced very early in its lifetime there can be an unnecessary delay before it starts migrating thread-local data. This patch migrates private pages faster early in the lifetime of a thread using the sequence counter as an identifier of new tasks. With this patch applied, STREAM performance is the same as 4.17 even though processes are not spread cross-node prematurely. Other workloads showed a mix of minor gains and losses. This is somewhat expected most workloads are not very sensitive to the starting conditions of a process. 4.19.0-rc5 4.19.0-rc5 4.17.0 numab-v1r1 fastmigrate-v1r1 vanilla MB/sec copy 43298.52 ( 0.00%) 47335.46 ( 9.32%) 47219.24 ( 9.06%) MB/sec scale 30115.06 ( 0.00%) 32568.12 ( 8.15%) 32527.56 ( 8.01%) MB/sec add 32825.12 ( 0.00%) 36078.94 ( 9.91%) 35928.02 ( 9.45%) MB/sec triad 32549.52 ( 0.00%) 35935.94 ( 10.40%) 35969.88 ( 10.51%) Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: Jirka Hladky Cc: Linus Torvalds Cc: Linux-MM Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181001100525.29789-3-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25c7c7e09cbd..7fc4a371bdd2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; -- cgit v1.2.3 From b0584ea66d73919cbf5878a3420a837f06ab8396 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 2 Oct 2018 02:41:53 +0000 Subject: bpf: don't accept cgroup local storage with zero value size Explicitly forbid creating cgroup local storage maps with zero value size, as it makes no sense and might even cause a panic. Reported-by: syzbot+18628320d3b14a5c459c@syzkaller.appspotmail.com Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/local_storage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 94126cbffc88..830d7f095748 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -195,6 +195,9 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); + if (attr->value_size == 0) + return ERR_PTR(-EINVAL); + if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); -- cgit v1.2.3 From e4a02ed2aaf447fa849e3254bfdb3b9b01e1e520 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 2 Oct 2018 14:48:49 -0700 Subject: locking/ww_mutex: Fix runtime warning in the WW mutex selftest If CONFIG_WW_MUTEX_SELFTEST=y is enabled, booting an image in an arm64 virtual machine results in the following traceback if 8 CPUs are enabled: DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current) WARNING: CPU: 2 PID: 537 at kernel/locking/mutex.c:1033 __mutex_unlock_slowpath+0x1a8/0x2e0 ... Call trace: __mutex_unlock_slowpath() ww_mutex_unlock() test_cycle_work() process_one_work() worker_thread() kthread() ret_from_fork() If requesting b_mutex fails with -EDEADLK, the error variable is reassigned to the return value from calling ww_mutex_lock on a_mutex again. If this call fails, a_mutex is not locked. It is, however, unconditionally unlocked subsequently, causing the reported warning. Fix the problem by using two error variables. With this change, the selftest still fails as follows: cyclic deadlock not resolved, ret[7/8] = -35 However, the traceback is gone. Signed-off-by: Guenter Roeck Cc: Chris Wilson Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: d1b42b800e5d0 ("locking/ww_mutex: Add kselftests for resolving ww_mutex cyclic deadlocks") Link: http://lkml.kernel.org/r/1538516929-9734-1-git-send-email-linux@roeck-us.net Signed-off-by: Ingo Molnar --- kernel/locking/test-ww_mutex.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 0be047dbd897..65a3b7e55b9f 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -260,7 +260,7 @@ static void test_cycle_work(struct work_struct *work) { struct test_cycle *cycle = container_of(work, typeof(*cycle), work); struct ww_acquire_ctx ctx; - int err; + int err, erra = 0; ww_acquire_init(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); @@ -270,17 +270,19 @@ static void test_cycle_work(struct work_struct *work) err = ww_mutex_lock(cycle->b_mutex, &ctx); if (err == -EDEADLK) { + err = 0; ww_mutex_unlock(&cycle->a_mutex); ww_mutex_lock_slow(cycle->b_mutex, &ctx); - err = ww_mutex_lock(&cycle->a_mutex, &ctx); + erra = ww_mutex_lock(&cycle->a_mutex, &ctx); } if (!err) ww_mutex_unlock(cycle->b_mutex); - ww_mutex_unlock(&cycle->a_mutex); + if (!erra) + ww_mutex_unlock(&cycle->a_mutex); ww_acquire_fini(&ctx); - cycle->result = err; + cycle->result = err ?: erra; } static int __test_cycle(unsigned int nthreads) -- cgit v1.2.3 From b799207e1e1816b09e7a5920fbb2d5fcf6edd681 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 5 Oct 2018 18:17:59 +0200 Subject: bpf: 32-bit RSH verification must truncate input before the ALU op When I wrote commit 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification"), I assumed that, in order to emulate 64-bit arithmetic with 32-bit logic, it is sufficient to just truncate the output to 32 bits; and so I just moved the register size coercion that used to be at the start of the function to the end of the function. That assumption is true for almost every op, but not for 32-bit right shifts, because those can propagate information towards the least significant bit. Fix it by always truncating inputs for 32-bit ops to 32 bits. Also get rid of the coerce_reg_to_size() after the ALU op, since that has no effect. Fixes: 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification") Acked-by: Daniel Borkmann Signed-off-by: Jann Horn Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb07e74b34a2..465952a8e465 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2896,6 +2896,15 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + if (insn_bitness == 32) { + /* Relevant for 32-bit RSH: Information can propagate towards + * LSB, so it isn't sufficient to only truncate the output to + * 32 bits. + */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -3131,7 +3140,6 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->32 */ coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); } __reg_deduce_bounds(dst_reg); -- cgit v1.2.3