From a2775bbc1d58ce630517dfe86090c166f27d719f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 12 Mar 2019 21:21:26 +0100 Subject: kernel/workqueue: Use __printf markup to silence compiler in function 'alloc_workqueue' Silence warnings (triggered at W=1) by adding relevant __printf attributes. kernel/workqueue.c:4249:2: warning: function 'alloc_workqueue' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format] Signed-off-by: Mathieu Malaterre Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4026d1871407..56b7cf898f10 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4208,6 +4208,7 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } +__printf(1, 4) struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) -- cgit v1.2.3 From 95e0b46fcebd7dbf6850dee96046e4c4ddc7f69c Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 7 Mar 2019 09:16:24 +0800 Subject: audit: fix a memleak caused by auditing load module module.name will be allocated unconditionally when auditing load module, and audit_log_start() can fail with other reasons, or audit_log_exit maybe not called, caused module.name is not freed so free module.name in audit_free_context and __audit_syscall_exit unreferenced object 0xffff88af90837d20 (size 8): comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s) hex dump (first 8 bytes): 69 78 67 62 65 00 ff ff ixgbe... backtrace: [<0000000008da28fe>] __audit_log_kern_module+0x33/0x80 [<00000000c1491e61>] load_module+0x64f/0x3850 [<000000007fc9ae3f>] __do_sys_init_module+0x218/0x250 [<0000000000d4a478>] do_syscall_64+0x117/0x400 [<000000004924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<000000007dc331dd>] 0xffffffffffffffff Fixes: ca86cad7380e3 ("audit: log module name on init_module") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing [PM: manual merge fixup in __audit_syscall_exit()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..fa7b8047aab8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -840,6 +840,13 @@ static inline void audit_proctitle_free(struct audit_context *context) context->proctitle.len = 0; } +static inline void audit_free_module(struct audit_context *context) +{ + if (context->type == AUDIT_KERN_MODULE) { + kfree(context->module.name); + context->module.name = NULL; + } +} static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -923,6 +930,7 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); free_tree_refs(context); @@ -1266,7 +1274,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); } else audit_log_format(ab, "(null)"); @@ -1697,6 +1704,7 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); -- cgit v1.2.3 From 8194fe94ab08f56ea55653df924647d23873d18c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Mar 2019 10:45:09 -0700 Subject: kernel/workqueue: Document wq_worker_last_func() argument This patch avoids that the following warning is reported when building with W=1: kernel/workqueue.c:938: warning: Function parameter or member 'task' not described in 'wq_worker_last_func' Signed-off-by: Bart Van Assche Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 56b7cf898f10..21721faa923c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -913,6 +913,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) /** * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. -- cgit v1.2.3 From 73e65b88feb919f95bdb77c4ed35f69588cf27ee Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 19 Mar 2019 15:23:29 -0400 Subject: audit: connect LOGIN record to its syscall record Currently the AUDIT_LOGIN event is a standalone record that isn't connected to any other records that may be part of its syscall event. To avoid the confusion of generating two events, connect the records by using its syscall context. Please see the github issue https://github.com/linux-audit/audit-kernel/issues/110 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c89ea48c70a6..b96bf69183f4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2220,7 +2220,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; -- cgit v1.2.3 From 2efa48fec0c344a6ca1bba66b15d63d38cf20199 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 20 Mar 2019 21:59:22 +0800 Subject: audit: Make audit_log_cap and audit_copy_inode static Fix sparse warning: kernel/auditsc.c:1150:6: warning: symbol 'audit_log_cap' was not declared. Should it be static? kernel/auditsc.c:1908:6: warning: symbol 'audit_copy_inode' was not declared. Should it be static? Signed-off-by: YueHaibing Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fa7b8047aab8..17b0007fafc2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1147,7 +1147,8 @@ out: kfree(buf_head); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +static void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap) { int i; @@ -1905,8 +1906,9 @@ static inline int audit_copy_fcaps(struct audit_names *name, } /* Copy inode data into an audit_names. */ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode, unsigned int flags) +static void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; -- cgit v1.2.3 From 16add411645cff83360086e102daa67b25f1e39a Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 18 Mar 2019 02:30:18 +0300 Subject: syscall_get_arch: add "struct task_struct *" argument This argument is required to extend the generic ptrace API with PTRACE_GET_SYSCALL_INFO request: syscall_get_arch() is going to be called from ptrace_request() along with syscall_get_nr(), syscall_get_arguments(), syscall_get_error(), and syscall_get_return_value() functions with a tracee as their argument. The primary intent is that the triple (audit_arch, syscall_nr, arg1..arg6) should describe what system call is being called and what its arguments are. Reverts: 5e937a9ae913 ("syscall_get_arch: remove useless function arguments") Reverts: 1002d94d3076 ("syscall.h: fix doc text for syscall_get_arch()") Reviewed-by: Andy Lutomirski # for x86 Reviewed-by: Palmer Dabbelt Acked-by: Paul Moore Acked-by: Paul Burton # MIPS parts Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook # seccomp parts Acked-by: Mark Salter # for the c6x bit Cc: Elvira Khabirova Cc: Eugene Syromyatnikov Cc: Oleg Nesterov Cc: x86@kernel.org Cc: linux-alpha@vger.kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: uclinux-h8-devel@lists.sourceforge.jp Cc: linux-hexagon@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-m68k@lists.linux-m68k.org Cc: linux-mips@vger.kernel.org Cc: nios2-dev@lists.rocketboards.org Cc: openrisc@lists.librecores.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-riscv@lists.infradead.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: linux-xtensa@linux-xtensa.org Cc: linux-arch@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Dmitry V. Levin Signed-off-by: Paul Moore --- kernel/auditsc.c | 4 ++-- kernel/seccomp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17b0007fafc2..98a98e6dca05 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1636,7 +1636,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -2590,7 +2590,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54a0347ca812..36f36ab00f48 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,7 +148,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); + sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, 0, 6, args); sd->args[0] = args[0]; sd->args[1] = args[1]; @@ -591,7 +591,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } -- cgit v1.2.3 From 0f3adc288df8ba2ac2fea0a8e4890f9fb4cd075d Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:53:59 +0800 Subject: bpf: track references based on is_acquire_func So far, the verifier only acquires reference tracking state for RET_PTR_TO_SOCKET_OR_NULL. Instead of extending this for every new return type which desires these semantics, acquire reference tracking state iff the called helper is an acquire function. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 86f9cd5d1c4e..868a82ad5597 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3147,19 +3147,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - if (is_acquire_function(func_id)) { - int id = acquire_reference_state(env, insn_idx); - - if (id < 0) - return id; - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; - } else { - /* For mark_ptr_or_null_reg() */ - regs[BPF_REG_0].id = ++env->id_gen; - } + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; @@ -3170,9 +3158,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } - if (is_ptr_cast_function(func_id)) + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } do_refine_retval_range(regs, fn->ret_type, func_id, &meta); -- cgit v1.2.3 From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 868a82ad5597..a476e13201d6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3148,6 +3148,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; -- cgit v1.2.3 From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a476e13201d6..dffeec3706ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -369,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id) static bool is_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || - func_id == BPF_FUNC_sk_lookup_udp; + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; } static bool is_ptr_cast_function(enum bpf_func_id func_id) -- cgit v1.2.3 From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..1b942a7caf26 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -650,16 +650,37 @@ static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, + /* policy enforced later */ + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, + /* policy enforced later */ + .flags = GENL_CMD_CAP_HASPOL, }, }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + const struct nla_policy *policy = NULL; + + switch (ops->cmd) { + case TASKSTATS_CMD_GET: + policy = taskstats_cmd_get_policy; + break; + case CGROUPSTATS_CMD_GET: + policy = cgroupstats_cmd_get_policy; + break; + default: + return -EINVAL; + } + + return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, + policy, info->extack); +} + static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, @@ -667,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ -- cgit v1.2.3 From 8db5da0b8618df79eceea99672e205d4a2a6309e Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Sun, 27 Jan 2019 19:03:45 -0500 Subject: x86/ima: require signed kernel modules Have the IMA architecture specific policy require signed kernel modules on systems with secure boot mode enabled; and coordinate the different signature verification methods, so only one signature is required. Requiring appended kernel module signatures may be configured, enabled on the boot command line, or with this patch enabled in secure boot mode. This patch defines set_module_sig_enforced(). To coordinate between appended kernel module signatures and IMA signatures, only define an IMA MODULE_CHECK policy rule if CONFIG_MODULE_SIG is not enabled. A custom IMA policy may still define and require an IMA signature. Signed-off-by: Mimi Zohar Reviewed-by: Luis Chamberlain Acked-by: Jessica Yu --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..985caa467aef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -286,6 +286,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); -- cgit v1.2.3 From 1c7651f43777cdd59c1aaa82c87324d3e7438c7b Mon Sep 17 00:00:00 2001 From: Eugene Loh Date: Mon, 25 Feb 2019 11:59:58 -0800 Subject: kallsyms: store type information in its own array When a module is loaded, its symbols' Elf_Sym information is stored in a symtab. Further, type information is also captured. Since Elf_Sym has no type field, historically the st_info field has been hijacked for storing type: st_info was overwritten. commit 5439c985c5a83a8419f762115afdf560ab72a452 ("module: Overwrite st_size instead of st_info") changes that practice, as its one-liner indicates. Unfortunately, this change overwrites symbol size, information that a tool like DTrace expects to find. Allocate a typetab array to store type information so that no Elf_Sym field needs to be overwritten. Fixes: 5439c985c5a8 ("module: Overwrite st_size instead of st_info") Signed-off-by: Eugene Loh Reviewed-by: Nick Alcock [jeyu: renamed typeoff -> typeoffs ] Signed-off-by: Jessica Yu --- kernel/module-internal.h | 2 +- kernel/module.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..d354341f8cc0 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -20,7 +20,7 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long symoffs, stroffs; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; struct _ddebug *debug; unsigned int num_debug; bool sig_ok; diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..69e52e82242a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2647,6 +2647,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ @@ -2660,6 +2662,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_layout.size; mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); mod->init_layout.size = debug_align(mod->init_layout.size); } @@ -2683,20 +2687,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_size - = elf_type(&mod->kallsyms->symtab[i], info); - - /* Now populate the cut down core kallsyms for after init. */ + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. + */ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -4080,7 +4087,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, const Elf_Sym *sym = &kallsyms->symtab[symnum]; *value = kallsyms_symbol_value(sym); - *type = sym->st_size; + *type = kallsyms->typetab[symnum]; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); -- cgit v1.2.3 From 2011fccfb61bbd1d7c8864b2b3ed7012342e9ba3 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Mar 2019 18:01:57 -0700 Subject: bpf: Support variable offset stack access from helpers Currently there is a difference in how verifier checks memory access for helper arguments for PTR_TO_MAP_VALUE and PTR_TO_STACK with regard to variable part of offset. check_map_access, that is used for PTR_TO_MAP_VALUE, can handle variable offsets just fine, so that BPF program can call a helper like this: some_helper(map_value_ptr + off, size); , where offset is unknown at load time, but is checked by program to be in a safe rage (off >= 0 && off + size < map_value_size). But it's not the case for check_stack_boundary, that is used for PTR_TO_STACK, and same code with pointer to stack is rejected by verifier: some_helper(stack_value_ptr + off, size); For example: 0: (7a) *(u64 *)(r10 -16) = 0 1: (7a) *(u64 *)(r10 -8) = 0 2: (61) r2 = *(u32 *)(r1 +0) 3: (57) r2 &= 4 4: (17) r2 -= 16 5: (0f) r2 += r10 6: (18) r1 = 0xffff888111343a80 8: (85) call bpf_map_lookup_elem#1 invalid variable stack read R2 var_off=(0xfffffffffffffff0; 0x4) Add support for variable offset access to check_stack_boundary so that if offset is checked by program to be in a safe range it's accepted by verifier. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 75 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2fe89138309a..87221fda1321 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2157,6 +2157,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1, true); } +static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, + int off, int access_size, + bool zero_size_allowed) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", + regno, tn_buf, access_size); + } + return -EACCES; + } + return 0; +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. @@ -2169,7 +2192,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int off, i, slot, spi; + int err, min_off, max_off, i, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2183,21 +2206,23 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", - regno, tn_buf); - return -EACCES; - } - off = reg->off + reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - return -EACCES; + if (tnum_is_const(reg->var_off)) { + min_off = max_off = reg->var_off.value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + } else { + min_off = reg->smin_value + reg->off; + max_off = reg->umax_value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + err = __check_stack_boundary(env, regno, max_off, access_size, + zero_size_allowed); + if (err) + return err; } if (meta && meta->raw_mode) { @@ -2206,10 +2231,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } - for (i = 0; i < access_size; i++) { + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; - slot = -(off + i) - 1; + slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) goto err; @@ -2222,8 +2247,16 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; } err: - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + min_off, i - min_off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", + tn_buf, i - min_off, access_size); + } return -EACCES; mark: /* reading any byte out of 8-byte 'spill_slot' will cause @@ -2232,7 +2265,7 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent); } - return update_stack_depth(env, state, off); + return update_stack_depth(env, state, min_off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, -- cgit v1.2.3 From 40ed29b373381532ef222e509c5aa69a1d8561ea Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 23 Sep 2018 12:11:33 +0000 Subject: ring-buffer: Fix ring buffer size in rb_write_something() 'cnt' should be used to calculate ring buffer size rather than data->cnt Link: http://lkml.kernel.org/r/1537704693-184237-1-git-send-email-yuehaibing@huawei.com Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4f33d7d841af 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) cnt = data->cnt + (nested ? 27 : 0); /* Multiply cnt by ~e, to make some unique increment */ - size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); -- cgit v1.2.3 From f45d1225adb0479478cee989e2ae2d7d2c62b31b Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Mar 2019 11:28:51 -0700 Subject: tracing: Kernel access to Ftrace instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ftrace provides the feature “instances” that provides the capability to create multiple Ftrace ring buffers. However, currently these buffers are created/accessed via userspace only. The kernel APIs providing these features are not exported, hence cannot be used by other kernel components. This patch aims to extend this infrastructure to provide the flexibility to create/log/remove/ enable-disable existing trace events to these buffers from within the kernel. Link: http://lkml.kernel.org/r/1553106531-3281-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Joe Jin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 74 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace_events.c | 1 + 2 files changed, 51 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21153e64bf1c..4384fcc386c8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3053,6 +3053,7 @@ void trace_printk_init_buffers(void) if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { @@ -3213,6 +3214,7 @@ int trace_array_printk(struct trace_array *tr, va_end(ap); return ret; } +EXPORT_SYMBOL_GPL(trace_array_printk); __printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, @@ -8037,7 +8039,7 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -static int instance_mkdir(const char *name) +struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; @@ -8101,7 +8103,7 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return tr; out_free_tr: free_trace_buffers(tr); @@ -8113,33 +8115,21 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return ret; + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(trace_array_create); +static int instance_mkdir(const char *name) +{ + return PTR_ERR_OR_ZERO(trace_array_create(name)); } -static int instance_rmdir(const char *name) +static int __remove_instance(struct trace_array *tr) { - struct trace_array *tr; - int found = 0; - int ret; int i; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -ENODEV; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) { - found = 1; - break; - } - } - if (!found) - goto out_unlock; - - ret = -EBUSY; if (tr->ref || (tr->current_trace && tr->current_trace->ref)) - goto out_unlock; + return -EBUSY; list_del(&tr->list); @@ -8165,10 +8155,46 @@ static int instance_rmdir(const char *name) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); + tr = NULL; - ret = 0; + return 0; +} + +int trace_array_destroy(struct trace_array *tr) +{ + int ret; + + if (!tr) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = __remove_instance(tr); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + ret = __remove_instance(tr); + break; + } + } - out_unlock: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..81c038ed6cee 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } +EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event -- cgit v1.2.3 From 8a062902be725f647dc8da532b04d836546a369a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:15 -0500 Subject: tracing: Add tracing error log Introduce a new ftrace file, tracing/error_log, for ftrace commands to log errors. This is useful for allowing more complex commands such as hist trigger and kprobe_event commands to point out specifically where something may have gone wrong without forcing them to resort to more ad hoc methods such as tacking error messages onto existing output files. To log a tracing error, call the event_log_err() function, passing it a location string describing where it came from e.g. kprobe_events or system:event, the command that caused the error, an array of static error strings describing errors and an index within that array which describes the specific error, along with the position to place the error caret. Reading the log displays the last (currently) 8 errors logged in the following format: [timestamp] : error: Command: ^ Memory for the error log isn't allocated unless there has been a trace event error, and the error log can be cleared and have its memory freed by writing the empty string in truncation mode to it: # echo > tracing/error_log. Link: http://lkml.kernel.org/r/0c2c82571fd38c5f3a88ca823627edff250e9416.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Suggested-by: Masami Hiramatsu Improvements-suggested-by: Steve Rostedt Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 4 + 2 files changed, 222 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4384fcc386c8..7978168f5041 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6878,6 +6878,221 @@ static const struct file_operations snapshot_raw_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ +#define TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ +}; + +static LIST_HEAD(tracing_err_log); +static DEFINE_MUTEX(tracing_err_log_lock); + +static unsigned int n_tracing_err_log_entries; + +struct tracing_log_err *get_tracing_log_err(void) +{ + struct tracing_log_err *err; + + if (n_tracing_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + err = ERR_PTR(-ENOMEM); + n_tracing_err_log_entries++; + + return err; + } + + err = list_first_entry(&tracing_err_log, struct tracing_log_err, list); + list_del(&err->list); + + return err; +} + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurence of @str within @cmd. The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * : error: + * Command: + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). + */ +void tracing_log_err(const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos) +{ + struct tracing_log_err *err; + + mutex_lock(&tracing_err_log_lock); + err = get_tracing_log_err(); + if (PTR_ERR(err) == -ENOMEM) { + mutex_unlock(&tracing_err_log_lock); + return; + } + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tracing_err_log); + mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(void) +{ + struct tracing_log_err *err, *next; + + mutex_lock(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tracing_err_log, list) { + list_del(&err->list); + kfree(err); + } + + n_tracing_err_log_entries = 0; + mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tracing_err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tracing_err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +{ + u8 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &tracing_err_log_seq_ops); + + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = seq_lseek, +}; + static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -8284,6 +8499,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &snapshot_fops); #endif + trace_create_file("error_log", 0644, d_tracer, + tr, &tracing_err_log_fops); + for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..b711edbef7e7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1884,6 +1884,10 @@ extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char**)); +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats -- cgit v1.2.3 From a1a05bb40e229d148c071fcd2ed787b21f61ff8b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:16 -0500 Subject: tracing: Save the last hist command's associated event name In preparation for making use of the new trace error log, save the subsystem and event name associated with the last hist command - it will be passed as the location param in the event_log_err() calls. Link: http://lkml.kernel.org/r/eb0fd1362be8f39facb86c83eecf441b7a5876f8.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 795aa2038377..0de702bf148f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -535,15 +535,34 @@ static struct track_data *track_data_alloc(unsigned int key_len, return data; } -static char last_hist_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; + static char hist_err_str[MAX_FILTER_STR_VAL]; -static void last_cmd_set(char *str) +static void last_cmd_set(struct trace_event_file *file, char *str) { + const char *system = NULL, *name = NULL; + struct trace_event_call *call; + if (!str) return; - strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + + if (file) { + call = file->event_call; + + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } + } + + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } static void hist_err(char *str, char *var) @@ -583,6 +602,8 @@ static void hist_err_event(char *str, char *system, char *event, char *var) static void hist_err_clear(void) { hist_err_str[0] = '\0'; + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } static bool have_hist_err(void) @@ -5438,8 +5459,8 @@ static int hist_show(struct seq_file *m, void *v) } if (have_hist_err()) { - seq_printf(m, "\nERROR: %s\n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_hist_cmd); + seq_printf(m, "\n%s: error: \n", hist_err_str); + seq_printf(m, " Last command: %s\n", last_cmd); } out_unlock: @@ -6043,8 +6064,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); if (glob && strlen(glob)) { - last_cmd_set(param); hist_err_clear(); + last_cmd_set(file, param); } if (!param) -- cgit v1.2.3 From d566c5e9d1bad6773fe9cce3d4514cca2cc32e4e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:17 -0500 Subject: tracing: Use tracing error_log with hist triggers Replace hist_err() and hist_err_event() with tracing_log_err() from the new tracing error_log mechanism. Also add a couple related helper functions and remove most of the old hist_err()-related code. With this change, users no longer read the hist files for hist trigger error information, but instead look at tracing/error_log for the same information. Link: http://lkml.kernel.org/r/c98f77a97c9715d18b623eeb5741057b330d5ac0.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 206 ++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0de702bf148f..071c62cacba7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -22,6 +22,57 @@ #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define ERRORS \ + C(NONE, "No error"), \ + C(DUPLICATE_VAR, "Variable already defined"), \ + C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ + C(TOO_MANY_VARS, "Too many variables defined"), \ + C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ + C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ + C(TRIGGER_EEXIST, "Hist trigger already exists"), \ + C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ + C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ + C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ + C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ + C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ + C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ + C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ + C(HIST_NOT_FOUND, "Matching event histogram not found"), \ + C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ + C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ + C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ + C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ + C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ + C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ + C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ + C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ + C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ + C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ + C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ + C(TOO_MANY_PARAMS, "Too many action params"), \ + C(PARAM_NOT_FOUND, "Couldn't find param"), \ + C(INVALID_PARAM, "Invalid action param"), \ + C(ACTION_NOT_FOUND, "No action found"), \ + C(NO_SAVE_PARAMS, "No params found for save()"), \ + C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ + C(ACTION_MISMATCH, "Handler doesn't support action"), \ + C(NO_CLOSING_PAREN, "No closing paren found"), \ + C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ + C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ + C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(VAR_NOT_FOUND, "Couldn't find variable"), \ + C(FIELD_NOT_FOUND, "Couldn't find field"), + +#undef C +#define C(a, b) HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -538,7 +589,10 @@ static struct track_data *track_data_alloc(unsigned int key_len, static char last_cmd[MAX_FILTER_STR_VAL]; static char last_cmd_loc[MAX_FILTER_STR_VAL]; -static char hist_err_str[MAX_FILTER_STR_VAL]; +static int errpos(char *str) +{ + return err_pos(last_cmd, str); +} static void last_cmd_set(struct trace_event_file *file, char *str) { @@ -565,55 +619,17 @@ static void last_cmd_set(struct trace_event_file *file, char *str) snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err(char *str, char *var) +static void hist_err(u8 err_type, u8 err_pos) { - int maxlen = MAX_FILTER_STR_VAL - 1; - - if (!str) - return; - - if (strlen(hist_err_str)) - return; - - if (!var) - var = ""; - - if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) - return; - - strcat(hist_err_str, str); - strcat(hist_err_str, var); -} - -static void hist_err_event(char *str, char *system, char *event, char *var) -{ - char err[MAX_FILTER_STR_VAL]; - - if (system && var) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); - else if (system) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); - else - strscpy(err, var, MAX_FILTER_STR_VAL); - - hist_err(str, err); + tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); } static void hist_err_clear(void) { - hist_err_str[0] = '\0'; last_cmd[0] = '\0'; last_cmd_loc[0] = '\0'; } -static bool have_hist_err(void) -{ - if (strlen(hist_err_str)) - return true; - - return false; -} - struct synth_trace_event { struct trace_entry ent; u64 fields[]; @@ -1740,7 +1756,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1791,7 +1807,7 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2023,7 +2039,6 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) attrs->n_actions++; ret = 0; } - return ret; } @@ -2083,7 +2098,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", str); + hist_err(HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2681,8 +2696,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err_event("Couldn't find variable: $", - system, event_name, var_name); + hist_err(HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2716,7 +2730,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err("Invalid field modifier: ", modifier); + hist_err(HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2732,7 +2746,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err("Couldn't find field: ", field_name); + hist_err(HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2843,7 +2857,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2926,7 +2940,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err("Timestamp units in expression don't match", NULL); + hist_err(HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2944,7 +2958,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -3182,16 +3196,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("trace action: Too many field variables defined: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("trace action: Event file not found: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3204,8 +3216,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("trace action: Matching event histogram not found: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3266,8 +3277,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't create histogram for field: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3279,8 +3289,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("trace action: Couldn't find synthetic variable: ", - subsys_name, event_name, field_name); + hist_err(HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3417,21 +3426,21 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err("Too many field variables defined: ", field_name); + hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err("Couldn't parse field variable: ", field_name); + hist_err(HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err("Couldn't create or find variable: ", field_name); + hist_err(HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3763,14 +3772,14 @@ static int track_data_create(struct hist_trigger_data *hist_data, track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); + hist_err(HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); + hist_err(HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3783,7 +3792,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onmax variable: ", "__max"); + hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3791,7 +3800,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err("Couldn't create onchange variable: ", "__change"); + hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3810,20 +3819,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err("Too many action params", ""); + hist_err(HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - hist_err("No action param found", ""); + hist_err(HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err("Invalid action param: ", param); + hist_err(HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3855,14 +3864,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err("action parsing: No action found", ""); + hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err("action parsing: No action found", ""); + hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3871,7 +3880,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err("action parsing: No params found for %s", "save"); + hist_err(HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } @@ -3885,7 +3894,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3897,7 +3906,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err("action parsing: No closing paren found: %s", params); + hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3907,7 +3916,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err("action parsing: Handler doesn't support action: ", action_name); + hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -4060,7 +4069,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err_event("trace action: Couldn't find param: $", system, event, var); + hist_err(HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4135,7 +4144,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); + hist_err(HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4196,15 +4205,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("trace action: Param type doesn't match synthetic event field type: ", - system, event_name, param); + hist_err(HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); + hist_err(HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4250,7 +4258,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err("save action: Can't have more than one save() action per hist", ""); + hist_err(HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4263,7 +4271,7 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err("save action: Couldn't create field variable: ", param); + hist_err(HIST_ERR_FIELD_VAR_CREATE_FAIL, errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4297,19 +4305,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err("onmatch: Missing closing paren: ", match_event); + hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + hist_err(HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err_event("onmatch: Invalid subsystem or event name: ", - match_event_system, match_event, NULL); + hist_err(HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4400,7 +4407,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err("Variable already defined: ", var_name); + hist_err(HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4481,7 +4488,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err("Using variable references as keys not supported: ", field_str); + hist_err(HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4595,13 +4602,13 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err("Malformed assignment: ", var_name); + hist_err(HIST_ERR_MALFORMED_ASSIGNMENT, errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", var_name); + hist_err(HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5458,11 +5465,6 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } - if (have_hist_err()) { - seq_printf(m, "\n%s: error: \n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_cmd); - } - out_unlock: mutex_unlock(&event_mutex); @@ -5834,7 +5836,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); + hist_err(HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5855,7 +5857,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err("Hist trigger already exists", NULL); + hist_err(HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5863,7 +5865,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err("Can't clear or continue a nonexistent hist trigger", NULL); + hist_err(HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5888,7 +5890,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err("Couldn't set trace_clock: ", clock); + hist_err(HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } -- cgit v1.2.3 From 34f76afaac7a437a2ce381225135563928b359dd Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:18 -0500 Subject: tracing: Use tracing error_log with trace event filters Use tracing_log_err() from the new tracing error_log mechanism to send filter parse errors to tracing/error_log. With this change, users will be able to see filter errors by looking at tracing/error_log. The same errors will also be available in the filter file, as expected. Link: http://lkml.kernel.org/r/1d942c419941539a11d78a6810fc5740a99b2974.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 05a66493a164..290d42c59101 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -66,7 +66,8 @@ static const char * ops[] = { OPS }; C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ - C(NO_FILTER, "No filter found"), + C(ERRNO, "Error"), \ + C(NO_FILTER, "No filter found") #undef C #define C(a, b) FILT_ERR_##a @@ -76,7 +77,7 @@ enum { ERRORS }; #undef C #define C(a, b) b -static char *err_text[] = { ERRORS }; +static const char *err_text[] = { ERRORS }; /* Called after a '!' character but "!=" and "!~" are not "not"s */ static bool is_not(const char *str) @@ -947,8 +948,14 @@ static void append_filter_err(struct filter_parse_error *pe, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + tracing_log_err("event filter parse error", + filter->filter_string, err_text, + pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + tracing_log_err("event filter parse error", + filter->filter_string, err_text, + FILT_ERR_ERRNO, 0); } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); -- cgit v1.2.3 From ab105a4fb89496c71c5a0f3222347c506c30feb0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 31 Mar 2019 18:48:19 -0500 Subject: tracing: Use tracing error_log with probe events Use tracing error_log with probe events for logging error more precisely. This also makes all parse error returns -EINVAL (except for -ENOMEM), because user can see better error message in error_log file now. Link: http://lkml.kernel.org/r/6a4d90e141d138040ea61f4776b991597077451e.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 77 ++++++++----- kernel/trace/trace_probe.c | 274 +++++++++++++++++++++++++++++++------------- kernel/trace/trace_probe.h | 77 ++++++++++++- kernel/trace/trace_uprobe.c | 44 ++++--- 4 files changed, 348 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5d5129b05df7..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) { + if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } return ret; } @@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk; + struct trace_kprobe *tk = NULL; int i, len, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; @@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; + trace_probe_log_init("trace_kprobe", argc, argv); + event = strchr(&argv[0][1], ':'); if (event) event++; if (isdigit(argv[0][1])) { if (!is_return) { - pr_info("Maxactive is not for kprobe"); - return -EINVAL; + trace_probe_log_err(1, MAXACT_NO_KPROBE); + goto parse_error; } if (event) len = event - &argv[0][1] - 1; else len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) - return -E2BIG; + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { - pr_info("Invalid maxactive number\n"); - return ret; + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. */ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { - pr_info("Maxactive is too big (%d > %d).\n", - maxactive, KRETPROBE_MAXACTIVE_MAX); - return -E2BIG; + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; } } /* try to parse an address. if that fails, try to read the * input as a symbol. */ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) - return -ECANCELED; + if (strchr(argv[1], '/') && strchr(argv[1], ':')) { + ret = -ECANCELED; + goto error; + } /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { - pr_info("Failed to parse either an address or a symbol.\n"); - goto out; + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { - pr_info("Given offset is not valid for return probe.\n"); - ret = -EINVAL; - goto out; + trace_probe_log_err(0, BAD_RETPROBE); + goto parse_error; } } - argc -= 2; argv += 2; + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) - goto out; + goto parse_error; } else { /* Make a new event name */ if (symbol) @@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { ret = PTR_ERR(tk); - /* This must return -ENOMEM otherwise there is a bug */ + /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; + goto out; /* We know tk is not allocated */ } + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { @@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); kfree(tmp); if (ret) - goto error; + goto error; /* This can be -ENOMEM */ } ret = register_trace_kprobe(tk); - if (ret) + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM) + trace_probe_log_err(0, FAIL_REG_PROBE); goto error; + } + out: + trace_probe_log_clear(); kfree(symbol); return ret; +parse_error: + ret = -EINVAL; error: free_trace_kprobe(tk); goto out; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f8411e7835f..e11f98c49d72 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,6 +13,11 @@ #include "trace_probe.h" +#undef C +#define C(a, b) b + +static const char *trace_probe_err_text[] = { ERRORS }; + static const char *reserved_field_names[] = { "common_type", "common_flags", @@ -133,6 +138,60 @@ fail: return NULL; } +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ + trace_probe_log.subsystem = subsystem; + trace_probe_log.argc = argc; + trace_probe_log.argv = argv; + trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ + trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ + char *command, *p; + int i, len = 0, pos = 0; + + if (!trace_probe_log.argv) + return; + + /* Recalcurate the length and allocate buffer */ + for (i = 0; i < trace_probe_log.argc; i++) { + if (i == trace_probe_log.index) + pos = len; + len += strlen(trace_probe_log.argv[i]) + 1; + } + command = kzalloc(len, GFP_KERNEL); + if (!command) + return; + + /* And make a command string from argv array */ + p = command; + for (i = 0; i < trace_probe_log.argc; i++) { + len = strlen(trace_probe_log.argv[i]); + strcpy(p, trace_probe_log.argv[i]); + p[len] = ' '; + p += len + 1; + } + *(p - 1) = '\0'; + + tracing_log_err(trace_probe_log.subsystem, command, + trace_probe_err_text, err_type, pos + offset); + + kfree(command); +} + /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) /* @buf must has MAX_EVENT_NAME_LEN size */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, - char *buf) + char *buf, int offset) { const char *slash, *event = *pevent; int len; @@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, slash = strchr(event, '/'); if (slash) { if (slash == event) { - pr_info("Group name is not specified\n"); + trace_probe_log_err(offset, NO_GROUP_NAME); return -EINVAL; } if (slash - event + 1 > MAX_EVENT_NAME_LEN) { - pr_info("Group name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, GROUP_TOO_LONG); + return -EINVAL; } strlcpy(buf, event, slash - event + 1); if (!is_good_name(buf)) { - pr_info("Group name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_GROUP_NAME); return -EINVAL; } *pgroup = buf; *pevent = slash + 1; + offset += slash - event + 1; event = *pevent; } len = strlen(event); if (len == 0) { - pr_info("Event name is not specified\n"); + trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { - pr_info("Event name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, EVENT_TOO_LONG); + return -EINVAL; } if (!is_good_name(event)) { - pr_info("Event name must follow the same rules as C identifiers\n"); + trace_probe_log_err(offset, BAD_EVENT_NAME); return -EINVAL; } return 0; @@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags) + struct fetch_insn *code, unsigned int flags, int offs) { unsigned long param; int ret = 0; int len; if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) + if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - else + } else { + trace_probe_log_err(offs, RETVAL_ON_PROBE); ret = -EINVAL; + } } else if ((len = str_has_prefix(arg, "stack"))) { if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, ¶m); - if (ret || ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK)) + if (ret) { + goto inval_var; + } else if ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_STACK_NUM); ret = -EINVAL; - else { + } else { code->op = FETCH_OP_STACK; code->param = (unsigned int)param; } } else - ret = -EINVAL; + goto inval_var; } else if (strcmp(arg, "comm") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && (len = str_has_prefix(arg, "arg"))) { - if (!isdigit(arg[len])) - return -EINVAL; ret = kstrtoul(arg + len, 10, ¶m); - if (ret || !param || param > PARAM_MAX_STACK) + if (ret) { + goto inval_var; + } else if (!param || param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_ARG_NUM); return -EINVAL; + } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif } else - ret = -EINVAL; + goto inval_var; return ret; + +inval_var: + trace_probe_log_err(offs, BAD_VAR); + return -EINVAL; } /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags) + unsigned int flags, int offs) { struct fetch_insn *code = *pcode; unsigned long param; @@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, flags); + ret = parse_probe_vars(arg + 1, type, code, flags, offs); break; case '%': /* named register */ @@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_REG; code->param = (unsigned int)ret; ret = 0; - } + } else + trace_probe_log_err(offs, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_MEM_ADDR); break; + } /* load address */ code->op = FETCH_OP_IMM; code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) + if (flags & TPARG_FL_KERNEL) { + trace_probe_log_err(offs, FILE_ON_KPROBE); return -EINVAL; - + } ret = kstrtol(arg + 2, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_FILE_OFFS); break; + } code->op = FETCH_OP_FOFFS; code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) + if (!(flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(offs, SYM_ON_UPROBE); return -EINVAL; - + } /* Preserve symbol for updating */ code->op = FETCH_NOP_SYMBOL; code->data = kstrdup(arg + 1, GFP_KERNEL); if (!code->data) return -ENOMEM; - if (++code == end) - return -E2BIG; - + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } code->op = FETCH_OP_IMM; code->immediate = 0; } /* These are fetching from memory */ - if (++code == end) - return -E2BIG; + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } *pcode = code; code->op = FETCH_OP_DEREF; code->offset = offset; @@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, /* fall through */ case '-': tmp = strchr(arg, '('); - if (!tmp) + if (!tmp) { + trace_probe_log_err(offs, DEREF_NEED_BRACE); return -EINVAL; - + } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_DEREF_OFFS); break; - + } + offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); - - if (tmp) { + if (!tmp) { + trace_probe_log_err(offs + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } else { const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags); + ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) + if (code->op == FETCH_OP_COMM) { + trace_probe_log_err(offs, COMM_CANT_DEREF); + return -EINVAL; + } + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); return -EINVAL; - if (++code == end) - return -E2BIG; + } *pcode = code; code->op = FETCH_OP_DEREF; @@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ + trace_probe_log_err(offs, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf, return -EINVAL; code++; if (code->op != FETCH_OP_NOP) - return -E2BIG; + return -EINVAL; *pcode = code; code->op = FETCH_OP_MOD_BF; @@ -392,32 +484,53 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags) + struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2; + char *t, *t2, *t3; int ret, len; - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; + len = strlen(arg); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(offset, ARG_TOO_LONG); + return -EINVAL; + } else if (len == 0) { + trace_probe_log_err(offset, NO_ARG_BODY); + return -EINVAL; } + parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); + if (!parg->comm) return -ENOMEM; - } + t = strchr(arg, ':'); if (t) { *t = '\0'; t2 = strchr(++t, '['); if (t2) { - *t2 = '\0'; - parg->count = simple_strtoul(t2 + 1, &t2, 0); - if (strcmp(t2, "]") || parg->count == 0) + *t2++ = '\0'; + t3 = strchr(t2, ']'); + if (!t3) { + offset += t2 + strlen(t2) - arg; + trace_probe_log_err(offset, + ARRAY_NO_CLOSE); + return -EINVAL; + } else if (t3[1] != '\0') { + trace_probe_log_err(offset + t3 + 1 - arg, + BAD_ARRAY_SUFFIX); + return -EINVAL; + } + *t3 = '\0'; + if (kstrtouint(t2, 0, &parg->count) || !parg->count) { + trace_probe_log_err(offset + t2 - arg, + BAD_ARRAY_NUM); return -EINVAL; - if (parg->count > MAX_ARRAY_LEN) - return -E2BIG; + } + if (parg->count > MAX_ARRAY_LEN) { + trace_probe_log_err(offset + t2 - arg, + ARRAY_TOO_BIG); + return -EINVAL; + } } } /* @@ -429,7 +542,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, else parg->type = find_fetch_type(t); if (!parg->type) { - pr_info("Unsupported type: %s\n", t); + trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); return -EINVAL; } parg->offset = *size; @@ -450,7 +563,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags); + flags, offset); if (ret) goto fail; @@ -458,7 +571,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { - pr_info("string only accepts memory or address.\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } @@ -470,7 +584,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } } @@ -483,7 +598,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -493,20 +609,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Modify operation */ if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); - if (ret) + if (ret) { + trace_probe_log_err(offset + t - arg, BAD_BITFIELD); goto fail; + } } /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING) { - pr_info("array only accepts memory or address\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -555,15 +675,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, { struct probe_arg *parg = &tp->args[i]; char *body; - int ret; /* Increment count for freeing args in error case */ tp->nr_args++; body = strchr(arg, '='); if (body) { - if (body - arg > MAX_ARG_NAME_LEN || body == arg) + if (body - arg > MAX_ARG_NAME_LEN) { + trace_probe_log_err(0, ARG_NAME_TOO_LONG); + return -EINVAL; + } else if (body == arg) { + trace_probe_log_err(0, NO_ARG_NAME); return -EINVAL; + } parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { @@ -575,22 +699,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, return -ENOMEM; if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); + trace_probe_log_err(0, BAD_ARG_NAME); return -EINVAL; } - if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { - pr_info("Argument[%d]: '%s' conflicts with another field.\n", - i, parg->name); + trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); - if (ret) - pr_info("Parse error at argument[%d]. (%d)\n", i, ret); - return ret; + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, + body - arg); } void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2177c206de15..b7737666c1a8 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -280,8 +280,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int traceprobe_parse_event_name(const char **pevent, - const char **pgroup, char *buf); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); @@ -298,3 +298,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS \ + C(FILE_NOT_FOUND, "Failed to find the given file"), \ + C(NO_REGULAR_FILE, "Not a regular file"), \ + C(BAD_REFCNT, "Invalid reference counter offset"), \ + C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ + C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ + C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ + C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT, "Invalid maxactive number"), \ + C(MAXACT_TOO_BIG, "Maxactive is too big"), \ + C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_GROUP_NAME, "Group name is not specified"), \ + C(GROUP_TOO_LONG, "Group name is too long"), \ + C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ + C(NO_EVENT_NAME, "Event name is not specified"), \ + C(EVENT_TOO_LONG, "Event name is too long"), \ + C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(BAD_STACK_NUM, "Invalid stack number"), \ + C(BAD_ARG_NUM, "Invalid argument number"), \ + C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_REG_NAME, "Invalid register name"), \ + C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ + C(BAD_FILE_OFFS, "Invalid file offset value"), \ + C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ + C(TOO_MANY_OPS, "Dereference is too much nested"), \ + C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ + C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ + C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ + C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ + C(BAD_FETCH_ARG, "Invalid fetch argument"), \ + C(ARRAY_NO_CLOSE, "Array is not closed"), \ + C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ + C(BAD_ARRAY_NUM, "Invalid array size"), \ + C(ARRAY_TOO_BIG, "Array number is too big"), \ + C(BAD_TYPE, "Unknown type is specified"), \ + C(BAD_STRING, "String accepts only memory argument"), \ + C(BAD_BITFIELD, "Invalid bitfield"), \ + C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ + C(NO_ARG_NAME, "Argument name is not specified"), \ + C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ + C(USED_ARG_NAME, "This argument name is already used"), \ + C(ARG_TOO_LONG, "Argument expression is too long"), \ + C(NO_ARG_BODY, "No argument expression"), \ + C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ + C(FAIL_REG_PROBE, "Failed to register probe event"), + +#undef C +#define C(a, b) TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { + const char *subsystem; + const char **argv; + int argc; + int index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err) \ + __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index be78d99ee6bc..cd8750a72768 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -457,13 +457,19 @@ static int trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } + trace_probe_log_init("trace_uprobe", argc, argv); + trace_probe_log_set_index(1); /* filename is the 2nd argument */ + *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { + trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); + trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { + trace_probe_log_err(0, NO_REGULAR_FILE); ret = -EINVAL; goto fail_address_parse; } @@ -472,9 +478,16 @@ static int trace_uprobe_create(int argc, const char **argv) rctr = strchr(arg, '('); if (rctr) { rctr_end = strchr(rctr, ')'); - if (rctr > rctr_end || *(rctr_end + 1) != 0) { + if (!rctr_end) { + ret = -EINVAL; + rctr_end = rctr + strlen(rctr); + trace_probe_log_err(rctr_end - filename, + REFCNT_OPEN_BRACE); + goto fail_address_parse; + } else if (rctr_end[1] != '\0') { ret = -EINVAL; - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr_end + 1 - filename, + BAD_REFCNT_SUFFIX); goto fail_address_parse; } @@ -482,22 +495,23 @@ static int trace_uprobe_create(int argc, const char **argv) *rctr_end = '\0'; ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr - filename, BAD_REFCNT); goto fail_address_parse; } } /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); goto fail_address_parse; - - argc -= 2; - argv += 2; + } /* setup a probe */ + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) goto fail_address_parse; } else { @@ -519,6 +533,9 @@ static int trace_uprobe_create(int argc, const char **argv) kfree(tail); } + argc -= 2; + argv += 2; + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { ret = PTR_ERR(tu); @@ -539,6 +556,7 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? TPARG_FL_RETURN : 0); kfree(tmp); @@ -547,20 +565,20 @@ static int trace_uprobe_create(int argc, const char **argv) } ret = register_trace_uprobe(tu); - if (ret) - goto error; - return 0; + if (!ret) + goto out; error: free_trace_uprobe(tu); +out: + trace_probe_log_clear(); return ret; fail_address_parse: + trace_probe_log_clear(); path_put(&path); kfree(filename); - pr_info("Failed to parse address or file.\n"); - return ret; } -- cgit v1.2.3 From 06ee7115b0d1742de745ad143fb5e06d77d27fba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: bpf: add verifier stats and log_level bit 2 In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87221fda1321..e2001c1e40b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1092,7 +1092,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1139,6 +1139,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1155,7 +1156,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } @@ -1455,7 +1460,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -2938,7 +2943,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -2978,7 +2983,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -5001,7 +5006,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -6181,6 +6186,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) states_cnt++; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return 0; @@ -6194,6 +6202,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; /* add new state to the head of linked list */ new = &new_sl->state; @@ -6278,8 +6288,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; env->prev_linfo = NULL; @@ -6314,10 +6323,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -6326,7 +6335,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -6344,8 +6353,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -6356,7 +6366,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -6621,16 +6631,6 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -7854,9 +7854,34 @@ static void free_states(struct bpf_verifier_env *env) kfree(env->explored_states); } +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; int i, len, ret = -EINVAL; @@ -7899,7 +7924,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -7980,6 +8005,9 @@ skip_full_check: if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { -- cgit v1.2.3 From 9f4686c41bdff051f557accb531af79dd1773687 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: bpf: improve verification speed by droping states Branch instructions, branch targets and calls in a bpf program are the places where the verifier remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. For unprivileged programs there is a limit of 64 states per such 'branching' instructions (maximum length is tracked by max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit and cilium programs can have max_states_per_insn to be 100 or higher. Walking 100+ states multiplied by number of 'branching' insns during verification consumes significant amount of cpu time. Turned out simple LRU-like mechanism can be used to remove states that unlikely will be helpful in future search pruning. This patch introduces hit_cnt and miss_cnt counters: hit_cnt - this many times this state successfully pruned the search miss_cnt - this many times this state was not equivalent to other states (and that other states were added to state list) The heuristic introduced in this patch is: if (sl->miss_cnt > sl->hit_cnt * 3 + 3) /* drop this state from future considerations */ Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (the analysis was done on selftests/progs/* and on cilium programs) The end result is this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, now it's ~10. There is a slight increase in insn_processed for cilium progs: before after bpf_lb-DLB_L3.o 1831 1838 bpf_lb-DLB_L4.o 3029 3218 bpf_lb-DUNKNOWN.o 1064 1064 bpf_lxc-DDROP_ALL.o 26309 26935 bpf_lxc-DUNKNOWN.o 33517 34439 bpf_netdev.o 9713 9721 bpf_overlay.o 6184 6184 bpf_lcx_jit.o 37335 39389 And 2-3 times improvement in the verification speed. Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e2001c1e40b3..a636db4a7a4e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6152,11 +6152,13 @@ static int propagate_liveness(struct bpf_verifier_env *env, static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl; + struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - sl = env->explored_states[insn_idx]; + pprev = &env->explored_states[insn_idx]; + sl = *pprev; + if (!sl) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here @@ -6167,6 +6169,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) while (sl != STATE_LIST_MARK) { if (states_equal(env, &sl->state, cur)) { + sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -6182,8 +6185,35 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return err; return 1; } - sl = sl->next; states_cnt++; + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful. Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } + pprev = &sl->next; + sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -7836,6 +7866,14 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; -- cgit v1.2.3 From 25af32dad8047d180e70e233c85b909dd6587cc5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:42 -0700 Subject: bpf: improve verification speed by not remarking live_read With large verifier speed improvement brought by the previous patch mark_reg_read() becomes the hottest function during verification. On a typical program it consumes 40% of cpu. mark_reg_read() walks parentage chain of registers to mark parents as LIVE_READ. Once the register is marked there is no need to remark it again in the future. Hence stop walking the chain once first LIVE_READ is seen. This optimization drops mark_reg_read() time from 40% of cpu to <1% and overall 2x improvement of verification speed. For some programs the longest_mark_read_walk counter improves from ~500 to ~5 Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Reviewed-by: Edward Cree Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a636db4a7a4e..94cf6efc5df6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1151,6 +1151,15 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + if (parent->live & REG_LIVE_READ) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + */ + break; /* ... then we depend on parent's value */ parent->live |= REG_LIVE_READ; state = parent; -- cgit v1.2.3 From 71dde681a8cea1ccff2c7b3be83c043ab6b2a977 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:43 -0700 Subject: bpf: convert temp arrays to kvcalloc Temporary arrays used during program verification need to be vmalloc-ed to support large bpf programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 94cf6efc5df6..ad3494a881da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5313,13 +5313,13 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } @@ -5417,8 +5417,8 @@ check_state: ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); return ret; } @@ -7898,7 +7898,7 @@ static void free_states(struct bpf_verifier_env *env) } } - kfree(env->explored_states); + kvfree(env->explored_states); } static void print_verification_stats(struct bpf_verifier_env *env) @@ -7994,7 +7994,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; -- cgit v1.2.3 From 4f73379ec5c2891598aa715c6df7ac9afdc86fbf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:44 -0700 Subject: bpf: verbose jump offset overflow check Larger programs may trigger 16-bit jump offset overflow check during instruction patching. Make this error verbose otherwise users cannot decipher error code without printks in the verifier. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 11 ++++++----- kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..2966cb368bf4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -438,6 +438,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -453,8 +454,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, off + 1, off + len, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -463,7 +464,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -1096,13 +1097,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ad3494a881da..6dcfeb44bb8e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6932,8 +6932,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; + } if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; adjust_subprog_starts(env, off, len); -- cgit v1.2.3 From c04c0d2b968ac45d6ef020316808ef6c82325a82 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:45 -0700 Subject: bpf: increase complexity limit and maximum program size Large verifier speed improvements allow to increase verifier complexity limit. Now regardless of the program composition and its size it takes little time for the verifier to hit insn_processed limit. On typical x86 machine non-debug kernel processes 1M instructions in 1/10 of a second. (before these speed improvements specially crafted programs could be hitting multi-second verification times) Full kasan kernel with debug takes ~1 second for the same 1M insns. Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M. Also increase the number of instructions per program from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit. 4k limit was confusing to users, since small programs with hundreds of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit. Sometimes adding more insns and bpf_trace_printk debug statements would make the verifier accept the program while removing code would make the verifier reject it. Some user space application started to add #define MAX_FOO to their programs and do: MAX_FOO=100; again: compile with MAX_FOO; try to load; if (fails_to_load) { reduce MAX_FOO; goto again; } to be able to fit maximum amount of processing into single program. Other users artificially split their single program into a set of programs and use all 32 iterations of tail_calls to increase compute limits. And the most advanced folks used unlimited tc-bpf filter list to execute many bpf programs. Essentially the users managed to workaround 4k insn limit. This patch removes the limit for root programs from uapi. BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit and success to load the program no longer depends on program size, but on 'smartness' of the verifier only. The verifier will continue to get smarter with every kernel release. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index afca36f53c49..1d65e56594db 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1557,7 +1557,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + if (attr->insn_cnt == 0 || + attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6dcfeb44bb8e..b631e89e7a51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -176,7 +176,6 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 #define BPF_COMPLEXITY_LIMIT_STATES 64 -- cgit v1.2.3 From 7a9f5c65abcc9644b11738ca0815510cb5510eaf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:46 -0700 Subject: bpf: increase verifier log limit The existing 16Mbyte verifier log limit is not enough for log_level=2 even for small programs. Increase it to 1Gbyte. Note it's not a kernel memory limit. It's an amount of memory user space provides to store the verifier log. The kernel populates it 1k at a time. Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b631e89e7a51..bb27b675923c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7974,7 +7974,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } -- cgit v1.2.3 From d6e486ee0ef2f99a4069d9186e53dac61b28cb3c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 3 Apr 2019 16:03:54 -0700 Subject: cgroup: remove extra cgroup_migrate_finish() call The callers of cgroup_migrate_prepare_dst() correctly call cgroup_migrate_finish() for success and failure cases both. No need to call it in cgroup_migrate_prepare_dst() in failure case. Signed-off-by: Shakeel Butt Reviewed-by: Daniel Jordan Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3f2b4bde0f9c..f219c195a9a5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2602,7 +2602,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - goto err; + return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2634,9 +2634,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) } return 0; -err: - cgroup_migrate_finish(mgctx); - return -ENOMEM; } /** -- cgit v1.2.3 From 9419a3191dcb27f24478d288abaab697228d28e6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Apr 2019 21:04:13 -0400 Subject: acct_on(): don't mess with freeze protection What happens there is that we are replacing file->path.mnt of a file we'd just opened with a clone and we need the write count contribution to be transferred from original mount to new one. That's it. We do *NOT* want any kind of freeze protection for the duration of switchover. IOW, we should just use __mnt_{want,drop}_write() for that switchover; no need to bother with mnt_{want,drop}_write() there. Tested-by: Amir Goldstein Reported-by: syzbot+2a73a6ea9507b7112141@syzkaller.appspotmail.com Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = mnt_want_write(internal); + err = __mnt_want_write(internal); if (err) { mntput(internal); kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); mntput(mnt); return 0; } -- cgit v1.2.3 From f2bcd05ec7b839ff826d2008506ad2d2dff46a59 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:37 -0700 Subject: bpf: Reject indirect var_off stack access in raw mode It's hard to guarantee that whole memory is marked as initialized on helper return if uninitialized stack is accessed with variable offset since specific bounds are unknown to verifier. This may cause uninitialized stack leaking. Reject such an access in check_stack_boundary to prevent possible leaking. There are no known use-cases for indirect uninitialized stack access with variable offset so it shouldn't break anything. Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb27b675923c..0f12fda35626 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2226,6 +2226,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (err) return err; } else { + /* Only initialized buffer on stack is allowed to be accessed + * with variable offset. With uninitialized buffer it's hard to + * guarantee that whole memory is marked as initialized on + * helper return since specific bounds are unknown what may + * cause uninitialized stack leaking. + */ + if (meta && meta->raw_mode) + meta = NULL; + min_off = reg->smin_value + reg->off; max_off = reg->umax_value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, -- cgit v1.2.3 From 088ec26d9c2da9d879ab73e3f4117f9df6c566ee Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:39 -0700 Subject: bpf: Reject indirect var_off stack access in unpriv mode Proper support of indirect stack access with variable offset in unprivileged mode (!root) requires corresponding support in Spectre masking for stack ALU in retrieve_ptr_limit(). There are no use-case for variable offset in unprivileged mode though so make verifier reject such accesses for simplicity. Pointer arithmetics is one (and only?) way to cause variable offset and it's already rejected in unpriv mode so that verifier won't even get to helper function whose argument contains variable offset, e.g.: 0: (7a) *(u64 *)(r10 -16) = 0 1: (7a) *(u64 *)(r10 -8) = 0 2: (61) r2 = *(u32 *)(r1 +0) 3: (57) r2 &= 4 4: (17) r2 -= 16 5: (0f) r2 += r10 variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Still it looks like a good idea to reject variable offset indirect stack access for unprivileged mode in check_stack_boundary() explicitly. Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0f12fda35626..8400c1f33cd4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2226,6 +2226,19 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (err) return err; } else { + /* Variable offset is prohibited for unprivileged mode for + * simplicity since it requires corresponding support in + * Spectre masking for stack ALU. + * See also retrieve_ptr_limit(). + */ + if (!env->allow_ptr_leaks) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", + regno, tn_buf); + return -EACCES; + } /* Only initialized buffer on stack is allowed to be accessed * with variable offset. With uninitialized buffer it's hard to * guarantee that whole memory is marked as initialized on @@ -3339,6 +3352,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, switch (ptr_reg->type) { case PTR_TO_STACK: + /* Indirect variable offset stack access is prohibited in + * unprivileged mode so it's not handled here. + */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) *ptr_limit = MAX_BPF_STACK + off; -- cgit v1.2.3 From 107c26a70ca81bfc33657366ad69d02fdc9efc9d Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:41 -0700 Subject: bpf: Sanity check max value for var_off stack access As discussed in [1] max value of variable offset has to be checked for overflow on stack access otherwise verifier would accept code like this: 0: (b7) r2 = 6 1: (b7) r3 = 28 2: (7a) *(u64 *)(r10 -16) = 0 3: (7a) *(u64 *)(r10 -8) = 0 4: (79) r4 = *(u64 *)(r1 +168) 5: (c5) if r4 s< 0x0 goto pc+4 R1=ctx(id=0,off=0,imm=0) R2=inv6 R3=inv28 R4=inv(id=0,umax_value=9223372036854775807,var_off=(0x0; 0x7fffffffffffffff)) R10=fp0,call_-1 fp-8=mmmmmmmm fp-16=mmmmmmmm 6: (17) r4 -= 16 7: (0f) r4 += r10 8: (b7) r5 = 8 9: (85) call bpf_getsockopt#57 10: (b7) r0 = 0 11: (95) exit , where R4 obviosly has unbounded max value. Fix it by checking that reg->smax_value is inside (-BPF_MAX_VAR_OFF; BPF_MAX_VAR_OFF) range. reg->smax_value is used instead of reg->umax_value because stack pointers are calculated using negative offset from fp. This is opposite to e.g. map access where offset must be non-negative and where umax_value is used. Also dedicated verbose logs are added for both min and max bound check failures to have diagnostics consistent with variable offset handling in check_map_access(). [1] https://marc.info/?l=linux-netdev&m=155433357510597&w=2 Fixes: 2011fccfb61b ("bpf: Support variable offset stack access from helpers") Reported-by: Daniel Borkmann Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8400c1f33cd4..f2d600199e66 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2248,16 +2248,28 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smax_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "R%d unbounded indirect variable offset stack access\n", + regno); + return -EACCES; + } min_off = reg->smin_value + reg->off; - max_off = reg->umax_value + reg->off; + max_off = reg->smax_value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, zero_size_allowed); - if (err) + if (err) { + verbose(env, "R%d min value is outside of stack bound\n", + regno); return err; + } err = __check_stack_boundary(env, regno, max_off, access_size, zero_size_allowed); - if (err) + if (err) { + verbose(env, "R%d max value is outside of stack bound\n", + regno); return err; + } } if (meta && meta->raw_mode) { -- cgit v1.2.3 From 1fbd20f8b77b366ea4aeb92ade72daa7f36a7e3b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 3 Apr 2019 23:22:43 -0700 Subject: bpf: Add missed newline in verifier verbose log check_stack_access() that prints verbose log is used in adjust_ptr_min_max_vals() that prints its own verbose log and now they stick together, e.g.: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1R2 stack pointer arithmetic goes out of range, prohibited for !root Add missing newline so that log is more readable: variable stack access var_off=(0xfffffffffffffff0; 0x4) off=-16 size=1 R2 stack pointer arithmetic goes out of range, prohibited for !root Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Andrey Ignatov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f2d600199e66..48718e1da16d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1426,7 +1426,7 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } -- cgit v1.2.3 From 12f2639038ef420fe796171ffb810b30d1ac0619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 5 Apr 2019 21:46:12 +0200 Subject: tracing: stop making gpio tracing configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpio tracing was made configurable in 4.4-rc1 (commit ddd70280bf0e ("tracing: gpio: Add Kconfig option for enabling/disabling trace events")). Since then it is the only event type that can be compiled conditionally. Given that there is only little overhead I don't understand the reasoning and I was annoyed more than once that gpio events were not available without recompiling. So drop the Kconfig symbol and make gpio events available unconditionally. Signed-off-by: Uwe Kleine-König Signed-off-by: Linus Walleij --- kernel/trace/Kconfig | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8bd1d6d001d7..5d965cef6c77 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -774,13 +774,6 @@ config TRACE_EVAL_MAP_FILE If unsure, say N -config TRACING_EVENTS_GPIO - bool "Trace gpio events" - depends on GPIOLIB - default y - help - Enable tracing events for gpio subsystem - config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL -- cgit v1.2.3 From 1e144d73f7295f766568c357448a11eb12868e29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 16:07:48 -0400 Subject: tracing: Add trace_array parameter to create_event_filter() Pass in the trace_array that represents the instance the filter being changed is in to create_event_filter(). This will allow for error messages that happen when writing to the filter can be displayed in the proper instance "error_log" file. Note, for calls to create_filter() (that was also modified to support create_event_filter()), that changes filters that do not exist in a instance (for perf for example), NULL may be passed in, which means that there will not be any message to log for that filter. Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 ++- kernel/trace/trace_events_filter.c | 25 ++++++++++++++----------- kernel/trace/trace_events_trigger.c | 3 ++- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b711edbef7e7..809c5d7f0064 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1553,7 +1553,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct trace_event_call *call, +extern int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 290d42c59101..2b63930cd3e6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -920,7 +920,8 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static void append_filter_err(struct filter_parse_error *pe, +static void append_filter_err(struct trace_array *tr, + struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; @@ -1607,7 +1608,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (err) { filter_disable(file); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(pe, filter); + append_filter_err(tr, pe, filter); } else event_set_filtered_flag(file); @@ -1719,7 +1720,8 @@ static void create_filter_finish(struct filter_parse_error *pe) * information if @set_str is %true and the caller is responsible for * freeing it. */ -static int create_filter(struct trace_event_call *call, +static int create_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_string, bool set_str, struct event_filter **filterp) { @@ -1736,17 +1738,18 @@ static int create_filter(struct trace_event_call *call, err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); create_filter_finish(pe); return err; } -int create_event_filter(struct trace_event_call *call, +int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { - return create_filter(call, filter_str, set_str, filterp); + return create_filter(tr, call, filter_str, set_str, filterp); } /** @@ -1773,7 +1776,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); } } create_filter_finish(pe); @@ -1804,7 +1807,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) return 0; } - err = create_filter(call, filter_string, true, &filter); + err = create_filter(file->tr, call, filter_string, true, &filter); /* * Always swap the call filter with the new filter @@ -2060,7 +2063,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - err = create_filter(call, filter_str, false, &filter); + err = create_filter(NULL, call, filter_str, false, &filter); if (err) goto free_filter; @@ -2209,8 +2212,8 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = create_filter(&event_ftrace_test_filter, d->filter, - false, &filter); + err = create_filter(NULL, &event_ftrace_test_filter, + d->filter, false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str, goto out; /* The filter is for the 'trigger' event, not the triggered event */ - ret = create_event_filter(file->event_call, filter_str, false, &filter); + ret = create_event_filter(file->tr, file->event_call, + filter_str, false, &filter); /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. -- cgit v1.2.3 From d0cd871ba0d613e09366e4b6a17946dfcf51db7c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 22:30:22 -0400 Subject: tracing: Have histogram code pass around trace_array for error handling Have the trace_array that associates the trace instance of the histogram passed around to functions so that error handling can display the error message in the proper instance. Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 142 ++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 071c62cacba7..a167e439e9a1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -619,7 +619,7 @@ static void last_cmd_set(struct trace_event_file *file, char *str) snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err(u8 err_type, u8 err_pos) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); } @@ -1756,7 +1756,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1807,7 +1807,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err(HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, + errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -2042,7 +2043,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; } -static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +static int parse_assignment(struct trace_array *tr, + char *str, struct hist_trigger_attrs *attrs) { int ret = 0; @@ -2098,7 +2100,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err(HIST_ERR_TOO_MANY_VARS, errpos(str)); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -2115,7 +2117,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) return ret; } -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) { struct hist_trigger_attrs *attrs; int ret = 0; @@ -2128,7 +2131,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) char *str = strsep(&trigger_str, ":"); if (strchr(str, '=')) { - ret = parse_assignment(str, attrs); + ret = parse_assignment(tr, str, attrs); if (ret) goto free; } else if (strcmp(str, "pause") == 0) @@ -2684,6 +2687,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, char *var_name) { struct hist_field *var_field = NULL, *ref_field = NULL; + struct trace_array *tr = hist_data->event_file->tr; if (!is_var_ref(var_name)) return NULL; @@ -2696,7 +2700,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err(HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); + hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2707,6 +2711,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; + struct trace_array *tr = file->tr; modifier = str = kstrdup(field_str, GFP_KERNEL); if (!modifier) @@ -2730,7 +2735,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err(HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2746,7 +2751,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err(HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2808,7 +2813,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); if (!s) { - hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + hist_field = parse_var_ref(hist_data, ref_system, + ref_event, ref_var); if (hist_field) { if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); @@ -2857,7 +2863,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. explicit parens required */ if (level > 3) { - hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2912,7 +2918,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } -static int check_expr_operands(struct hist_field *operand1, +static int check_expr_operands(struct trace_array *tr, + struct hist_field *operand1, struct hist_field *operand2) { unsigned long operand1_flags = operand1->flags; @@ -2940,7 +2947,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err(HIST_ERR_TIMESTAMP_MISMATCH, 0); + hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2958,7 +2965,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err(HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -3003,7 +3010,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(operand1, operand2); + ret = check_expr_operands(file->tr, operand1, operand2); if (ret) goto free; @@ -3196,14 +3203,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err(HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3216,7 +3223,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err(HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3277,7 +3284,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err(HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3289,7 +3296,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err(HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); + hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3422,25 +3429,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, { struct hist_field *val = NULL, *var = NULL; unsigned long flags = HIST_FIELD_FL_VAR; + struct trace_array *tr = file->tr; struct field_var *field_var; int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err(HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err(HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); + hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3767,19 +3775,20 @@ static int track_data_create(struct hist_trigger_data *hist_data, { struct hist_field *var_field, *ref_field, *track_var = NULL; struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; char *track_data_var_str; int ret = 0; track_data_var_str = data->track_data.var_str; if (track_data_var_str[0] != '$') { - hist_err(HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); + hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } track_data_var_str++; var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err(HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); + hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3792,7 +3801,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONMAX) track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3800,7 +3809,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, if (data->handler == HANDLER_ONCHANGE) track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); if (IS_ERR(track_var)) { - hist_err(HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); ret = PTR_ERR(track_var); goto out; } @@ -3811,7 +3820,8 @@ static int track_data_create(struct hist_trigger_data *hist_data, return ret; } -static int parse_action_params(char *params, struct action_data *data) +static int parse_action_params(struct trace_array *tr, char *params, + struct action_data *data) { char *param, *saved_param; bool first_param = true; @@ -3819,20 +3829,20 @@ static int parse_action_params(char *params, struct action_data *data) while (params) { if (data->n_params >= SYNTH_FIELDS_MAX) { - hist_err(HIST_ERR_TOO_MANY_PARAMS, 0); + hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); goto out; } param = strsep(¶ms, ","); if (!param) { - hist_err(HIST_ERR_PARAM_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err(HIST_ERR_INVALID_PARAM, errpos(param)); + hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); ret = -EINVAL; goto out; } @@ -3856,7 +3866,7 @@ static int parse_action_params(char *params, struct action_data *data) return ret; } -static int action_parse(char *str, struct action_data *data, +static int action_parse(struct trace_array *tr, char *str, struct action_data *data, enum handler_id handler) { char *action_name; @@ -3864,14 +3874,14 @@ static int action_parse(char *str, struct action_data *data, strsep(&str, "."); if (!str) { - hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } action_name = strsep(&str, "("); if (!action_name || !str) { - hist_err(HIST_ERR_ACTION_NOT_FOUND, 0); + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; goto out; } @@ -3880,12 +3890,12 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!params) { - hist_err(HIST_ERR_NO_SAVE_PARAMS, 0); + hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; goto out; } - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; @@ -3894,7 +3904,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3906,7 +3916,7 @@ static int action_parse(char *str, struct action_data *data, char *params = strsep(&str, ")"); if (!str) { - hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(params)); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); ret = -EINVAL; goto out; } @@ -3916,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data, else if (handler == HANDLER_ONCHANGE) data->track_data.check_val = check_track_val_changed; else { - hist_err(HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); ret = -EINVAL; goto out; } @@ -3931,7 +3941,7 @@ static int action_parse(char *str, struct action_data *data, data->use_trace_keyword = true; if (params) { - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) goto out; } @@ -3984,7 +3994,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, goto free; } - ret = action_parse(str, data, handler); + ret = action_parse(hist_data->event_file->tr, str, data, handler); if (ret) goto free; out: @@ -4054,6 +4064,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, struct action_data *data, char *system, char *event, char *var) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field; var++; /* skip '$' */ @@ -4069,7 +4080,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, } if (!hist_field) - hist_err(HIST_ERR_PARAM_NOT_FOUND, errpos(var)); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } @@ -4127,6 +4138,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, static int trace_action_create(struct hist_trigger_data *hist_data, struct action_data *data) { + struct trace_array *tr = hist_data->event_file->tr; char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; @@ -4144,7 +4156,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, event = find_synth_event(synth_event_name); if (!event) { - hist_err(HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); + hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } @@ -4205,14 +4217,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, continue; } - hist_err(HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); + hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err(HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); + hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } @@ -4231,6 +4243,7 @@ static int action_create(struct hist_trigger_data *hist_data, struct action_data *data) { struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; struct track_data *track_data; struct field_var *field_var; unsigned int i; @@ -4258,7 +4271,7 @@ static int action_create(struct hist_trigger_data *hist_data, if (data->action == ACTION_SAVE) { if (hist_data->n_save_vars) { ret = -EEXIST; - hist_err(HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); + hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); goto out; } @@ -4271,7 +4284,8 @@ static int action_create(struct hist_trigger_data *hist_data, field_var = create_target_field_var(hist_data, NULL, NULL, param); if (IS_ERR(field_var)) { - hist_err(HIST_ERR_FIELD_VAR_CREATE_FAIL, errpos(param)); + hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, + errpos(param)); ret = PTR_ERR(field_var); kfree(param); goto out; @@ -4305,18 +4319,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err(HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err(HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); + hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err(HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); + hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } @@ -4332,7 +4346,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - ret = action_parse(str, data, HANDLER_ONMATCH); + ret = action_parse(tr, str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4401,13 +4415,14 @@ static int create_var_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *var_name, char *expr_str) { + struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err(HIST_ERR_DUPLICATE_VAR, errpos(var_name)); + hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4464,8 +4479,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field = NULL; - unsigned long flags = 0; unsigned int key_size; int ret = 0; @@ -4488,7 +4503,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err(HIST_ERR_INVALID_REF_KEY, errpos(field_str)); + hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4589,6 +4604,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data) static int parse_var_defs(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; char *s, *str, *var_name, *field_str; unsigned int i, j, n_vars = 0; int ret = 0; @@ -4602,13 +4618,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err(HIST_ERR_MALFORMED_ASSIGNMENT, errpos(var_name)); + hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, + errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err(HIST_ERR_TOO_MANY_VARS, errpos(var_name)); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -5829,6 +5846,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, { struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + struct trace_array *tr = file->tr; int ret = 0; if (hist_data->attrs->name) { @@ -5836,7 +5854,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err(HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); + hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5857,7 +5875,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err(HIST_ERR_TRIGGER_EEXIST, 0); + hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5865,7 +5883,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err(HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); + hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5890,7 +5908,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err(HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); + hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } @@ -6108,7 +6126,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } - attrs = parse_hist_trigger_attrs(trigger); + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); -- cgit v1.2.3 From 2f754e771b1a6feba670782e82c45555984ac43b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 Apr 2019 22:52:21 -0400 Subject: tracing: Have the error logs show up in the proper instances As each instance has their own error_log file, it makes more sense that the instances show the errors of their own instead of all error_logs having the same data. Make it that the errors show up in the instance error_log file that the error happens in. If no instance trace_array is available, then NULL can be passed in which will create the error in the top level instance (the one at the top of the tracefs directory). Reviewed-by: Masami Hiramatsu Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 55 +++++++++++++++++++++++++------------- kernel/trace/trace.h | 5 +++- kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_events_hist.c | 3 ++- kernel/trace/trace_probe.c | 2 +- 5 files changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7978168f5041..3d55e9daae8c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6897,25 +6897,22 @@ struct tracing_log_err { char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ }; -static LIST_HEAD(tracing_err_log); static DEFINE_MUTEX(tracing_err_log_lock); -static unsigned int n_tracing_err_log_entries; - -struct tracing_log_err *get_tracing_log_err(void) +struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; - if (n_tracing_err_log_entries < TRACING_LOG_ERRS_MAX) { + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { err = kzalloc(sizeof(*err), GFP_KERNEL); if (!err) err = ERR_PTR(-ENOMEM); - n_tracing_err_log_entries++; + tr->n_err_log_entries++; return err; } - err = list_first_entry(&tracing_err_log, struct tracing_log_err, list); + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); list_del(&err->list); return err; @@ -6949,6 +6946,7 @@ unsigned int err_pos(char *cmd, const char *str) /** * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) * @loc: A string describing where the error occurred * @cmd: The tracing command that caused the error * @errs: The array of loc-specific static error strings @@ -6973,13 +6971,17 @@ unsigned int err_pos(char *cmd, const char *str) * existing callers for examples of how static strings are typically * defined for use with tracing_log_err(). */ -void tracing_log_err(const char *loc, const char *cmd, +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, const char **errs, u8 type, u8 pos) { struct tracing_log_err *err; + if (!tr) + tr = &global_trace; + mutex_lock(&tracing_err_log_lock); - err = get_tracing_log_err(); + err = get_tracing_log_err(tr); if (PTR_ERR(err) == -ENOMEM) { mutex_unlock(&tracing_err_log_lock); return; @@ -6993,34 +6995,38 @@ void tracing_log_err(const char *loc, const char *cmd, err->info.pos = pos; err->info.ts = local_clock(); - list_add_tail(&err->list, &tracing_err_log); + list_add_tail(&err->list, &tr->err_log); mutex_unlock(&tracing_err_log_lock); } -static void clear_tracing_err_log(void) +static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; mutex_lock(&tracing_err_log_lock); - list_for_each_entry_safe(err, next, &tracing_err_log, list) { + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); kfree(err); } - n_tracing_err_log_entries = 0; + tr->n_err_log_entries = 0; mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; + mutex_lock(&tracing_err_log_lock); - return seq_list_start(&tracing_err_log, *pos); + return seq_list_start(&tr->err_log, *pos); } static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) { - return seq_list_next(v, &tracing_err_log, pos); + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); } static void tracing_err_log_seq_stop(struct seq_file *m, void *v) @@ -7067,15 +7073,25 @@ static const struct seq_operations tracing_err_log_seq_ops = { static int tracing_err_log_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; int ret = 0; + if (trace_array_get(tr) < 0) + return -ENODEV; + /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - clear_tracing_err_log(); + clear_tracing_err_log(tr); - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &tracing_err_log_seq_ops); - + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + trace_array_put(tr); + } + } return ret; } @@ -7091,6 +7107,7 @@ static const struct file_operations tracing_err_log_fops = { .write = tracing_err_log_write, .read = seq_read, .llseek = seq_lseek, + .release = tracing_release_generic_tr, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) @@ -8293,6 +8310,7 @@ struct trace_array *trace_array_create(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9087,6 +9105,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 809c5d7f0064..da00a3d508c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -293,11 +293,13 @@ struct trace_array { int nr_topts; bool clear_trace; int buffer_percent; + unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; @@ -1886,7 +1888,8 @@ extern ssize_t trace_parse_run_command(struct file *file, int (*createfn)(int, char**)); extern unsigned int err_pos(char *cmd, const char *str); -extern void tracing_log_err(const char *loc, const char *cmd, +extern void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, const char **errs, u8 type, u8 pos); /* diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2b63930cd3e6..180ecb390baa 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -949,12 +949,12 @@ static void append_filter_err(struct trace_array *tr, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); - tracing_log_err("event filter parse error", + tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); - tracing_log_err("event filter parse error", + tracing_log_err(tr, "event filter parse error", filter->filter_string, err_text, FILT_ERR_ERRNO, 0); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a167e439e9a1..a1136e043f17 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -621,7 +621,8 @@ static void last_cmd_set(struct trace_event_file *file, char *str) static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - tracing_log_err(last_cmd_loc, last_cmd, err_text, err_type, err_pos); + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } static void hist_err_clear(void) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e11f98c49d72..4cc2d467d34c 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -186,7 +186,7 @@ void __trace_probe_log_err(int offset, int err_type) } *(p - 1) = '\0'; - tracing_log_err(trace_probe_log.subsystem, command, + tracing_log_err(NULL, trace_probe_log.subsystem, command, trace_probe_err_text, err_type, pos + offset); kfree(command); -- cgit v1.2.3 From a8d655792a32312f6715ac789b860fee50168106 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 31 Mar 2019 18:48:25 -0500 Subject: tracing: Add error_log to README Add brief blurb about error_log to the 'Important files' section. Link: http://lkml.kernel.org/r/c81e60f9aded495081231a32d2d1023c4d043a7a.1554072478.git.tom.zanussi@linux.intel.com Acked-by: Masami Hiramatsu Acked-by: Namhyung Kim Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3d55e9daae8c..2bc18de7f0dc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4702,6 +4702,7 @@ static const char readme_msg[] = " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t-change the clock used to order events\n" -- cgit v1.2.3 From 4f5fbd78a7b40bab538ae0d316363530da751e42 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 26 Mar 2019 20:13:11 +0800 Subject: rcu: validate arguments for rcu tracepoints When CONFIG_RCU_TRACE is not set, all these tracepoints are defined as do-nothing macro. We'd better make those inline functions that take proper arguments. As RCU_TRACE() is defined as do-nothing marco as well when CONFIG_RCU_TRACE is not set, so we can clean it up. Link: http://lkml.kernel.org/r/1553602391-11926-4-git-send-email-laoar.shao@gmail.com Reviewed-by: Paul E. McKenney Signed-off-by: Yafang Shao Signed-off-by: Steven Rostedt (VMware) --- kernel/rcu/rcu.h | 9 ++------- kernel/rcu/tree.c | 8 ++++---- 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..442ace406ac9 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -11,11 +11,6 @@ #define __LINUX_RCU_H #include -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) + trace_rcu_invoke_kfree_callback(rn, head, offset); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head);) + trace_rcu_invoke_callback(rn, head); f = head->func; WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..906563a1cdea 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2352,14 +2352,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp) */ int rcutree_dying_cpu(unsigned int cpu) { - RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) - RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) + bool blkd; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; - RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; -- cgit v1.2.3 From c13edf8106f6ad1edb9b7e011351fbaf83ceb992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Leger?= Date: Wed, 27 Mar 2019 14:06:27 +0100 Subject: dma: select GENERIC_ALLOCATOR for DMA_REMAP When DMA_REMAP is enabled, code in remap.c needs generic allocator. It currently worked since few architectures uses it (arm64, csky) and they both select GENERIC_ALLOCATOR. Select it when using DMA_REMAP to have correct dependencies. Signed-off-by: Clement Leger Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a06ba3013b3b..52b704e2b97a 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -57,6 +57,7 @@ config SWIOTLB config DMA_REMAP depends on MMU + select GENERIC_ALLOCATOR bool config DMA_DIRECT_REMAP -- cgit v1.2.3 From d7e02a931235de0779d44c6f8d211df0eca304b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Mar 2019 18:45:21 +0100 Subject: dma-mapping: remove leftover NULL device support Most dma_map_ops implementations already had some issues with a NULL device, or did simply crash if one was fed to them. Now that we have cleaned up all the obvious offenders we can stop to pretend we support this mode. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index fcdb23e8d2fc..2c2772e9702a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -311,7 +311,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - (!dev || dma_capable(dev, dma_addr, size)); + dma_capable(dev, dma_addr, size); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, -- cgit v1.2.3 From e43e2657fe77a37b13643e2469670ecdb0ba5e10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Dec 2018 14:32:02 +0100 Subject: x86/dma: Remove the x86_dma_fallback_dev hack Now that we removed support for the NULL device argument in the DMA API, there is no need to cater for that in the x86 code. Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index c000906348c9..685a53f2a793 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -238,10 +238,6 @@ u64 dma_get_required_mask(struct device *dev) } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif - void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { @@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) -- cgit v1.2.3 From 24acfb71822566e4d469b4992a7b3b9f873e0083 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Mar 2019 17:55:47 +0100 Subject: workqueue: Use normal rcu There is no need for sched_rcu. The undocumented reason why sched_rcu is used is to avoid a few explicit rcu_read_lock()/unlock() pairs by the fact that sched_rcu reader side critical sections are also protected by preempt or irq disabled regions. Replace rcu_read_lock_sched with rcu_read_lock and acquire the RCU lock where it is not yet explicit acquired. Replace local_irq_disable() with rcu_read_lock(). Update asserts. Signed-off-by: Thomas Gleixner [bigeasy: mangle changelog a little] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tejun Heo --- kernel/workqueue.c | 93 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 21721faa923c..37a32884986b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -127,16 +127,16 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. * * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or - * sched-RCU for reads. + * RCU for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -183,7 +183,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -212,7 +212,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; @@ -266,8 +266,8 @@ struct workqueue_struct { char name[WQ_NAME_LEN]; /* I: workqueue name */ /* - * Destruction of workqueue_struct is sched-RCU protected to allow - * walking the workqueues list without grabbing wq_pool_mutex. + * Destruction of workqueue_struct is RCU protected to allow walking + * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq. */ struct rcu_head rcu; @@ -359,20 +359,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -384,7 +384,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -416,7 +416,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -552,7 +552,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. @@ -696,8 +696,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -1133,7 +1133,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1261,6 +1261,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. @@ -1299,10 +1300,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1416,6 +1419,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1472,10 +1476,8 @@ retry: /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1493,7 +1495,9 @@ retry: insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -2975,14 +2979,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3014,10 +3018,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -3504,7 +3509,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). @@ -3558,7 +3563,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ + /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } @@ -4472,7 +4477,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4483,7 +4489,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4509,15 +4516,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); + rcu_read_unlock(); return ret; } @@ -4701,7 +4708,7 @@ void show_workqueue_state(void) unsigned long flags; int pi; - rcu_read_lock_sched(); + rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); @@ -4766,7 +4773,7 @@ void show_workqueue_state(void) touch_nmi_watchdog(); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -5153,16 +5160,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); @@ -5357,7 +5364,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -5365,7 +5373,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } -- cgit v1.2.3 From 3b15d09f7e6db44065aaba5fd16dc7420035c5ad Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 28 Feb 2019 13:13:26 +0800 Subject: time: Introduce jiffies64_to_msecs() there is a similar helper in net/netfilter/nf_tables_api.c, this maybe become a common request someday, so move it to time.c Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Acked-by: John Stultz Signed-off-by: Pablo Neira Ayuso --- kernel/time/time.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..9e3f79d4f5a8 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -783,6 +783,16 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#else + return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * -- cgit v1.2.3 From 699c1868a743f530081f429058616a2dd5d8a4b2 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 8 Apr 2019 12:50:57 -0400 Subject: audit: purge unnecessary list_empty calls The original conditions that led to the use of list_empty() to optimize list_for_each_entry_rcu() in auditfilter.c and auditsc.c code have been removed without removing the list_empty() call, but this code example has been copied several times. Remove the unnecessary list_empty() calls. Please see upstream github issue https://github.com/linux-audit/audit-kernel/issues/112 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 2 -- kernel/auditsc.c | 64 ++++++++++++++++++++++------------------------------ 2 files changed, 27 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..2c3c2f349b23 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1315,8 +1315,6 @@ int audit_filter(int msgtype, unsigned int listtype) int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[listtype])) - goto unlock_and_return; list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 98a98e6dca05..51a2ceb3a1ca 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -771,15 +771,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_DISABLED; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return state; - } + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, ctx->major) && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return state; } } rcu_read_unlock(); @@ -798,9 +796,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (list_empty(list)) - return 0; - list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { @@ -808,7 +803,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, return 1; } } - return 0; } @@ -1945,18 +1939,16 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(inode->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -2065,18 +2057,16 @@ void __audit_inode_child(struct inode *parent, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE - && audit_comparator(parent->i_sb->s_magic, - f->op, f->val) - && e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } -- cgit v1.2.3 From d75f773c86a2b8b7278e2c33343b46a4024bc002 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 25 Mar 2019 21:32:28 +0200 Subject: treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively %pF and %pf are functionally equivalent to %pS and %ps conversion specifiers. The former are deprecated, therefore switch the current users to use the preferred variant. The changes have been produced by the following command: git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done And verifying the result. Link: http://lkml.kernel.org/r/20190325193229.23390-1-sakari.ailus@linux.intel.com Cc: Andy Shevchenko Cc: linux-arm-kernel@lists.infradead.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: xen-devel@lists.xenproject.org Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvdimm@lists.01.org Cc: linux-pci@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-btrfs@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-mm@kvack.org Cc: ceph-devel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Sakari Ailus Acked-by: David Sterba (for btrfs) Acked-by: Mike Rapoport (for mm/memblock.c) Acked-by: Bjorn Helgaas (for drivers/pci) Acked-by: Rafael J. Wysocki Signed-off-by: Petr Mladek --- kernel/async.c | 4 ++-- kernel/events/uprobes.c | 2 +- kernel/fail_function.c | 2 +- kernel/irq/debugfs.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/manage.c | 2 +- kernel/irq/spurious.c | 4 ++-- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 2 +- kernel/time/sched_clock.c | 2 +- kernel/time/timer.c | 2 +- kernel/workqueue.c | 12 ++++++------ 12 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index affa830a198c..48abc0f18eae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ec34a2a6638..ec43ab2fdfda 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -778,7 +778,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..8eee921b384d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2870,7 +2870,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..1002cf61700a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..6502c3ed317e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1328,7 +1328,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7abbeed13421..8ea9e4fb8cc6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2277,7 +2277,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2596,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -4582,7 +4582,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4607,7 +4607,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4639,7 +4639,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? "," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); -- cgit v1.2.3 From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for ldimm64 instruction that indicates that inside the first part of the double insns's imm field is a file descriptor which the verifier then replaces as a full 64bit address of the map into both imm parts. For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insns's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is array map with single entry. It is possible to support more than just single map element by reusing both 16bit off fields of the insns as a map index, so full array map lookup could be expressed that way. It hasn't been implemented here due to lack of concrete use case, but could easily be done so in future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not differ offset 0 between load of map pointer versus load of map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding into off by one to differ between regular map pointer and map value pointer would add unnecessary complexity and increases barrier for debugability thus less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations since maximum possible value size is in u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call which needs to prepare registers according to calling convention, etc, without needing the extra NULL test, and without having to add the offset in an additional instruction to the value base pointer. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user passed offset from the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are normally treated as typical map value handling without anything extra needed from verification side. The two map operations for direct value access have been added to array map for now. In future other types could be supported as well depending on the use case. The main use case for this commit is to allow for BPF loader support for global variables that reside in .data/.rodata/.bss sections such that we can directly load the address of them with minimal additional infrastructure required. Loader support has been added in subsequent commits for libbpf library. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 32 +++++++++++++++++++ kernel/bpf/core.c | 3 +- kernel/bpf/disasm.c | 5 +-- kernel/bpf/syscall.c | 28 ++++++++++++----- kernel/bpf/verifier.c | 86 +++++++++++++++++++++++++++++++++++++++------------ 5 files changed, 124 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c72e0d8e1e65..1a6e9861d554 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2966cb368bf4..ace8c22c8b0e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index de73f55e42fd..d9ce383c0f9c 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1d65e56594db..828518bb947b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + return NULL; } @@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48718e1da16d..6ab7a23fc924 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -6803,8 +6808,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || - insn[1].imm != 0) { - verbose(env, "unrecognized bpf_ld_imm64 insn\n"); + /* In final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. + */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && -- cgit v1.2.3 From be70bcd53de66e86f2726e576307cbdaebd3b1a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:04 +0200 Subject: bpf: do not retain flags that are not tied to map lifetime Both BPF_F_WRONLY / BPF_F_RDONLY flags are tied to the map file descriptor, but not to the map object itself! Meaning, at map creation time BPF_F_RDONLY can be set to make the map read-only from syscall side, but this holds only for the returned fd, so any other fd either retrieved via bpf file system or via map id for the very same underlying map object can have read-write access instead. Given that, keeping the two flags around in the map_flags attribute and exposing them to user space upon map dump is misleading and may lead to false conclusions. Since these two flags are not tied to the map object lets also not store them as map property. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 828518bb947b..56b4b0e08b3b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,13 +166,25 @@ void bpf_map_area_free(void *area) kvfree(area); } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ + /* Some map creation flags are not tied to the map object but + * rather to the map fd instead, so they have no meaning upon + * map object inspection since multiple file descriptors with + * different (access) properties can exist here. Thus, given + * this has zero meaning for the map itself, lets clear these + * from here. + */ + return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} + void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; + map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); } -- cgit v1.2.3 From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG in order to allow for read-only or write-only BPF maps from a BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only applies to system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allows for the exact opposite such that verifier is going to reject program loads if write into a read-only map or a read into a write-only map is detected. For read-only map case also some helpers are forbidden for programs that would alter the map state such as map deletion, update, etc. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could be followed up in future depending on use-case. Main use case here is to forbid writes into .rodata map values from verifier side. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 6 +++++- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/local_storage.c | 6 +++--- kernel/bpf/lpm_trie.c | 3 ++- kernel/bpf/queue_stack_maps.c | 6 +++--- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 46 +++++++++++++++++++++++++++++++++++++++++-- 7 files changed, 62 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1a6e9861d554..217b10bd9f48 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -22,7 +22,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +63,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -472,6 +473,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fed15cf94dca..192d32e77db3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -23,7 +23,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +262,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. */ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6b572e2de7fb..980e8f1f6cb5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -282,8 +282,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 93a5cbbde421..e61630c2e50b 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -538,7 +538,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -553,6 +553,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..0b140d236889 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56b4b0e08b3b..0c9276b54c88 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -501,6 +501,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->spin_lock_off = btf_find_spin_lock(btf, value_type); if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ab7a23fc924..b747434df89c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1439,6 +1439,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -2024,7 +2046,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -2327,6 +2351,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -3059,6 +3087,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -3069,11 +3098,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. + */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); -- cgit v1.2.3 From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows to "freeze" the map globally as read-only / immutable from syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in case of locked map. Main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning BPF loader first allocates map, sets up map value by copying .rodata section into it and once complete, it calls BPF_MAP_FREEZE on the map fd to prevent further modifications. Right now BPF_MAP_FREEZE only takes map fd as argument while remaining bpf_attr members are required to be zero. I didn't add write-only locking here as counterpart since I don't have a concrete use-case for it on my side, and I think it makes probably more sense to wait once there is actually one. In that case bpf_attr can be extended as usual with a flag field and/or others where flag 0 means that we lock the map read-only hence this doesn't prevent to add further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exact one-time write before it is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for it to be eventually made immutable. While this can be made race-free, this approach feels less clean and in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and caller has the guarantee that map is immutable from syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close with BPF map spin locks (which already has BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c9276b54c88..b3ce516e5a20 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -355,6 +355,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side. + */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -376,14 +388,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -727,8 +741,7 @@ static int map_lookup_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -857,8 +870,7 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -969,8 +981,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1021,8 +1032,7 @@ static int map_get_next_key(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1089,8 +1099,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1132,6 +1141,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2735,6 +2774,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; -- cgit v1.2.3 From 3e0ddc4f3ff1436970e96e76f3df3c3b5f5173b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:07 +0200 Subject: bpf: allow . char as part of the object name Trivial addition to allow '.' aside from '_' as "special" characters in the object name. Used to allow for substrings in maps from loader side such as ".bss", ".data", ".rodata", but could also be useful for other purposes. Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b3ce516e5a20..198c9680bf0d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -474,10 +474,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src) const char *end = src + BPF_OBJ_NAME_LEN; memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ + /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { - if (!isalnum(*src) && *src != '_') + if (!isalnum(*src) && + *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } -- cgit v1.2.3 From 1dc92851849cc2235a1efef8f8d5a9255efc5f13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:09 +0200 Subject: bpf: kernel side support for BTF Var and DataSec This work adds kernel-side verification, logging and seq_show dumping of BTF Var and DataSec kinds which are emitted with latest LLVM. The following constraints apply: BTF Var must have: - Its kind_flag is 0 - Its vlen is 0 - Must point to a valid type - Type must not resolve to a forward type - Size of underlying type must be > 0 - Must have a valid name - Can only be a source type, not sink or intermediate one - Name may include dots (e.g. in case of static variables inside functions) - Cannot be a member of a struct/union - Linkage so far can either only be static or global/allocated BTF DataSec must have: - Its kind_flag is 0 - Its vlen cannot be 0 - Its size cannot be 0 - Must have a valid name - Can only be a source type, not sink or intermediate one - Name may include dots (e.g. to represent .bss, .data, .rodata etc) - Cannot be a member of a struct/union - Inner btf_var_secinfo array with {type,offset,size} triple must be sorted by offset in ascending order - Type must always point to BTF Var - BTF resolved size of Var must be <= size provided by triple - DataSec size must be >= sum of triple sizes (thus holes are allowed) btf_var_resolve(), btf_ptr_resolve() and btf_modifier_resolve() are on a high level quite similar but each come with slight, subtle differences. They could potentially be a bit refactored in future which hasn't been done here to ease review. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 417 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 397 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd3921b1514b..0cecf6bab61b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -185,6 +185,16 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, struct_type, member) \ + for (i = 0, member = btf_type_var_secinfo(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", }; struct btf_kind_operations { @@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ + return btf_type_is_var(t) || + btf_type_is_datasec(t); +} + /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + * * btf_type_is_array() because its element (array->type) * refers to another type. Array can be thought of a * special case of struct while array just has the same @@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t) static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || - btf_type_is_ptr(t) || - btf_type_is_struct(t) || - btf_type_is_array(t); + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t) || + btf_type_is_var(t) || + btf_type_is_datasec(t); } /* t->size can be used */ @@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_DATASEC: return true; } @@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t) return (const struct btf_enum *)(t + 1); } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ + return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ + if ((first ? !isalpha(c) : + !isalnum(c)) && + c != '_' && + ((c == '.' && !dot_ok) || + c != '.')) + return false; + return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ const char *src = &btf->strings[offset]; const char *src_limit; - if (!isalpha(*src) && *src != '_') + if (!__btf_name_char_ok(*src, true, dot_ok)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!isalnum(*src) && *src != '_') + if (!__btf_name_char_ok(*src, false, dot_ok)) return false; src++; } @@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, true); +} + static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, + const struct btf_type *datasec_type, + const struct btf_var_secinfo *vsi, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + if (env->phase != CHECK_META) + btf_verifier_log_type(env, datasec_type, NULL); + + __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", + vsi->type, vsi->offset, vsi->size); + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { @@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) return NULL; size = btf->resolved_sizes[size_type_id]; @@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return 0; } +static int btf_var_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type || btf_type_is_resolve_source_only(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + /* We must resolve to something concrete at this point, no + * forward types or similar that would resolve to size of + * zero is allowed. + */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf, btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { + if (btf_type_is_resolve_source_only(index_type) || + btf_type_nosize_or_null(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { + if (btf_type_is_resolve_source_only(elem_type) || + btf_type_nosize_or_null(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_nosize_or_null(member_type)) { + if (btf_type_is_resolve_source_only(member_type) || + btf_type_nosize_or_null(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = { .seq_show = btf_df_seq_show, }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var *var; + u32 meta_needed = sizeof(*var); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !__btf_name_valid(env->btf, t->name_off, true)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + /* A var cannot be in type void */ + if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + var = btf_type_var(t); + if (var->linkage != BTF_VAR_STATIC && + var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + btf_verifier_log_type(env, t, "Linkage not supported"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ + const struct btf_var *var = btf_type_var(t); + + btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { + .check_meta = btf_var_check_meta, + .resolve = btf_var_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_var_log, + .seq_show = btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var_secinfo *vsi; + u64 last_vsi_end_off = 0, sum = 0; + u32 i, meta_needed; + + meta_needed = btf_type_vlen(t) * sizeof(*vsi); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (!btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen == 0"); + return -EINVAL; + } + + if (!t->size) { + btf_verifier_log_type(env, t, "size == 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !btf_name_valid_section(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_vsi(i, t, vsi) { + /* A var cannot be in type void */ + if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid type_id"); + return -EINVAL; + } + + if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset"); + return -EINVAL; + } + + if (!vsi->size || vsi->size > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid size"); + return -EINVAL; + } + + last_vsi_end_off = vsi->offset + vsi->size; + if (last_vsi_end_off > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset+size"); + return -EINVAL; + } + + btf_verifier_log_vsi(env, t, vsi, NULL); + sum += vsi->size; + } + + if (t->size < sum) { + btf_verifier_log_type(env, t, "Invalid btf_info size"); + return -EINVAL; + } + + return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_var_secinfo *vsi; + struct btf *btf = env->btf; + u16 i; + + for_each_vsi_from(i, v->next_member, v->t, vsi) { + u32 var_type_id = vsi->type, type_id, type_size = 0; + const struct btf_type *var_type = btf_type_by_id(env->btf, + var_type_id); + if (!var_type || !btf_type_is_var(var_type)) { + btf_verifier_log_vsi(env, v->t, vsi, + "Not a VAR kind member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, var_type) && + !env_type_is_resolved(env, var_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, var_type, var_type_id); + } + + type_id = var_type->type; + if (!btf_type_id_size(btf, &type_id, &type_size)) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); + return -EINVAL; + } + + if (vsi->size < type_size) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); + return -EINVAL; + } + } + + env_stack_pop_resolved(env, 0, 0); + return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + u32 i; + + seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + for_each_vsi(i, t, vsi) { + var = btf_type_by_id(btf, vsi->type); + if (i) + seq_puts(m, ","); + btf_type_ops(var)->seq_show(btf, var, vsi->type, + data + vsi->offset, bits_offset, m); + } + seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { + .check_meta = btf_datasec_check_meta, + .resolve = btf_datasec_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_datasec_log, + .seq_show = btf_datasec_seq_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, + [BTF_KIND_VAR] = &var_ops, + [BTF_KIND_DATASEC] = &datasec_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, if (!env_type_is_resolved(env, type_id)) return false; - if (btf_type_is_struct(t)) + if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + !btf->resolved_sizes[type_id]; - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || + btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); + return t && + !btf_type_is_modifier(t) && + !btf_type_is_var(t) && + !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { -- cgit v1.2.3 From 2824ecb7010f6a20e9a4140512b798469ab066cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:10 +0200 Subject: bpf: allow for key-less BTF in array map Given we'll be reusing BPF array maps for global data/bss/rodata sections, we need a way to associate BTF DataSec type as its map value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR() macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR") to get initial map to type association going. While more use cases for it are discouraged, this also won't work for global data since the use of array map is a BPF loader detail and therefore unknown at compilation time. For array maps with just a single entry we make an exception in terms of BTF in that key type is declared optional if value type is of DataSec type. The latter LLVM is guaranteed to emit and it also aligns with how we regard global data maps as just a plain buffer area reusing existing map facilities for allowing things like introspection with existing tools. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 15 ++++++++++++++- kernel/bpf/btf.c | 2 +- kernel/bpf/syscall.c | 15 +++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 217b10bd9f48..584636c9e2eb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -391,7 +391,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -428,6 +429,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0cecf6bab61b..cad09858a5f2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -326,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 198c9680bf0d..438199e2eca4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -504,9 +504,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 key_size, value_size; int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; + /* Some maps allow key to be unspecified. */ + if (btf_key_id) { + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + } else { + key_type = btf_type_by_id(btf, 0); + if (!map->ops->map_check_btf) + return -EINVAL; + } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) @@ -573,7 +580,7 @@ static int map_create(union bpf_attr *attr) if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + if (!attr->btf_value_type_id) { err = -EINVAL; goto free_map_nouncharge; } -- cgit v1.2.3 From ee6a6500fe1f5c5a3f18de33fe0178a3c627f6d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 10 Apr 2019 10:45:38 -0400 Subject: ftrace: Remove ASSIGN_OPS_HASH() macro from ftrace.c The ASSIGN_OPS_HASH() macro was moved to fgraph.c where it was used, but for some reason it wasn't removed from ftrace.c, as it is no longer referenced there. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26c8ca9bd06b..bf11e0553450 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,12 +69,8 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#define ASSIGN_OPS_HASH(opsname, val) \ - .func_hash = val, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) -#define ASSIGN_OPS_HASH(opsname, val) #endif enum { -- cgit v1.2.3 From 83ca259489409a1fe8a83dad83a82f32174d4f31 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 5 Apr 2019 09:15:25 +0800 Subject: swiotlb: dump used and total slots when swiotlb buffer is full So far the kernel only prints the requested size if swiotlb buffer if full. It is not possible to know whether it is simply an out of buffer, or it is because swiotlb cannot allocate buffer with the requested size due to fragmentation. As 'io_tlb_used' is available since commit 71602fe6d4e9 ("swiotlb: add debugfs to track swiotlb buffer usage"), both 'io_tlb_used' and 'io_tlb_nslabs' are printed when swiotlb buffer is full. Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2b0c8fd9658e..82c767374c70 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -533,7 +533,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu, used %lu\n", + size, io_tlb_nslabs, io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; -- cgit v1.2.3 From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 438199e2eca4..d995eedfdd16 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,7 +2009,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -2022,6 +2022,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From 2fa717a0337e7acafda9283c938b635191b8036b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 11 Apr 2019 11:46:13 -0400 Subject: ftrace: Do not process STUB functions in ftrace_ops_list_func() The function_graph tracer has a stub function and its ops flag has the FTRACE_OPS_FL_STUB set. As the function graph does not use the ftrace_ops->func pointer but instead is called by a separate part of the ftrace trampoline. The function_graph tracer still requires to pass in a ftrace_ops that may also hold the hash of the functions to call. But there's no reason to test that hash in the function tracing portion. Instead of testing to see if we should call the stub function, just test if the ops has FTRACE_OPS_FL_STUB set, and just skip it. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf11e0553450..433a64f49532 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6260,6 +6260,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Stub functions don't need to be called nor tested */ + if (op->flags & FTRACE_OPS_FL_STUB) + continue; /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true -- cgit v1.2.3 From b1cd609d9b517f01867c211bd520cc805db3068a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Tue, 12 Mar 2019 09:27:09 -0700 Subject: bpf: Add base proto function for cgroup-bpf programs Currently kernel/bpf/cgroup.c contains only one program type and one proto function cgroup_dev_func_proto(). It'd be useful to have base proto function that can be reused for new cgroup-bpf program types coming soon. Introduce cgroup_base_func_proto(). Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4e807973aa80..f6cd38746df2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -701,7 +701,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -725,6 +725,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, -- cgit v1.2.3 From 7b146cebe30cb481b0f70d85779da938da818637 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 12:59:24 -0800 Subject: bpf: Sysctl hook Containerized applications may run as root and it may create problems for whole host. Specifically such applications may change a sysctl and affect applications in other containers. Furthermore in existing infrastructure it may not be possible to just completely disable writing to sysctl, instead such a process should be gradual with ability to log what sysctl are being changed by a container, investigate, limit the set of writable sysctl to currently used ones (so that new ones can not be changed) and eventually reduce this set to zero. The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis. New program type has access to following minimal context: struct bpf_sysctl { __u32 write; }; Where @write indicates whether sysctl is being read (= 0) or written (= 1). Helpers to access sysctl name and value will be introduced separately. BPF_CGROUP_SYSCTL attach point is added to sysctl code right before passing control to ctl_table->proc_handler so that BPF program can either allow or deny access to sysctl. Suggested-by: Roman Gushchin Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++ kernel/bpf/verifier.c | 1 + 3 files changed, 100 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f6cd38746df2..610491b5f0aa 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -768,3 +770,93 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + }; + struct cgroup *cgrp; + int ret; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d995eedfdd16..92c9b8a32b50 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1888,6 +1888,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1966,6 +1969,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; default: return -EINVAL; } @@ -1999,6 +2005,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f25b7c9c20ba..20808e3c95a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5267,6 +5267,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: break; default: return 0; -- cgit v1.2.3 From 808649fb787d918a48a360a668ee4ee9023f0c11 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 13:28:48 -0800 Subject: bpf: Introduce bpf_sysctl_get_name helper Add bpf_sysctl_get_name() helper to copy sysctl name (/proc/sys/ entry) into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem". If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied, e.g. "tcp_mem". Documentation for the new helper is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 610491b5f0aa..a68387043244 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -806,10 +807,77 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return cgroup_base_func_proto(func_id, prog); + switch (func_id) { + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } } static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, -- cgit v1.2.3 From 1d11b3016cec4ed9770b98e82a61708c8f4926e7 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Feb 2019 19:22:15 -0800 Subject: bpf: Introduce bpf_sysctl_get_current_value helper Add bpf_sysctl_get_current_value() helper to copy current sysctl value into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. It provides same string as user space can see by reading corresponding file in /proc/sys/, including new line, etc. Documentation for the new helper is provided in bpf.h UAPI. Since current value is kept in ctl_table->data in a parsed form, ctl_table->proc_handler() with write=0 is called to read that data and convert it to a string. Such a string can later be parsed by a program using helpers that will be introduced separately. Unfortunately it's not trivial to provide API to access parsed data due to variety of data representations (string, intvec, uintvec, ulongvec, custom structures, even NULL, etc). Instead it's assumed that user know how to handle specific sysctl they're interested in and appropriate helpers can be used. Since ctl_table->proc_handler() expects __user buffer, conversion to __user happens for kernel allocated one where the value is stored. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a68387043244..c6b2cf29a54b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -794,15 +794,37 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .head = head, .table = table, .write = write, + .cur_val = NULL, + .cur_len = PAGE_SIZE, }; struct cgroup *cgrp; int ret; + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); rcu_read_unlock(); + kfree(ctx.cur_val); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -869,12 +891,55 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .arg4_type = ARG_ANYTHING, }; +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From 4e63acdff864654cee0ac5aaeda3913798ee78f6 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:38:43 -0800 Subject: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers Add helpers to work with new value being written to sysctl by user space. bpf_sysctl_get_new_value() copies value being written to sysctl into provided buffer. bpf_sysctl_set_new_value() overrides new value being written by user space with a one from provided buffer. Buffer should contain string representation of the value, similar to what can be seen in /proc/sys/. Both helpers can be used only on sysctl write. File position matters and can be managed by an interface that will be introduced separately. E.g. if user space calls sys_write to a file in /proc/sys/ at file position = X, where X > 0, then the value set by bpf_sysctl_set_new_value() will be written starting from X. If program wants to override whole value with specified buffer, file position has to be set to zero. Documentation for the new helpers is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c6b2cf29a54b..ba4e21986760 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -778,6 +778,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -788,7 +795,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type) + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, @@ -796,6 +804,9 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .write = write, .cur_val = NULL, .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, }; struct cgroup *cgrp; int ret; @@ -818,6 +829,18 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, ctx.cur_len = 0; } + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min(PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. */ + ctx.new_len = 0; + } + rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); @@ -825,6 +848,13 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); @@ -932,6 +962,51 @@ static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { .arg3_type = ARG_CONST_SIZE, }; +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -940,6 +1015,10 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; default: return cgroup_base_func_proto(func_id, prog); } -- cgit v1.2.3 From e1550bfe0de47e30484ba91de1e50a91ec1c31f5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:50:52 -0800 Subject: bpf: Add file_pos field to bpf_sysctl ctx Add file_pos field to bpf_sysctl context to read and write sysctl file position at which sysctl is being accessed (read or written). The field can be used to e.g. override whole sysctl value on write to sysctl even when sys_write is called by user space with file_pos > 0. Or BPF program may reject such accesses. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba4e21986760..b2adf22139b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -782,6 +782,9 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise * @new_buf: pointer to pointer to new buffer that will be allocated if program * overrides new value provided by user space on sysctl write * NOTE: it's caller responsibility to free *new_buf if it was set @@ -796,12 +799,14 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type) + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { .head = head, .table = table, .write = write, + .ppos = ppos, .cur_val = NULL, .cur_len = PAGE_SIZE, .new_val = NULL, @@ -1030,14 +1035,22 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, { const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sysctl) || - off % size || type != BPF_READ) + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) return false; switch (off) { case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } default: return false; } @@ -1059,6 +1072,41 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type, write), target_size)); break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. + */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; } return insn - insn_buf; -- cgit v1.2.3 From 57c3bb725a3dd97d960d7e1cd0845d88de53217f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 16:57:10 -0700 Subject: bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types Currently the way to pass result from BPF helper to BPF program is to provide memory area defined by pointer and size: func(void *, size_t). It works great for generic use-case, but for simple types, such as int, it's overkill and consumes two arguments when it could use just one. Introduce new argument types ARG_PTR_TO_INT and ARG_PTR_TO_LONG to be able to pass result from helper to program via pointer to int and long correspondingly: func(int *) or func(long *). New argument types are similar to ARG_PTR_TO_MEM with the following differences: * they don't require corresponding ARG_CONST_SIZE argument, predefined access sizes are used instead (32bit for int, 64bit for long); * it's possible to use more than one such an argument in a helper; * provided pointers have to be aligned. It's easy to introduce similar ARG_PTR_TO_CHAR and ARG_PTR_TO_SHORT argument types. It's not done due to lack of use-case though. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 20808e3c95a8..15ab6fa817ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2462,6 +2462,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_INT || + type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ + if (type == ARG_PTR_TO_INT) + return sizeof(u32); + else if (type == ARG_PTR_TO_LONG) + return sizeof(u64); + + return -EINVAL; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2554,6 +2570,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_int_ptr(arg_type)) { + expected_type = PTR_TO_STACK; + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) + goto err_type; } else { verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -2635,6 +2657,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + } else if (arg_type_is_int_ptr(arg_type)) { + int size = int_ptr_type_to_size(arg_type); + + err = check_helper_mem_access(env, regno, size, false, meta); + if (err) + return err; + err = check_ptr_alignment(env, reg, 0, size, true); } return err; -- cgit v1.2.3 From d7a4cb9b6705a89937d12c8158a35a3145dc967a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 17:55:26 -0700 Subject: bpf: Introduce bpf_strtol and bpf_strtoul helpers Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned long correspondingly. It's similar to user space strtol(3) and strtoul(3) with a few changes to the API: * instead of NUL-terminated C string the helpers expect buffer and buffer length; * resulting long or unsigned long is returned in a separate result-argument; * return value is used to indicate success or failure, on success number of consumed bytes is returned that can be used to identify position to read next if the buffer is expected to contain multiple integers; * instead of *base* argument, *flags* is used that provides base in 5 LSB, other bits are reserved for future use; * number of supported bases is limited. Documentation for the new helpers is provided in bpf.h UAPI. The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs to be able to convert string input to e.g. "ulongvec" output. E.g. "net/ipv4/tcp_mem" consists of three ulong integers. They can be parsed by calling to bpf_strtoul three times. Implementation notes: Implementation includes "../../lib/kstrtox.h" to reuse integer parsing functions. It's done exactly same way as fs/proc/base.c already does. Unfortunately existing kstrtoX function can't be used directly since they fail if any invalid character is present right after integer in the string. Existing simple_strtoX functions can't be used either since they're obsolete and don't handle overflow properly. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 4 ++ kernel/bpf/helpers.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b2adf22139b3..789d4ab2336e 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1016,6 +1016,10 @@ static const struct bpf_func_proto * sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; case BPF_FUNC_sysctl_get_current_value: diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a411fc17d265..4266ffde07ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -363,4 +366,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif -- cgit v1.2.3 From 51356ac89b5a15e5207e8740d5f4f8b71cb7332f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 12 Apr 2019 16:01:01 -0700 Subject: bpf: Fix distinct pointer types warning for ARCH=i386 Fix a new warning reported by kbuild for make ARCH=i386: In file included from kernel/bpf/cgroup.c:11:0: kernel/bpf/cgroup.c: In function '__cgroup_bpf_run_filter_sysctl': include/linux/kernel.h:827:29: warning: comparison of distinct pointer types lacks a cast (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) ^ include/linux/kernel.h:841:4: note: in expansion of macro '__typecheck' (__typecheck(x, y) && __no_side_effects(x, y)) ^~~~~~~~~~~ include/linux/kernel.h:851:24: note: in expansion of macro '__safe_cmp' __builtin_choose_expr(__safe_cmp(x, y), \ ^~~~~~~~~~ include/linux/kernel.h:860:19: note: in expansion of macro '__careful_cmp' #define min(x, y) __careful_cmp(x, y, <) ^~~~~~~~~~~~~ >> kernel/bpf/cgroup.c:837:17: note: in expansion of macro 'min' ctx.new_len = min(PAGE_SIZE, *pcount); ^~~ Fixes: 4e63acdff864 ("bpf: Introduce bpf_sysctl_{get,set}_new_value helpers") Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 789d4ab2336e..e58a6c247f56 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -839,7 +839,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); - ctx.new_len = min(PAGE_SIZE, *pcount); + ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); if (!ctx.new_val || copy_from_user(ctx.new_val, buf, ctx.new_len)) /* Let BPF program decide how to proceed. */ -- cgit v1.2.3 From 53b29c336830db48ad3dc737f88b8c065b1f0851 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 12 Apr 2019 19:38:26 +0800 Subject: swiotlb: save io_tlb_used to local variable before leaving critical section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When swiotlb is full, the kernel would print io_tlb_used. However, the result might be inaccurate at that time because we have left the critical section protected by spinlock. Therefore, we backup the io_tlb_used into local variable before leaving critical section. Fixes: 83ca25948940 ("swiotlb: dump used and total slots when swiotlb buffer is full") Suggested-by: Håkon Bugge Signed-off-by: Dongli Zhang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 82c767374c70..38d57218809c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,6 +445,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, unsigned long mask; unsigned long offset_slots; unsigned long max_slots; + unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -531,10 +532,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, } while (index != wrap); not_found: + tmp_io_tlb_used = io_tlb_used; + spin_unlock_irqrestore(&io_tlb_lock, flags); if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu, used %lu\n", - size, io_tlb_nslabs, io_tlb_used); + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + size, io_tlb_nslabs, tmp_io_tlb_used); return DMA_MAPPING_ERROR; found: io_tlb_used += nslots; -- cgit v1.2.3 From 1b04aee7e2182454a663950e68084fa5ada9625a Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:34 +0100 Subject: bpf: refactor propagate_liveness to eliminate duplicated for loop Propagation for register and stack slot are finished in separate for loop, while they are perfect to be put into a single loop. This could also let them share some common variables in later patches. Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15ab6fa817ce..da285df492fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6254,10 +6254,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, return err; } } - } - /* ... and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { + /* Propagate stack slots. */ state = vstate->frame[frame]; parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && -- cgit v1.2.3 From 3f8cafa4131f67d47c8de326c7dcd561cc65fb38 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:35 +0100 Subject: bpf: refactor propagate_liveness to eliminate code redundance Access to reg states were not factored out, the consequence is long code for dereferencing them which made the indentation not good for reading. This patch factor out these code so the core code in the loop could be easier to follow. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da285df492fd..6fd36a8ba1a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6232,8 +6232,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, struct bpf_verifier_state *vparent) { - int i, frame, err = 0; + struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; + int i, frame, err = 0; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -6243,28 +6244,33 @@ static int propagate_liveness(struct bpf_verifier_env *env, /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); for (frame = 0; frame <= vstate->curframe; frame++) { + parent = vparent->frame[frame]; + state = vstate->frame[frame]; + parent_reg = parent->regs; + state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + if (parent_reg[i].live & REG_LIVE_READ) continue; - if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[frame]->regs[i], - &vparent->frame[frame]->regs[i]); - if (err) - return err; - } + if (!(state_reg[i].live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + if (err) + return err; } /* Propagate stack slots. */ - state = vstate->frame[frame]; - parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + parent_reg = &parent->stack[i].spilled_ptr; + state_reg = &state->stack[i].spilled_ptr; + if (parent_reg->live & REG_LIVE_READ) continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_reg_read(env, &state->stack[i].spilled_ptr, - &parent->stack[i].spilled_ptr); + if (!(state_reg->live & REG_LIVE_READ)) + continue; + err = mark_reg_read(env, state_reg, parent_reg); + if (err) + return err; } } return err; -- cgit v1.2.3 From 55e7f3b5ac94a8c9f2e35961a45e9aa526a9e41d Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:36 +0100 Subject: bpf: factor out reg and stack slot propagation into "propagate_liveness_reg" After code refactor in previous patches, the propagation logic inside the for loop in "propagate_liveness" becomes clear that they are good enough to be factored out into a common function "propagate_liveness_reg". Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6fd36a8ba1a0..3fdb301c4f8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6221,6 +6221,22 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +static int propagate_liveness_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct bpf_reg_state *parent_reg) +{ + int err; + + if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) + return 0; + + err = mark_reg_read(env, reg, parent_reg); + if (err) + return err; + + return 0; +} + /* A write screens off any subsequent reads; but write marks come from the * straight-line code between a state and its parent. When we arrive at an * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -6250,11 +6266,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = state->regs; /* We don't need to worry about FP liveness, it's read-only */ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - if (parent_reg[i].live & REG_LIVE_READ) - continue; - if (!(state_reg[i].live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, &state_reg[i], &parent_reg[i]); + err = propagate_liveness_reg(env, &state_reg[i], + &parent_reg[i]); if (err) return err; } @@ -6264,11 +6277,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, i < parent->allocated_stack / BPF_REG_SIZE; i++) { parent_reg = &parent->stack[i].spilled_ptr; state_reg = &state->stack[i].spilled_ptr; - if (parent_reg->live & REG_LIVE_READ) - continue; - if (!(state_reg->live & REG_LIVE_READ)) - continue; - err = mark_reg_read(env, state_reg, parent_reg); + err = propagate_liveness_reg(env, state_reg, + parent_reg); if (err) return err; } -- cgit v1.2.3 From c342dc109aa5a4f0bb36335cb441aaafc98b98ef Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 12 Apr 2019 22:59:37 +0100 Subject: bpf: refactor "check_reg_arg" to eliminate code redundancy There are a few "regs[regno]" here are there across "check_reg_arg", this patch factor it out into a simple "reg" pointer. The intention is to simplify code indentation and make the later patches in this set look cleaner. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3fdb301c4f8c..c7220153c5b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1177,30 +1177,32 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); return -EINVAL; } + reg = ®s[regno]; if (t == SRC_OP) { /* check whether register used as source operand can be read */ - if (regs[regno].type == NOT_INIT) { + if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ - if (regno != BPF_REG_FP) - return mark_reg_read(env, ®s[regno], - regs[regno].parent); + if (regno == BPF_REG_FP) + return 0; + + return mark_reg_read(env, reg, reg->parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - regs[regno].live |= REG_LIVE_WRITTEN; + reg->live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } -- cgit v1.2.3 From 2d87a0674bd60d855e4008e2d84f5b23d7cb9b7d Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 10 Apr 2019 11:14:19 +0200 Subject: timekeeping: Audit clock adjustments Emit an audit record whenever the system clock is changed (i.e. shifted by a non-zero offset) by a syscall from userspace. The syscalls than can (at the time of writing) trigger such record are: - settimeofday(2), stime(2), clock_settime(2) -- via do_settimeofday64() - adjtimex(2), clock_adjtime(2) -- via do_adjtimex() The new records have type AUDIT_TIME_INJOFFSET and contain the following fields: - sec -- the 'seconds' part of the offset - nsec -- the 'nanoseconds' part of the offset Example record (time was shifted backwards by ~15.875 seconds): type=TIME_INJOFFSET msg=audit(1530616049.652:13): sec=-16 nsec=124887145 The records of this type will be associated with the corresponding syscall records. Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Reviewed-by: Thomas Gleixner [PM: fixed a line width problem in __audit_tk_injoffset()] Signed-off-by: Paul Moore --- kernel/auditsc.c | 7 +++++++ kernel/time/timekeeping.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 51a2ceb3a1ca..3843495d0083 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2512,6 +2512,13 @@ void __audit_fanotify(unsigned int response) AUDIT_FANOTIFY, "resp=%u", response); } +void __audit_tk_injoffset(struct timespec64 offset) +{ + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, + "sec=%lli nsec=%li", + (long long)offset.tv_sec, offset.tv_nsec); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..3d24be4cd607 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -1250,6 +1251,9 @@ out: /* signal hrtimers about time change */ clock_was_set(); + if (!ret) + audit_tk_injoffset(ts_delta); + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -2322,6 +2326,8 @@ int do_adjtimex(struct __kernel_timex *txc) ret = timekeeping_inject_offset(&delta); if (ret) return ret; + + audit_tk_injoffset(delta); } ktime_get_real_ts64(&ts); -- cgit v1.2.3 From 7e8eda734d30de81d06a949c9bf9853c445ede4e Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 10 Apr 2019 11:14:20 +0200 Subject: ntp: Audit NTP parameters adjustment Emit an audit record every time selected NTP parameters are modified from userspace (via adjtimex(2) or clock_adjtime(2)). These parameters may be used to indirectly change system clock, and thus their modifications should be audited. Such events will now generate records of type AUDIT_TIME_ADJNTPVAL containing the following fields: - op -- which value was adjusted: - offset -- corresponding to the time_offset variable - freq -- corresponding to the time_freq variable - status -- corresponding to the time_status variable - adjust -- corresponding to the time_adjust variable - tick -- corresponding to the tick_usec variable - tai -- corresponding to the timekeeping's TAI offset - old -- the old value - new -- the new value Example records: type=TIME_ADJNTPVAL msg=audit(1530616044.507:7): op=status old=64 new=8256 type=TIME_ADJNTPVAL msg=audit(1530616044.511:11): op=freq old=0 new=49180377088000 The records of this type will be associated with the corresponding syscall records. An overview of parameter changes that can be done via do_adjtimex() (based on information from Miroslav Lichvar) and whether they are audited: __timekeeping_set_tai_offset() -- sets the offset from the International Atomic Time (AUDITED) NTP variables: time_offset -- can adjust the clock by up to 0.5 seconds per call and also speed it up or slow down by up to about 0.05% (43 seconds per day) (AUDITED) time_freq -- can speed up or slow down by up to about 0.05% (AUDITED) time_status -- can insert/delete leap seconds and it also enables/ disables synchronization of the hardware real-time clock (AUDITED) time_maxerror, time_esterror -- change error estimates used to inform userspace applications (NOT AUDITED) time_constant -- controls the speed of the clock adjustments that are made when time_offset is set (NOT AUDITED) time_adjust -- can temporarily speed up or slow down the clock by up to 0.05% (AUDITED) tick_usec -- a more extreme version of time_freq; can speed up or slow down the clock by up to 10% (AUDITED) Signed-off-by: Ondrej Mosnacek Reviewed-by: Richard Guy Briggs Reviewed-by: Thomas Gleixner Signed-off-by: Paul Moore --- kernel/auditsc.c | 22 ++++++++++++++++++++++ kernel/time/ntp.c | 22 +++++++++++++++++++--- kernel/time/ntp_internal.h | 4 +++- kernel/time/timekeeping.c | 7 ++++++- 4 files changed, 50 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3843495d0083..5371b59bde36 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2519,6 +2519,28 @@ void __audit_tk_injoffset(struct timespec64 offset) (long long)offset.tv_sec, offset.tv_nsec); } +static void audit_log_ntp_val(const struct audit_ntp_data *ad, + const char *op, enum audit_ntp_type type) +{ + const struct audit_ntp_val *val = &ad->vals[type]; + + if (val->newval == val->oldval) + return; + + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, + "op=%s old=%lli new=%lli", op, val->oldval, val->newval); +} + +void __audit_ntp_log(const struct audit_ntp_data *ad) +{ + audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); + audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); + audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); + audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); + audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); + audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..ac5555e25733 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -709,7 +710,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, * kernel time-keeping variables. used by xntpd. */ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai) + s32 *time_tai, struct audit_ntp_data *ad) { int result; @@ -720,14 +721,29 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; ntp_update_frequency(); + + audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); } txc->offset = save_adjust; } else { - /* If there are input parameters, then process them: */ - if (txc->modes) + if (txc->modes) { + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + process_adjtimex_modes(txc, time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 40e6122e634e..908ecaa65fc3 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,8 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, + const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d24be4cd607..f366f2fdf1b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2307,6 +2307,7 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; + struct audit_ntp_data ad; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; @@ -2330,13 +2331,15 @@ int do_adjtimex(struct __kernel_timex *txc) audit_tk_injoffset(delta); } + audit_ntp_init(&ad); + ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai); + ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); @@ -2347,6 +2350,8 @@ int do_adjtimex(struct __kernel_timex *txc) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + audit_ntp_log(&ad); + /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) timekeeping_advance(TK_ADV_FREQ); -- cgit v1.2.3 From 02a8c817a31606b6b37c2b755f6569903f44241e Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Sun, 14 Apr 2019 18:58:46 +0200 Subject: bpf: add map helper functions push, pop, peek in more BPF programs commit f1a2e44a3aec ("bpf: add queue and stack maps") introduced new BPF helper functions: - BPF_FUNC_map_push_elem - BPF_FUNC_map_pop_elem - BPF_FUNC_map_peek_elem but they were made available only for network BPF programs. This patch makes them available for tracepoint, cgroup and lirc programs. Signed-off-by: Alban Crequy Cc: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 6 ++++++ kernel/trace/bpf_trace.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e58a6c247f56..fcde0f7b2585 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -713,6 +713,12 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..91800be0c8eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -569,6 +569,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: -- cgit v1.2.3 From 0d306c31b2f77391dacdeaad4470c577f2aecc4f Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 16 Apr 2019 18:13:01 +0900 Subject: bpf: use BPF_CAST_CALL for casting bpf call verifier.c uses BPF_CAST_CALL for casting bpf call except at one place in jit_subprogs(). Let's use the macro for consistency. Signed-off-by: Prashant Bhole Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7220153c5b1..db301e9b5295 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7647,9 +7647,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; + insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - + __bpf_call_base; } /* we use the aux data to keep a list of the start addresses -- cgit v1.2.3 From 77361825bb01ecadf3ac8622e2e4dbc28806e858 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:32 +0200 Subject: bpf: cpumap use ptr_ring_consume_batched Move ptr_ring dequeue outside loop, that allocate SKBs and calls network stack, as these operations that can take some time. The ptr_ring is a communication channel between CPUs, where we want to reduce/limit any cacheline bouncing. Do a concentrated bulk dequeue via ptr_ring_consume_batched, to shorten the period and times the remote cacheline in ptr_ring is read Batch size 8 is both to (1) limit BH-disable period, and (2) consume one cacheline on 64-bit archs. After reducing the BH-disable section further then we can consider changing this, while still thinking about L1 cacheline size being active. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3c18260403dd..430103e182a0 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -240,6 +240,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +#define CPUMAP_BATCH 8 + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -252,8 +254,9 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_frame *xdpf; + unsigned int drops = 0, sched = 0; + void *frames[CPUMAP_BATCH]; + int i, n; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -269,14 +272,16 @@ static int cpu_map_kthread_run(void *data) sched = cond_resched(); } - /* Process packets in rcpu->queue */ - local_bh_disable(); /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { + n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + local_bh_disable(); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb; int ret; @@ -290,13 +295,9 @@ static int cpu_map_kthread_run(void *data) ret = netif_receive_skb_core(skb); if (ret == NET_RX_DROP) drops++; - - /* Limit BH-disable period */ - if (++processed == 8) - break; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); local_bh_enable(); /* resched point, may call do_softirq() */ } -- cgit v1.2.3 From 8f0504a97e1ba6b70e1c8b5a88255c280f263287 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:43 +0200 Subject: bpf: cpumap do bulk allocation of SKBs As cpumap now batch consume xdp_frame's from the ptr_ring, it knows how many SKBs it need to allocate. Thus, lets bulk allocate these SKBs via kmem_cache_alloc_bulk() API, and use the previously introduced function build_skb_around(). Notice that the flag __GFP_ZERO asks the slab/slub allocator to clear the memory for us. This does clear a larger area than needed, but my micro benchmarks on Intel CPUs show that this is slightly faster due to being a cacheline aligned area is cleared for the SKBs. (For SLUB allocator, there is a future optimization potential, because SKBs will with high probability originate from same page. If we can find/identify continuous memory areas then the Intel CPU memset rep stos will have a real performance gain.) Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 430103e182a0..732d6ced3987 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -160,12 +160,12 @@ static void cpu_map_kthread_stop(struct work_struct *work) } static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf) + struct xdp_frame *xdpf, + struct sk_buff *skb) { unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; - struct sk_buff *skb; /* Part of headroom was reserved to xdpf */ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; @@ -191,8 +191,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb(pkt_data_start, frame_size); - if (!skb) + skb = build_skb_around(skb, pkt_data_start, frame_size); + if (unlikely(!skb)) return NULL; skb_reserve(skb, hard_start_headroom); @@ -256,7 +256,9 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int drops = 0, sched = 0; void *frames[CPUMAP_BATCH]; - int i, n; + void *skbs[CPUMAP_BATCH]; + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -278,14 +280,20 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. */ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < n; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops = n; + } local_bh_disable(); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb; + struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf); + skb = cpu_map_build_skb(rcpu, xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; -- cgit v1.2.3 From 86d231459d6dc9094e70c35c3517f4ef860b2f1e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:48 +0200 Subject: bpf: cpumap memory prefetchw optimizations for struct page A lot of the performance gain comes from this patch. While analysing performance overhead it was found that the largest CPU stalls were caused when touching the struct page area. It is first read with a READ_ONCE from build_skb_around via page_is_pfmemalloc(), and when freed written by page_frag_free() call. Measurements show that the prefetchw (W) variant operation is needed to achieve the performance gain. We believe this optimization it two fold, first the W-variant saves one step in the cache-coherency protocol, and second it helps us to avoid the non-temporal prefetch HW optimizations and bring this into all cache-levels. It might be worth investigating if prefetch into L2 will have the same benefit. Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 732d6ced3987..cf727d77c6c6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -280,6 +280,18 @@ static int cpu_map_kthread_run(void *data) * consume side valid as no-resize allowed of queue. */ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + for (i = 0; i < n; i++) { + void *f = frames[i]; + struct page *page = virt_to_page(f); + + /* Bring struct page memory area to curr CPU. Read by + * build_skb_around via page_is_pfmemalloc(), and when + * freed written by page_frag_free call. + */ + prefetchw(page); + } + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); if (unlikely(m == 0)) { for (i = 0; i < n; i++) -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit. I use large bitmap. The input and output are the in form of list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use ratemask instead of hard-coded. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocated. Still is a pointer to it is needed because of the way proc_do_large_bitmap work. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c9ec050bcf46..599510a3355e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3326,6 +3326,11 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ @@ -3366,3 +3371,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); -- cgit v1.2.3 From 50943f3e136adfc421f9768d6ae09ba7b83aaefd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:01 -0700 Subject: cgroup: rename freezer.c into legacy_freezer.c Freezer.c will contain an implementation of cgroup v2 freezer, so let's rename the v1 freezer to avoid naming conflicts. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/freezer.c | 481 ----------------------------------------- kernel/cgroup/legacy_freezer.c | 481 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 482 insertions(+), 482 deletions(-) delete mode 100644 kernel/cgroup/freezer.c create mode 100644 kernel/cgroup/legacy_freezer.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..8d5689ca94b9 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c deleted file mode 100644 index 08236798d173..000000000000 --- a/kernel/cgroup/freezer.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. - */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ - - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; - -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; - -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct freezer, css) : NULL; -} - -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); -} - -static struct freezer *parent_freezer(struct freezer *freezer) -{ - return css_freezer(freezer->css.parent); -} - -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; - - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); - - return ret; -} - -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; - - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); - - return &freezer->css; -} - -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. - */ -static int freezer_css_online(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; -} - -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. - */ -static void freezer_css_offline(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); -} - -static void freezer_css_free(struct cgroup_subsys_state *css) -{ - kfree(css_freezer(css)); -} - -/* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. - * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. - */ -static void freezer_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. - * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } - } - - mutex_unlock(&freezer_mutex); -} - -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. - */ -static void freezer_fork(struct task_struct *task) -{ - struct freezer *freezer; - - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) - return; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. - */ -static void update_if_frozen(struct cgroup_subsys_state *css) -{ - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; - struct css_task_iter it; - struct task_struct *task; - - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; - - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); - - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); - - /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); - - while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); - } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); - css_task_iter_end(&it); -} - -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); -} - -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. - */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) -{ - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) - return; - - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } - } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; - - /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. - */ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); -} - -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - bool freeze; - - buf = strstrip(buf); - - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; - - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} - -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} - -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); - - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} - -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { + struct cgroup_subsys_state css; + unsigned int state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ + return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ + bool ret; + + rcu_read_lock(); + ret = task_freezer(task)->state & CGROUP_FREEZING; + rcu_read_unlock(); + + return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. + */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct freezer *parent = parent_freezer(freezer); + + mutex_lock(&freezer_mutex); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + mutex_unlock(&freezer_mutex); + return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + + mutex_lock(&freezer_mutex); + + if (freezer->state & CGROUP_FREEZING) + atomic_dec(&system_freezing_cnt); + + freezer->state = 0; + + mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. + */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? */ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, 0, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; -- cgit v1.2.3 From aade7f9efba098859681f8e88d81a5b44ad09b12 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:02 -0700 Subject: cgroup: implement __cgroup_task_count() helper The helper is identical to the existing cgroup_task_count() except it doesn't take the css_set_lock by itself, assuming that the caller does. Also, move cgroup_task_count() implementation into kernel/cgroup/cgroup.c, as there is nothing specific to cgroup v1. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 16 ---------------- kernel/cgroup/cgroup.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..02c001ffe2e2 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -240,6 +240,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..68ca5de7ec27 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -342,22 +342,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f219c195a9a5..3008ea684aa0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -593,6 +593,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; -- cgit v1.2.3 From 4dcabece4c3a9f9522127be12cc12cc120399b2f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:03 -0700 Subject: cgroup: protect cgroup->nr_(dying_)descendants by css_set_lock The number of descendant cgroups and the number of dying descendant cgroups are currently synchronized using the cgroup_mutex. The number of descendant cgroups will be required by the cgroup v2 freezer, which will use it to determine if a cgroup is frozen (depending on total number of descendants and number of frozen descendants). It's not always acceptable to grab the cgroup_mutex, especially from quite hot paths (e.g. exit()). To avoid this, let's additionally synchronize these counters using the css_set_lock. So, it's safe to read these counters with either cgroup_mutex or css_set_lock locked, and for changing both locks should be acquired. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo Cc: kernel-team@fb.com --- kernel/cgroup/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3008ea684aa0..786ceef2f222 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4811,9 +4811,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5031,12 +5033,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (tcgrp != cgrp) tcgrp->nr_descendants++; } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5321,10 +5325,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); -- cgit v1.2.3 From 76f969e8948d82e78e1bc4beb6b9465908e74873 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:04 -0700 Subject: cgroup: cgroup v2 freezer Cgroup v1 implements the freezer controller, which provides an ability to stop the workload in a cgroup and temporarily free up some resources (cpu, io, network bandwidth and, potentially, memory) for some other tasks. Cgroup v2 lacks this functionality. This patch implements freezer for cgroup v2. Cgroup v2 freezer tries to put tasks into a state similar to jobctl stop. This means that tasks can be killed, ptraced (using PTRACE_SEIZE*), and interrupted. It is possible to attach to a frozen task, get some information (e.g. read registers) and detach. It's also possible to migrate a frozen tasks to another cgroup. This differs cgroup v2 freezer from cgroup v1 freezer, which mostly tried to imitate the system-wide freezer. However uninterruptible sleep is fine when all tasks are going to be frozen (hibernation case), it's not the acceptable state for some subset of the system. Cgroup v2 freezer is not supporting freezing kthreads. If a non-root cgroup contains kthread, the cgroup still can be frozen, but the kthread will remain running, the cgroup will be shown as non-frozen, and the notification will not be delivered. * PTRACE_ATTACH is not working because non-fatal signal delivery is blocked in frozen state. There are some interface differences between cgroup v1 and cgroup v2 freezer too, which are required to conform the cgroup v2 interface design principles: 1) There is no separate controller, which has to be turned on: the functionality is always available and is represented by cgroup.freeze and cgroup.events cgroup control files. 2) The desired state is defined by the cgroup.freeze control file. Any hierarchical configuration is allowed. 3) The interface is asynchronous. The actual state is available using cgroup.events control file ("frozen" field). There are no dedicated transitional states. 4) It's allowed to make any changes with the cgroup hierarchy (create new cgroups, remove old cgroups, move tasks between cgroups) no matter if some cgroups are frozen. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo No-objection-from-me-by: Oleg Nesterov Cc: kernel-team@fb.com --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup.c | 110 ++++++++++++++++- kernel/cgroup/freezer.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 2 + kernel/signal.c | 70 ++++++++++- 5 files changed, 491 insertions(+), 10 deletions(-) create mode 100644 kernel/cgroup/freezer.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 8d5689ca94b9..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 786ceef2f222..6895464b54c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2435,8 +2435,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. + */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -3477,8 +3484,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3540,6 +3550,40 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) } #endif +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4683,6 +4727,12 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_freeze_show, + .write = cgroup_freeze_write, + }, { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, @@ -5033,12 +5083,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too. + */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } spin_unlock_irq(&css_set_lock); @@ -5329,6 +5396,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. + */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } spin_unlock_irq(&css_set_lock); @@ -5782,6 +5855,29 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + struct cgroup *cgrp; + + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + cgrp = cset->dfl_cgrp; + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. + */ + } + spin_unlock_irq(&css_set_lock); } @@ -5830,6 +5926,12 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; + + if (unlikely(cgroup_task_frozen(tsk))) + cgroup_freezer_frozen_exit(tsk); + else if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c new file mode 100644 index 000000000000..9d8cda478fc9 --- /dev/null +++ b/kernel/cgroup/freezer.c @@ -0,0 +1,317 @@ +//SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "cgroup-internal.h" + +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) +{ + int desc = 1; + + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. + */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + desc++; + } + } + } +} + +/* + * Revisit the cgroup frozen state. + * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) +{ + bool frozen; + + lockdep_assert_held(&css_set_lock); + + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); + + if (frozen) { + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; + + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? */ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; + + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); +} + +/* + * Increment cgroup's nr_frozen_tasks. + */ +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) +{ + cgrp->freezer.nr_frozen_tasks++; +} + +/* + * Decrement cgroup's nr_frozen_tasks. + */ +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) +{ + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); +} + +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) +{ + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); +} + +/* + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + * + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. + */ +void cgroup_leave_frozen(bool always_leave) +{ + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } + spin_unlock_irq(&css_set_lock); + + if (unlikely(current->frozen)) { + /* + * If the task remained in the frozen state, + * make sure it won't reach userspace without + * entering the signal handling loop. + */ + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + } +} + +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. + */ +static void cgroup_freeze_task(struct task_struct *task, bool freeze) +{ + unsigned long flags; + + /* If the task is about to die, don't bother with freezing it. */ + if (!lock_task_sighand(task, &flags)) + return; + + if (freeze) { + task->jobctl |= JOBCTL_TRAP_FREEZE; + signal_wake_up(task, false); + } else { + task->jobctl &= ~JOBCTL_TRAP_FREEZE; + wake_up_process(task); + } + + unlock_task_sighand(task, &flags); +} + +/* + * Freeze or unfreeze all tasks in the given cgroup. + */ +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + if (freeze) + set_bit(CGRP_FREEZE, &cgrp->flags); + else + clear_bit(CGRP_FREEZE, &cgrp->flags); + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, 0, &it); + while ((task = css_task_iter_next(&it))) { + /* + * Ignore kernel threads here. Freezing cgroups containing + * kthreads isn't supported. + */ + if (task->flags & PF_KTHREAD) + continue; + cgroup_freeze_task(task, freeze); + } + css_task_iter_end(&it); + + /* + * Cgroup state should be revisited here to cover empty leaf cgroups + * and cgroups which descendants are already in the desired state. + */ + spin_lock_irq(&css_set_lock); + if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); +} + +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups. + */ +void cgroup_freezer_migrate_task(struct task_struct *task, + struct cgroup *src, struct cgroup *dst) +{ + lockdep_assert_held(&css_set_lock); + + /* + * Kernel threads are not supposed to be frozen at all. + */ + if (task->flags & PF_KTHREAD) + return; + + /* + * Adjust counters of freezing and frozen tasks. + * Note, that if the task is frozen, but the destination cgroup is not + * frozen, we bump both counters to keep them balanced. + */ + if (task->frozen) { + cgroup_inc_frozen_cnt(dst); + cgroup_dec_frozen_cnt(src); + } + cgroup_update_frozen(dst); + cgroup_update_frozen(src); + + /* + * Force the task to the desired state. + */ + cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); +} + +void cgroup_freezer_frozen_exit(struct task_struct *task) +{ + struct cgroup *cgrp = task_dfl_cgroup(task); + + lockdep_assert_held(&css_set_lock); + + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); +} + +void cgroup_freeze(struct cgroup *cgrp, bool freeze) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + bool applied = false; + + lockdep_assert_held(&cgroup_mutex); + + /* + * Nothing changed? Just exit. + */ + if (cgrp->freezer.freeze == freeze) + return; + + cgrp->freezer.freeze = freeze; + + /* + * Propagate changes downwards the cgroup tree. + */ + css_for_each_descendant_pre(css, &cgrp->self) { + dsct = css->cgroup; + + if (cgroup_is_dead(dsct)) + continue; + + if (freeze) { + dsct->freezer.e_freeze++; + /* + * Already frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 1) + continue; + } else { + dsct->freezer.e_freeze--; + /* + * Still frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 0) + continue; + + WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + } + + /* + * Do change actual state: freeze or unfreeze. + */ + cgroup_do_freeze(dsct, freeze); + applied = true; + } + + /* + * Even if the actual state hasn't changed, let's notify a user. + * The state can be enforced by an ancestor cgroup: the cgroup + * can already be in the desired state or it can be locked in the + * opposite state, so that the transition will never happen. + * In both cases it's better to notify a user, that there is + * nothing to wait for. + */ + if (!applied) + cgroup_file_notify(&cgrp->events_file); +} diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..8097a0cce4db 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,9 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); + cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); + cgroup_leave_frozen(false); freezer_count(); if (killed) { diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..095e0fc57b25 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -43,6 +43,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -146,9 +147,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) static bool recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & JOBCTL_PENDING_MASK) || + if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { + PENDING(&t->signal->shared_pending, &t->blocked) || + cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } @@ -2108,6 +2110,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); + cgroup_enter_frozen(); freezable_schedule(); } else { /* @@ -2286,6 +2289,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ + cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2332,6 +2336,43 @@ static void do_jobctl_trap(void) } } +/** + * do_freezer_trap - handle the freezer jobctl trap + * + * Puts the task into frozen state, if only the task is not about to quit. + * In this case it drops JOBCTL_TRAP_FREEZE. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, + * which is always released before returning. + */ +static void do_freezer_trap(void) + __releases(¤t->sighand->siglock) +{ + /* + * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, + * let's make another loop to give it a chance to be handled. + * In any case, we'll return back. + */ + if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != + JOBCTL_TRAP_FREEZE) { + spin_unlock_irq(¤t->sighand->siglock); + return; + } + + /* + * Now we're sure that there is no pending fatal signal and no + * pending traps. Clear TIF_SIGPENDING to not get out of schedule() + * immediately (if there is a non-fatal signal pending), and + * put the task into sleep. + */ + __set_current_state(TASK_INTERRUPTIBLE); + clear_thread_flag(TIF_SIGPENDING); + spin_unlock_irq(¤t->sighand->siglock); + cgroup_enter_frozen(); + freezable_schedule(); +} + static int ptrace_signal(int signr, kernel_siginfo_t *info) { /* @@ -2442,6 +2483,10 @@ relock: ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); recalc_sigpending(); + current->jobctl &= ~JOBCTL_TRAP_FREEZE; + spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); goto fatal; } @@ -2452,9 +2497,24 @@ relock: do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { - do_jobctl_trap(); + if (unlikely(current->jobctl & + (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { + if (current->jobctl & JOBCTL_TRAP_MASK) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + } else if (current->jobctl & JOBCTL_TRAP_FREEZE) + do_freezer_trap(); + + goto relock; + } + + /* + * If the task is leaving the frozen state, let's update + * cgroup counters and reset the frozen bit. + */ + if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); + cgroup_leave_frozen(true); goto relock; } @@ -2548,8 +2608,8 @@ relock: continue; } - fatal: spin_unlock_irq(&sighand->siglock); + fatal: /* * Anything else is fatal, maybe with a core dump. -- cgit v1.2.3 From 712e35178754bbb785d00d5fcf5abaf32699bf11 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:07 -0700 Subject: cgroup: make TRACE_CGROUP_PATH irq-safe To use the TRACE_CGROUP_PATH() macro with css_set_lock locked, let's make the macro irq-safe. It's necessary in order to trace cgroup freezer state transitions (frozen/not frozen), which are happening with css_set_lock locked. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 02c001ffe2e2..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) -- cgit v1.2.3 From 4c476d8cff48853645abc822154aaad208faebcc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 19 Apr 2019 10:03:08 -0700 Subject: cgroup: add tracing points for cgroup v2 freezer Add cgroup:cgroup_freeze and cgroup:cgroup_unfreeze events, which are using the existing cgroup tracing infrastructure. Add the cgroup_event event class, which is similar to the cgroup class, but contains an additional integer field to store a new value (the level field is dropped). Also add two tracing events: cgroup_notify_populated and cgroup_notify_frozen, which are raised in a generic way using the TRACE_CGROUP_PATH() macro. This allows to trace cgroup state transitions and is generally helpful for debugging the cgroup freezer code. Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 ++ kernel/cgroup/freezer.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6895464b54c6..57edcf398d71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -816,6 +816,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 9d8cda478fc9..3bfbb3c8baf3 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -6,6 +6,8 @@ #include "cgroup-internal.h" +#include + /* * Propagate the cgroup frozen state upwards by the cgroup tree. */ @@ -28,6 +30,7 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) cgrp->nr_descendants) { set_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); desc++; } } else { @@ -35,6 +38,7 @@ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) if (test_bit(CGRP_FROZEN, &cgrp->flags)) { clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); desc++; } } @@ -73,6 +77,7 @@ void cgroup_update_frozen(struct cgroup *cgrp) clear_bit(CGRP_FROZEN, &cgrp->flags); } cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); /* Update the state of ancestor cgroups. */ cgroup_propagate_frozen(cgrp, frozen); @@ -189,6 +194,11 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) clear_bit(CGRP_FREEZE, &cgrp->flags); spin_unlock_irq(&css_set_lock); + if (freeze) + TRACE_CGROUP_PATH(freeze, cgrp); + else + TRACE_CGROUP_PATH(unfreeze, cgrp); + css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* @@ -312,6 +322,9 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) * In both cases it's better to notify a user, that there is * nothing to wait for. */ - if (!applied) + if (!applied) { + TRACE_CGROUP_PATH(notify_frozen, cgrp, + test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); + } } -- cgit v1.2.3 From 52fde6e70cccc2fcf3f39fed0d0392960e2c2b03 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 21 Apr 2019 19:40:44 -0400 Subject: function_graph: Have selftest also emulate tr->reset() as it did with tr->init() The function_graph boot up self test emulates the tr->init() function in order to add a wrapper around the function graph tracer entry code to test for lock ups and such. But it does not emulate the tr->reset(), and just calls the function_graph tracer tr->reset() function which will use its own fgraph_ops to unregister function tracing with. As the fgraph_ops is becoming more meaningful with the register_ftrace_graph() and unregister_ftrace_graph() functions, the two need to be the same. The emulated tr->init() uses its own fgraph_ops descriptor, which means the unregister_ftrace_graph() must use the same ftrace_ops, which the selftest currently does not do. By emulating the tr->reset() as the selftest does with the tr->init() it will be able to pass the same fgraph_ops descriptor to the unregister_ftrace_graph() as it did with the register_ftrace_graph(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); + /* Need to also simulate the tr->reset to remove this fgraph_ops */ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(&fgraph_ops); + tracing_start(); if (!ret && !count) { -- cgit v1.2.3 From 70c4cf17e445264453bc5323db3e50aa0ac9e81f Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Fri, 19 Apr 2019 20:49:29 -0500 Subject: audit: fix a memory leak bug In audit_rule_change(), audit_data_to_entry() is firstly invoked to translate the payload data to the kernel's rule representation. In audit_data_to_entry(), depending on the audit field type, an audit tree may be created in audit_make_tree(), which eventually invokes kmalloc() to allocate the tree. Since this tree is a temporary tree, it will be then freed in the following execution, e.g., audit_add_rule() if the message type is AUDIT_ADD_RULE or audit_del_rule() if the message type is AUDIT_DEL_RULE. However, if the message type is neither AUDIT_ADD_RULE nor AUDIT_DEL_RULE, i.e., the default case of the switch statement, this temporary tree is not freed. To fix this issue, only allocate the tree when the type is AUDIT_ADD_RULE or AUDIT_DEL_RULE. Signed-off-by: Wenwen Wang Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditfilter.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2c3c2f349b23..1bc6410413e6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1114,22 +1114,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) int err = 0; struct audit_entry *entry; - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - switch (type) { case AUDIT_ADD_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - err = -EINVAL; WARN_ON(1); + return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { -- cgit v1.2.3 From 7df737e991069d75eec1ded1c8b37e81b8c54df9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:54 -0700 Subject: bpf: remove global variables Move three global variables protected by bpf_verifier_lock into 'struct bpf_verifier_env' to allow parallel verification. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db301e9b5295..5f0eb5bd5589 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5369,10 +5369,6 @@ enum { #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) -static int *insn_stack; /* stack of insns to process */ -static int cur_stack; /* current stack index */ -static int *insn_state; - /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -5380,6 +5376,9 @@ static int *insn_state; */ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -5400,9 +5399,9 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; - if (cur_stack >= env->prog->len) + if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; - insn_stack[cur_stack++] = w; + insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { verbose_linfo(env, t, "%d: ", t); @@ -5426,14 +5425,15 @@ static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; int ret = 0; int i, t; - insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; @@ -5441,12 +5441,12 @@ static int check_cfg(struct bpf_verifier_env *env) insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ - cur_stack = 1; + env->cfg.cur_stack = 1; peek_stack: - if (cur_stack == 0) + if (env->cfg.cur_stack == 0) goto check_state; - t = insn_stack[cur_stack - 1]; + t = insn_stack[env->cfg.cur_stack - 1]; if (BPF_CLASS(insns[t].code) == BPF_JMP || BPF_CLASS(insns[t].code) == BPF_JMP32) { @@ -5515,7 +5515,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; - if (cur_stack-- <= 0) { + if (env->cfg.cur_stack-- <= 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; @@ -5535,6 +5535,7 @@ check_state: err_free: kvfree(insn_state); kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } -- cgit v1.2.3 From 45a73c17bfb92c3ceebedc80a750ef2c2931c26b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:55 -0700 Subject: bpf: drop bpf_verifier_lock Drop bpf_verifier_lock for root to avoid being DoS-ed by unprivileged. The BPF verifier is now fully parallel. All unpriv users are still serialized by bpf_verifier_lock to avoid exhausting kernel memory by running N parallel verifications. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f0eb5bd5589..423f242a5efb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8132,9 +8132,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + is_priv = capable(CAP_SYS_ADMIN); /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); + if (!is_priv) + mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output @@ -8157,7 +8159,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - is_priv = capable(CAP_SYS_ADMIN); env->allow_ptr_leaks = is_priv; ret = replace_map_fd_with_map_ptr(env); @@ -8270,7 +8271,8 @@ err_release_maps: release_maps(env); *prog = env->prog; err_unlock: - mutex_unlock(&bpf_verifier_lock); + if (!is_priv) + mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: kfree(env); -- cgit v1.2.3 From 148a97d5a02a62f81b5d6176f871c94a65e1f3af Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Apr 2019 17:24:37 +0300 Subject: dma-mapping: remove an unnecessary NULL check We already dereferenced "dev" when we called get_dma_ops() so this NULL check is too late. We're not supposed to pass NULL "dev" pointers to dma_alloc_attrs(). Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 685a53f2a793..f7afdadb6770 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -244,7 +244,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; - WARN_ON_ONCE(dev && !dev->coherent_dma_mask); + WARN_ON_ONCE(!dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; -- cgit v1.2.3 From 52ba92f5882adf1ee785c4c5ef23491948917fcd Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:41 -0400 Subject: irqdesc: Replace irq_kobj_type's default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace irq_kobj_type's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create irq_groups. This patch was tested by verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Reviewed-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/irqdesc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 13539e12cd80..bbec57bda666 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = { &actions_attr.attr, NULL }; +ATTRIBUTE_GROUPS(irq); static struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = irq_attrs, + .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) -- cgit v1.2.3 From 2064fbc779d43c5ac38e20a4b4979e365f87349f Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:47 -0400 Subject: padata: Replace padata_attr_type default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace padata_attr_type's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create padata_default_groups. This patch was tested by loading the pcrypt module and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Signed-off-by: Greg Kroah-Hartman --- kernel/padata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { ¶llel_cpumask_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { static struct kobj_type padata_attr_type = { .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, + .default_groups = padata_default_groups, .release = padata_sysfs_release, }; -- cgit v1.2.3 From 9782adeb3d9d6e33fc52392031b8e00270515442 Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:53 -0400 Subject: cpufreq: schedutil: Replace default_attrs field with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace sugov_tunables_ktype's default_attrs field with default groups. Change "sugov_attributes" to "sugov_attrs" and use the ATTRIBUTE_GROUPS macro to create sugov_groups. This patch was tested by setting the scaling governor to schedutil and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Acked-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/sched/cpufreq_schedutil.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5c41ea367422..148b60c8993d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -598,13 +598,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; -- cgit v1.2.3 From 70283454c918f1d65de0ec50c45ef592d781bcae Mon Sep 17 00:00:00 2001 From: Kimberly Brown Date: Mon, 1 Apr 2019 22:51:58 -0400 Subject: livepatch: Replace klp_ktype_patch's default_attrs with groups The kobj_type default_attrs field is being replaced by the default_groups field. Replace klp_ktype_patch's default_attrs field with default_groups and use the ATTRIBUTE_GROUPS macro to create klp_patch_groups. This patch was tested by loading the livepatch-sample module and verifying that the sysfs files for the attributes in the default groups were created. Signed-off-by: Kimberly Brown Acked-by: Jiri Kosina Acked-by: Miroslav Benes Acked-by: Petr Mladek Signed-off-by: Greg Kroah-Hartman --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..34a8338657d2 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -419,6 +419,7 @@ static struct attribute *klp_patch_attrs[] = { &force_kobj_attr.attr, NULL }; +ATTRIBUTE_GROUPS(klp_patch); static void klp_free_object_dynamic(struct klp_object *obj) { @@ -546,7 +547,7 @@ static void klp_kobj_release_patch(struct kobject *kobj) static struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = klp_patch_attrs, + .default_groups = klp_patch_groups, }; static void klp_kobj_release_object(struct kobject *kobj) -- cgit v1.2.3 From 118c8e9ae629d35fa9b3d27a7b9d59298b1b4183 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Apr 2019 14:37:23 -0700 Subject: bpf: support BPF_PROG_QUERY for BPF_FLOW_DISSECTOR attach_type target_fd is target namespace. If there is a flow dissector BPF program attached to that namespace, its (single) id is returned. v5: * drop net ref right after rcu unlock (Daniel Borkmann) v4: * add missing put_net (Jann Horn) v3: * add missing inline to skb_flow_dissector_prog_query static def (kbuild test robot ) v2: * don't sleep in rcu critical section (Jakub Kicinski) * check input prog_cnt (exit early) Cc: Jann Horn Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 92c9b8a32b50..b0de49598341 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2009,6 +2009,8 @@ static int bpf_prog_query(const union bpf_attr *attr, break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } -- cgit v1.2.3 From e43e9c339a78a0978f4ce473f645cedc05e6a57c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 13:51:03 -0400 Subject: fsnotify: switch send_to_group() and ->handle_event to const struct qstr * note that conditions surrounding accesses to dname in audit_watch_handle_event() and audit_mark_handle_event() guarantee that dname won't have been NULL. Signed-off-by: Al Viro --- kernel/audit_fsnotify.c | 4 ++-- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 37ae95cfb7f4..fb241805569c 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -164,7 +164,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); @@ -188,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { - if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) + if (audit_compare_dname_path(dname->name, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index abfb112f26aa..e49c912f862d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1040,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie, + const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e8d1adeb2223..3c12fd5b680e 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -482,7 +482,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); @@ -507,9 +507,9 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + audit_update_watch(parent, dname->name, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + audit_update_watch(parent, dname->name, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); -- cgit v1.2.3 From 6921d4ebe418e7cce9f65c1f38c93ea82a1f546c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 14:09:49 -0400 Subject: audit_update_watch(): switch to const struct qstr * Signed-off-by: Al Viro --- kernel/audit_watch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3c12fd5b680e..d832ce9df065 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -255,18 +255,19 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, + const struct qstr *dname, dev_t dev, unsigned long ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; struct audit_entry *oentry, *nentry; + const unsigned char *name = dname->name; mutex_lock(&audit_filter_mutex); /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path, + if (audit_compare_dname_path(name, owatch->path, AUDIT_NAME_FULL)) continue; @@ -507,9 +508,9 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname->name, inode->i_sb->s_dev, inode->i_ino, 0); + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname->name, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); -- cgit v1.2.3 From 9df1c28bb75217b244257152ab7d788bb2a386d0 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:47 -0700 Subject: bpf: add writable context for raw tracepoints This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature. The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter. Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++-- kernel/bpf/verifier.c | 31 +++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 24 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b0de49598341..ae141e745f92 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1789,12 +1789,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 423f242a5efb..2ef442c62c0e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -405,6 +405,7 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", }; static char slot_type_char[] = { @@ -1993,6 +1994,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -2137,6 +2164,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 91800be0c8eb..8607aba1d882 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -915,6 +915,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1204,6 +1225,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } -- cgit v1.2.3 From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). When bpf prog needs to store data for a sk, the current practice is to define a map with the usual 4-tuples (src/dst ip/port) as the key. If multiple bpf progs require to store different sk data, multiple maps have to be defined. Hence, wasting memory to store the duplicated keys (i.e. 4 tuples here) in each of the bpf map. [ The smallest key could be the sk pointer itself which requires some enhancement in the verifier and it is a separate topic. ] Also, the bpf prog needs to clean up the elem when sk is freed. Otherwise, the bpf map will become full and un-usable quickly. The sk-free tracking currently could be done during sk state transition (e.g. BPF_SOCK_OPS_STATE_CB). The size of the map needs to be predefined which then usually ended-up with an over-provisioned map in production. Even the map was re-sizable, while the sk naturally come and go away already, this potential re-size operation is arguably redundant if the data can be directly connected to the sk itself instead of proxy-ing through a bpf map. This patch introduces sk->sk_bpf_storage to provide local storage space at sk for bpf prog to use. The space will be allocated when the first bpf prog has created data for this particular sk. The design optimizes the bpf prog's lookup (and then optionally followed by an inline update). bpf_spin_lock should be used if the inline update needs to be protected. BPF_MAP_TYPE_SK_STORAGE: ----------------------- To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can be created to fit different bpf progs' needs. The map enforces BTF to allow printing the sk-local-storage during a system-wise sk dump (e.g. "ss -ta") in the future. The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete a "sk-local-storage" data from a particular sk. Think of the map as a meta-data (or "type") of a "sk-local-storage". This particular "type" of "sk-local-storage" data can then be stored in any sk. The main purposes of this map are mostly: 1. Define the size of a "sk-local-storage" type. 2. Provide a similar syscall userspace API as the map (e.g. lookup/update, map-id, map-btf...etc.) 3. Keep track of all sk's storages of this "type" and clean them up when the map is freed. sk->sk_bpf_storage: ------------------ The main lookup/update/delete is done on sk->sk_bpf_storage (which is a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is now used as the "key" to search on the sk_storage->list. The "map" pointer is actually serving as the "type" of the "sk-local-storage" that is being requested. To allow very fast lookup, it should be as fast as looking up an array at a stable-offset. At the same time, it is not ideal to set a hard limit on the number of sk-local-storage "type" that the system can have. Hence, this patch takes a cache approach. The last search result from sk_storage->list is cached in sk_storage->cache[] which is a stable sized array. Each "sk-local-storage" type has a stable offset to the cache[] array. In the future, a map's flag could be introduced to do cache opt-out/enforcement if it became necessary. The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs can share map. On the program side, having a few bpf_progs running in the networking hotpath is already a lot. The bpf_prog should have already consolidated the existing sock-key-ed map usage to minimize the map lookup penalty. 16 has enough runway to grow. All sk-local-storage data will be removed from sk->sk_bpf_storage during sk destruction. bpf_sk_storage_get() and bpf_sk_storage_delete(): ------------------------------------------------ Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog needs to use the new helper bpf_sk_storage_get() and bpf_sk_storage_delete(). The verifier can then enforce the ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to "create" new elem if one does not exist in the sk. It is done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together, it has eliminated the potential use cases for an equivalent bpf_map_update_elem() API (for bpf_prog) in this patch. Misc notes: ---------- 1. map_get_next_key is not supported. From the userspace syscall perspective, the map has the socket fd as the key while the map can be shared by pinned-file or map-id. Since btf is enforced, the existing "ss" could be enhanced to pretty print the local-storage. Supporting a kernel defined btf with 4 tuples as the return key could be explored later also. 2. The sk->sk_lock cannot be acquired. Atomic operations is used instead. e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please refer to the source code comments for the details in synchronization cases and considerations. 3. The mem is charged to the sk->sk_omem_alloc as the sk filter does. Benchmark: --------- Here is the benchmark data collected by turning on the "kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested: One bpf prog with the usual bpf hashmap (max_entries = 8192) with the sk ptr as the key. (verifier is modified to support sk ptr as the key That should have shortened the key lookup time.) Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE. Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for each egress skb and then bump the cnt. netperf is used to drive data with 4096 connected UDP sockets. BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 27 ++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae141e745f92..ad3ccf82f31d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -526,7 +526,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EACCES; if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ef442c62c0e..271717246af3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2543,10 +2543,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2578,6 +2583,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } meta->ref_obj_id = reg->ref_obj_id; } + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -2635,6 +2644,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2784,6 +2795,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2847,6 +2863,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } -- cgit v1.2.3 From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:06 +0200 Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most netlink based interfaces (including recently added ones) are still not setting it in kernel generated messages. Without the flag, message parsers not aware of attribute semantics (e.g. wireshark dissector or libmnl's mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display the structure of their contents. Unfortunately we cannot just add the flag everywhere as there may be userspace applications which check nlattr::nla_type directly rather than through a helper masking out the flags. Therefore the patch renames nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start() as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually are rewritten to use nla_nest_start(). Except for changes in include/net/netlink.h, the patch was generated using this semantic patch: @@ expression E1, E2; @@ -nla_nest_start(E1, E2) +nla_nest_start_noflag(E1, E2) @@ expression E1, E2; @@ -nla_nest_start_noflag(E1, E2 | NLA_F_NESTED) +nla_nest_start(E1, E2) Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 1b942a7caf26..ef4f9cd980fd 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -375,7 +375,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - na = nla_nest_start(skb, aggr); + na = nla_nest_start_noflag(skb, aggr); if (!na) goto err; -- cgit v1.2.3 From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:28 +0200 Subject: netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ef4f9cd980fd..0e347f1c7800 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -677,8 +677,9 @@ static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return -EINVAL; } - return nlmsg_validate(info->nlhdr, GENL_HDRLEN, TASKSTATS_CMD_ATTR_MAX, - policy, info->extack); + return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, + TASKSTATS_CMD_ATTR_MAX, policy, + info->extack); } static struct genl_family family __ro_after_init = { -- cgit v1.2.3 From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:31 +0200 Subject: genetlink: optionally validate strictly/dumps Add options to strictly validate messages and dump messages, sometimes perhaps validating dump messages non-strictly may be required, so add an option for that as well. Since none of this can really be applied to existing commands, set the options everwhere using the following spatch: @@ identifier ops; expression X; @@ struct genl_ops ops[] = { ..., { .cmd = X, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ... }, ... }; For new commands one should just not copy the .validate 'opt-out' flags and thus get strict validation. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0e347f1c7800..5f852b8f59f7 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -649,12 +649,14 @@ err: static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, /* policy enforced later */ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, /* policy enforced later */ .flags = GENL_CMD_CAP_HASPOL, -- cgit v1.2.3 From 795d673af1afae8146ac3070a2d77cfae5287c43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Apr 2019 14:11:05 -0400 Subject: audit_compare_dname_path(): switch to const struct qstr * Signed-off-by: Al Viro --- kernel/audit.h | 2 +- kernel/audit_fsnotify.c | 2 +- kernel/audit_watch.c | 3 +-- kernel/auditfilter.c | 6 +++--- kernel/auditsc.c | 4 ++-- 5 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index 958d5b8fc1b3..2071725a999f 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -231,7 +231,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, int plen); +extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen); extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index fb241805569c..b5737b826951 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -188,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { - if (audit_compare_dname_path(dname->name, audit_mark->path, AUDIT_NAME_FULL)) + if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) return 0; audit_update_mark(audit_mark, inode); } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d832ce9df065..b50c574223fa 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -261,13 +261,12 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; struct audit_entry *oentry, *nentry; - const unsigned char *name = dname->name; mutex_lock(&audit_filter_mutex); /* Run all of the watches on this parent looking for the one that * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(name, owatch->path, + if (audit_compare_dname_path(dname, owatch->path, AUDIT_NAME_FULL)) continue; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 63f8b3f26fab..f9fff93c3351 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1290,12 +1290,12 @@ int parent_len(const char *path) * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. */ -int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; - dlen = strlen(dname); + dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; @@ -1306,7 +1306,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) p = path + parentlen; - return strncmp(p, dname, dlen); + return strncmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d1eab1d4a930..92d0ae63febd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2045,7 +2045,7 @@ void __audit_inode_child(struct inode *parent, { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); - const char *dname = dentry->d_name.name; + const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; @@ -2099,7 +2099,7 @@ void __audit_inode_child(struct inode *parent, (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (!strcmp(dname, n->name->name) || + if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : -- cgit v1.2.3 From 31adf2308f33dcae59009019675224be0978bc70 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 24 Apr 2019 10:55:48 +0200 Subject: livepatch: Convert error about unsupported reliable stacktrace into a warning The commit d0807da78e11d46f ("livepatch: Remove immediate feature") caused that any livepatch was refused when reliable stacktraces were not supported on the given architecture. The limitation is too strong. User space processes are safely migrated even when entering or leaving the kernel. Kthreads transition would need to get forced. But it is safe when: + The livepatch does not change the semantic of the code. + Callbacks do not depend on a safely finished transition. Suggested-by: Josh Poimboeuf Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Reviewed-by: Kamalesh Babulal Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..14f33ab6c583 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1003,11 +1003,10 @@ int klp_enable_patch(struct klp_patch *patch) return -ENODEV; if (!klp_have_reliable_stack()) { - pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -EOPNOTSUPP; + pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); + pr_warn("The livepatch transition may never complete.\n"); } - mutex_lock(&klp_mutex); ret = klp_init_patch_early(patch); -- cgit v1.2.3 From 08970ecf744e09837bb6620c95406710f4c81ae2 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 18 Apr 2019 16:54:01 +0100 Subject: irq/irqdomain: Fix typo in the comment on top of __irq_domain_alloc_irqs() The word 'number' has been misspelt in the comment on top of _irq_domain_alloc_irqs(). Signed-off-by: Julien Grall Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9ed29e4a7dbf..a453e229f99c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from - * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument -- cgit v1.2.3 From 43d8ce9d65a54846d378545770991e65838981e0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 26 Apr 2019 15:04:29 -0400 Subject: Provide in-kernel headers to make extending kernel easier Introduce in-kernel headers which are made available as an archive through proc (/proc/kheaders.tar.xz file). This archive makes it possible to run eBPF and other tracing programs that need to extend the kernel for tracing purposes without any dependency on the file system having headers. A github PR is sent for the corresponding BCC patch at: https://github.com/iovisor/bcc/pull/2312 On Android and embedded systems, it is common to switch kernels but not have kernel headers available on the file system. Further once a different kernel is booted, any headers stored on the file system will no longer be useful. This is an issue even well known to distros. By storing the headers as a compressed archive within the kernel, we can avoid these issues that have been a hindrance for a long time. The best way to use this feature is by building it in. Several users have a need for this, when they switch debug kernels, they do not want to update the filesystem or worry about it where to store the headers on it. However, the feature is also buildable as a module in case the user desires it not being part of the kernel image. This makes it possible to load and unload the headers from memory on demand. A tracing program can load the module, do its operations, and then unload the module to save kernel memory. The total memory needed is 3.3MB. By having the archive available at a fixed location independent of filesystem dependencies and conventions, all debugging tools can directly refer to the fixed location for the archive, without concerning with where the headers on a typical filesystem which significantly simplifies tooling that needs kernel headers. The code to read the headers is based on /proc/config.gz code and uses the same technique to embed the headers. Other approaches were discussed such as having an in-memory mountable filesystem, but that has drawbacks such as requiring an in-kernel xz decompressor which we don't have today, and requiring usage of 42 MB of kernel memory to host the decompressed headers at anytime. Also this approach is simpler than such approaches. Reviewed-by: Masahiro Yamada Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/.gitignore | 1 + kernel/Makefile | 10 ++++++ kernel/gen_ikh_data.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 74 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 174 insertions(+) create mode 100755 kernel/gen_ikh_data.sh create mode 100644 kernel/kheaders.c (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index 6e699100872f..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,5 +1,6 @@ # # Generated files # +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..12399614c350 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -121,3 +122,12 @@ $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) + +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz + +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include +#include +#include +#include +#include + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. + */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + return simple_read_from_buffer(buf, len, offset, + &kernel_headers_data, + &kernel_headers_data_end - + &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { + .read = ikheaders_read_current, + .llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current headers file */ + entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, + &ikheaders_file_ops); + if (!entry) + return -ENOMEM; + + proc_set_size(entry, + &kernel_headers_data_end - + &kernel_headers_data); + return 0; +} + +static void __exit ikheaders_cleanup(void) +{ + remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); -- cgit v1.2.3 From 2bd1298ac17777525a41c8425521f569e412df14 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Tue, 30 Apr 2019 15:42:22 +0530 Subject: genirq: Introduce irq_chip_{request,release}_resource_parent() apis Introduce irq_chip_{request,release}_resource_parent() apis so that these can be used in hierarchical irqchips. Signed-off-by: Lokesh Vutla Signed-off-by: Marc Zyngier --- kernel/irq/chip.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 51128bea3846..29d6c7d070b4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1459,6 +1459,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); + +/** + * irq_chip_request_resources_parent - Request resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +int irq_chip_request_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + + if (data->chip->irq_request_resources) + return data->chip->irq_request_resources(data); + + return -ENOSYS; +} +EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); + +/** + * irq_chip_release_resources_parent - Release resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_release_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_release_resources) + data->chip->irq_release_resources(data); +} +EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); #endif /** -- cgit v1.2.3 From 524845ff9c473d315468fa3b54054a7e6b2d95cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Apr 2019 22:31:29 -0400 Subject: bpf: switch to ->free_inode() Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Al Viro --- kernel/bpf/inode.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4a8f390a2b82..bc53e5b20ddc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -566,9 +566,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void bpf_destroy_inode_deferred(struct rcu_head *head) +static void bpf_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); enum bpf_type type; if (S_ISLNK(inode->i_mode)) @@ -578,16 +577,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head) free_inode_nonrcu(inode); } -static void bpf_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); -} - static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { -- cgit v1.2.3 From a5d5092c9285f6c8937b56f9c6ff2b22d818fc25 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 26 Feb 2019 13:16:14 -0600 Subject: gdbstub: mark expected switch fall-throughs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warnings: kernel/debug/gdbstub.c: In function ‘gdb_serial_stub’: kernel/debug/gdbstub.c:1031:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (remcom_in_buffer[1] == '\0') { ^ kernel/debug/gdbstub.c:1036:3: note: here case 'C': /* Exception passing */ ^~~~ kernel/debug/gdbstub.c:1040:7: warning: this statement may fall through [-Wimplicit-fallthrough=] if (tmp == 0) ^ kernel/debug/gdbstub.c:1043:3: note: here case 'c': /* Continue packet */ ^~~~ kernel/debug/gdbstub.c:1050:4: warning: this statement may fall through [-Wimplicit-fallthrough=] dbg_activate_sw_breakpoints(); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/debug/gdbstub.c:1052:3: note: here default: ^~~~~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 Notice that, in this particular case, the code comment is modified in accordance with what GCC is expecting to find. This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R. Silva Acked-by: Jason Wessel Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..9f267b8905b4 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif + /* Fall through */ case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through on tmp < 0 */ + /* Fall through - on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through to default processing */ + /* Fall through - to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, -- cgit v1.2.3 From 4cc168eaf3b67d76547fb420c22abe22a3c86003 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Apr 2019 11:33:42 -0500 Subject: gdbstub: Replace strcpy() by strscpy() The strcpy() function is being deprecated. Replace it by the safer strscpy() and fix the following Coverity warning: "You might overrun the 1024-character fixed-size string remcom_in_buffer by copying cmd without checking the length." Addresses-Coverity-ID: 138999 ("Copy into fixed size buffer") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Thompson --- kernel/debug/gdbstub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 9f267b8905b4..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1095,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) return error; case 's': case 'c': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); return 0; case '$': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); gdbstub_prev_in_buf_pos = 0; return 0; -- cgit v1.2.3 From 9b555c4d784c468b4167eef9ab621b5203e4f479 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 22 Apr 2019 11:21:06 -0500 Subject: kdb: kdb_support: replace strcpy() by strscpy() The strcpy() function is being deprecated. Replace it by the safer strscpy() and fix the following Coverity warning: "You might overrun the 129-character fixed-size string ks_namebuf by copying name without checking the length." Addresses-Coverity-ID: 138995 ("Copy into fixed size buffer") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_support.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) while ((name = kdb_walk_kallsyms(&pos))) { if (strncmp(name, prefix_name, prefix_len) == 0) { - strcpy(ks_namebuf, name); + strscpy(ks_namebuf, name, sizeof(ks_namebuf)); /* Work out the longest name that matches the prefix */ if (++number == 1) { prev_len = min_t(int, max_len-1, -- cgit v1.2.3 From dbfe67334a1767bcb7be8b50bd237b22b272ef23 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:04 -0700 Subject: tracing: kdb: The skip_lines parameter should have been skip_entries The things skipped by kdb's "ftdump" command when you pass it a parameter has always been entries, not lines. The difference usually doesn't matter but when the trace buffer has multi-line entries (like a stack dump) it can matter. Let's fix this both in the help text for ftdump and also in the local variable names. Link: http://lkml.kernel.org/r/20190319171206.97107-1-dianders@chromium.org Acked-by: Daniel Thompson Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kdb.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 810d78a8d14c..4b666643d69f 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,7 +17,7 @@ #include "trace.h" #include "trace_output.h" -static void ftrace_dump_buf(int skip_lines, long cpu_file) +static void ftrace_dump_buf(int skip_entries, long cpu_file) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; @@ -70,11 +70,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); cnt++; - if (!skip_lines) { + if (!skip_entries) { print_trace_line(&iter); trace_printk_seq(&iter.seq); } else { - skip_lines--; + skip_entries--; } if (KDB_FLAG(CMD_INTERRUPT)) @@ -106,7 +106,7 @@ out: */ static int kdb_ftdump(int argc, const char **argv) { - int skip_lines = 0; + int skip_entries = 0; long cpu_file; char *cp; @@ -114,9 +114,9 @@ static int kdb_ftdump(int argc, const char **argv) return KDB_ARGCOUNT; if (argc) { - skip_lines = simple_strtol(argv[1], &cp, 0); + skip_entries = simple_strtol(argv[1], &cp, 0); if (*cp) - skip_lines = 0; + skip_entries = 0; } if (argc == 2) { @@ -129,7 +129,7 @@ static int kdb_ftdump(int argc, const char **argv) } kdb_trap_printk++; - ftrace_dump_buf(skip_lines, cpu_file); + ftrace_dump_buf(skip_entries, cpu_file); kdb_trap_printk--; return 0; @@ -137,7 +137,7 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); return 0; } -- cgit v1.2.3 From ecffc8a8c7301f6f3c731ba23e38cd049a046416 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:05 -0700 Subject: tracing: Add trace_total_entries() / trace_total_entries_cpu() These two new exported functions will be used in a future patch by kdb_ftdump() to quickly skip all but the last few trace entries. Link: http://lkml.kernel.org/r/20190319171206.97107-2-dianders@chromium.org Acked-by: Daniel Thompson Suggested-by: Steven Rostedt Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 65 ++++++++++++++++++++++++++++++++++++++++------------ kernel/trace/trace.h | 3 +++ 2 files changed, 53 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2bc18de7f0dc..dcb9adb44be9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3492,34 +3492,69 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) +{ + unsigned long count; + + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + static void get_total_entries(struct trace_buffer *buf, unsigned long *total, unsigned long *entries) { - unsigned long count; + unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(buf->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(buf->buffer, cpu); - *entries += count; + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; } } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->trace_buffer, &total, &entries); + + return entries; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index da00a3d508c1..33f14b9e78b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -721,6 +721,9 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, -- cgit v1.2.3 From 03197fc02b356606355d7ede343b18e3e3737771 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 19 Mar 2019 10:12:06 -0700 Subject: tracing: kdb: Allow ftdump to skip all but the last few entries The 'ftdump' command in kdb is currently a bit of a last resort, at least if you have lots of traces turned on. It's going to print a whole boatload of data out your serial port which is probably running at 115200. This could easily take many, many minutes. Usually you're most interested in what's at the _end_ of the ftrace buffer, AKA what happened most recently. That means you've got to wait the full time for the dump. The 'ftdump' command does attempt to help you a little bit by allowing you to skip a fixed number of entries. Unfortunately it provides no way for you to know how many entries you should skip. Let's do similar to python and allow you to use a negative number to indicate that you want to skip all entries except the last few. This allows you to quickly see what you want. Note that we also change the printout in ftdump to print the (positive) number of entries actually skipped since that could be helpful to know when you've specified a negative skip count. Link: http://lkml.kernel.org/r/20190319171206.97107-3-dianders@chromium.org Signed-off-by: Douglas Anderson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kdb.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 4b666643d69f..6c1ae6b752d1 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,29 +17,25 @@ #include "trace.h" #include "trace_output.h" +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + static void ftrace_dump_buf(int skip_entries, long cpu_file) { - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; - trace_init_global_iter(&iter); - iter.buffer_iter = buffer_iter; tr = iter.tr; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); + if (skip_entries) + kdb_printf("(skipping %d entries)\n", skip_entries); /* reset all but tr, trace, and overruns */ memset(&iter.seq, 0, @@ -89,10 +85,6 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) out: tr->trace_flags = old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - for_each_tracing_cpu(cpu) { if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); @@ -109,6 +101,8 @@ static int kdb_ftdump(int argc, const char **argv) int skip_entries = 0; long cpu_file; char *cp; + int cnt; + int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -129,7 +123,29 @@ static int kdb_ftdump(int argc, const char **argv) } kdb_trap_printk++; + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + /* A negative skip_entries means skip all but the last entries */ + if (skip_entries < 0) { + if (cpu_file == RING_BUFFER_ALL_CPUS) + cnt = trace_total_entries(NULL); + else + cnt = trace_total_entries_cpu(NULL, cpu_file); + skip_entries = max(cnt + skip_entries, 0); + } + ftrace_dump_buf(skip_entries, cpu_file); + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + kdb_trap_printk--; return 0; @@ -138,7 +154,8 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", - "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); + "Dump ftrace log; -skip dumps last #entries", 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } -- cgit v1.2.3 From aaebdf8d68479f78d9f72b239684f70fbb0722c6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Wed, 1 May 2019 14:58:18 +0100 Subject: genirq/msi: Add a new field in msi_desc to store an IOMMU cookie When an MSI doorbell is located downstream of an IOMMU, it is required to swizzle the physical address with an appropriately-mapped IOVA for any device attached to one of our DMA ops domain. At the moment, the allocation of the mapping may be done when composing the message. However, the composing may be done in non-preemtible context while the allocation requires to be called from preemptible context. A follow-up change will split the current logic in two functions requiring to keep an IOMMU cookie per MSI. A new field is introduced in msi_desc to store an IOMMU cookie. As the cookie may not be required in some configuration, the field is protected under a new config CONFIG_IRQ_MSI_IOMMU. A pair of helpers has also been introduced to access the field. Signed-off-by: Julien Grall Reviewed-by: Robin Murphy Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- kernel/irq/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..8fee06625c37 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -91,6 +91,9 @@ config GENERIC_MSI_IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ +config IRQ_MSI_IOMMU + bool + config HANDLE_DOMAIN_IRQ bool -- cgit v1.2.3 From 4d141ab3416d90f87775f5dee725efdf40110a8f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 3 May 2019 15:26:24 +0200 Subject: livepatch: Remove custom kobject state handling kobject_init() always succeeds and sets the reference count to 1. It allows to always free the structures via kobject_put() and the related release callback. Note that the custom kobject state handling was used only because we did not know that kobject_put() can and actually should get called even when kobject_init_and_add() fails. The patch should not change the existing behavior. Suggested-by: "Tobin C. Harding" Signed-off-by: Petr Mladek Reviewed-by: Kamalesh Babulal Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 56 +++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 14f33ab6c583..42385f23252a 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,6 +426,9 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } +static struct kobj_type klp_ktype_object; +static struct kobj_type klp_ktype_func; + static struct klp_object *klp_alloc_object_dynamic(const char *name) { struct klp_object *obj; @@ -443,6 +446,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); obj->dynamic = true; return obj; @@ -471,6 +475,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } + kobject_init(&func->kobj, &klp_ktype_func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -588,13 +593,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only) continue; list_del(&func->node); - - /* Might be called from klp_init_patch() error path. */ - if (func->kobj_added) { - kobject_put(&func->kobj); - } else if (func->nop) { - klp_free_func_nop(func); - } + kobject_put(&func->kobj); } } @@ -624,13 +623,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only) continue; list_del(&obj->node); - - /* Might be called from klp_init_patch() error path. */ - if (obj->kobj_added) { - kobject_put(&obj->kobj); - } else if (obj->dynamic) { - klp_free_object_dynamic(obj); - } + kobject_put(&obj->kobj); } } @@ -675,10 +668,8 @@ static void klp_free_patch_finish(struct klp_patch *patch) * this is called when the patch gets disabled and it * cannot get enabled again. */ - if (patch->kobj_added) { - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); - } + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); /* Put the module after the last access to struct klp_patch. */ if (!patch->forced) @@ -700,8 +691,6 @@ static void klp_free_patch_work_fn(struct work_struct *work) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - int ret; - if (!func->old_name) return -EINVAL; @@ -724,13 +713,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); - if (!ret) - func->kobj_added = true; - - return ret; + return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", + func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -801,11 +786,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? obj->name : "vmlinux"; - ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, - &patch->kobj, "%s", name); + ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); if (ret) return ret; - obj->kobj_added = true; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); @@ -829,7 +812,7 @@ static int klp_init_patch_early(struct klp_patch *patch) INIT_LIST_HEAD(&patch->list); INIT_LIST_HEAD(&patch->obj_list); - patch->kobj_added = false; + kobject_init(&patch->kobj, &klp_ktype_patch); patch->enabled = false; patch->forced = false; INIT_WORK(&patch->free_work, klp_free_patch_work_fn); @@ -840,11 +823,11 @@ static int klp_init_patch_early(struct klp_patch *patch) return -EINVAL; INIT_LIST_HEAD(&obj->func_list); - obj->kobj_added = false; + kobject_init(&obj->kobj, &klp_ktype_object); list_add_tail(&obj->node, &patch->obj_list); klp_for_each_func_static(obj, func) { - func->kobj_added = false; + kobject_init(&func->kobj, &klp_ktype_func); list_add_tail(&func->node, &obj->func_list); } } @@ -860,11 +843,9 @@ static int klp_init_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, "%s", patch->mod->name); + ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name); if (ret) return ret; - patch->kobj_added = true; if (patch->replace) { ret = klp_add_nops(patch); @@ -926,9 +907,6 @@ static int __klp_enable_patch(struct klp_patch *patch) if (WARN_ON(patch->enabled)) return -EINVAL; - if (!patch->kobj_added) - return -EINVAL; - pr_notice("enabling patch '%s'\n", patch->mod->name); klp_init_transition(patch, KLP_PATCHED); -- cgit v1.2.3 From f68d67cf2f83dc82675969724b59ca7c6da43fa9 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 3 May 2019 15:26:25 +0200 Subject: livepatch: Remove duplicated code for early initialization kobject_init() call added one more operation that has to be done when doing the early initialization of both static and dynamic livepatch structures. It would have been easier when the early initialization code was not duplicated. Let's deduplicate it for future generations of livepatching hackers. The patch does not change the existing behavior. Signed-off-by: Petr Mladek Reviewed-by: Kamalesh Babulal Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 42385f23252a..f12c0eabd843 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,10 +426,13 @@ static void klp_free_object_dynamic(struct klp_object *obj) kfree(obj); } -static struct kobj_type klp_ktype_object; -static struct kobj_type klp_ktype_func; +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj); -static struct klp_object *klp_alloc_object_dynamic(const char *name) +static struct klp_object *klp_alloc_object_dynamic(const char *name, + struct klp_patch *patch) { struct klp_object *obj; @@ -445,8 +448,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name) } } - INIT_LIST_HEAD(&obj->func_list); - kobject_init(&obj->kobj, &klp_ktype_object); + klp_init_object_early(patch, obj); obj->dynamic = true; return obj; @@ -475,7 +477,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, } } - kobject_init(&func->kobj, &klp_ktype_func); + klp_init_func_early(obj, func); /* * func->new_func is same as func->old_func. These addresses are * set when the object is loaded, see klp_init_object_loaded(). @@ -495,11 +497,9 @@ static int klp_add_object_nops(struct klp_patch *patch, obj = klp_find_object(patch, old_obj); if (!obj) { - obj = klp_alloc_object_dynamic(old_obj->name); + obj = klp_alloc_object_dynamic(old_obj->name, patch); if (!obj) return -ENOMEM; - - list_add_tail(&obj->node, &patch->obj_list); } klp_for_each_func(old_obj, old_func) { @@ -510,8 +510,6 @@ static int klp_add_object_nops(struct klp_patch *patch, func = klp_alloc_func_nop(old_func, obj); if (!func) return -ENOMEM; - - list_add_tail(&func->node, &obj->func_list); } return 0; @@ -802,6 +800,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return ret; } +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func) +{ + kobject_init(&func->kobj, &klp_ktype_func); + list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj) +{ + INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); + list_add_tail(&obj->node, &patch->obj_list); +} + static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; @@ -822,13 +835,10 @@ static int klp_init_patch_early(struct klp_patch *patch) if (!obj->funcs) return -EINVAL; - INIT_LIST_HEAD(&obj->func_list); - kobject_init(&obj->kobj, &klp_ktype_object); - list_add_tail(&obj->node, &patch->obj_list); + klp_init_object_early(patch, obj); klp_for_each_func_static(obj, func) { - kobject_init(&func->kobj, &klp_ktype_func); - list_add_tail(&func->node, &obj->func_list); + klp_init_func_early(obj, func); } } -- cgit v1.2.3 From 13bf5ced93775ffccb53527a9d862e023a9daa03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 25 Mar 2019 15:44:06 +0100 Subject: dma-mapping: add a Kconfig symbol to indicate arch_dma_prep_coherent presence Add a Kconfig symbol that indicates an architecture provides a arch_dma_prep_coherent implementation, and provide a stub otherwise. This will allow the generic dma-iommu code to use it while still allowing to be built for cache coherent architectures. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 52b704e2b97a..83d711f8d665 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -38,6 +38,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool +config ARCH_HAS_DMA_PREP_COHERENT + bool + config ARCH_HAS_DMA_COHERENT_TO_PFN bool -- cgit v1.2.3 From 533307dc20a9e84a0687d4ca24aeb669516c0243 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 30 Apr 2019 17:57:29 +0800 Subject: cgroup: Remove unused cgrp variable The 'cgrp' is set but not used in commit <76f969e8948d8> ("cgroup: cgroup v2 freezer"). Remove it to avoid [-Wunused-but-set-variable] warning. Cc: Tejun Heo Signed-off-by: Shaokun Zhang Acked-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57edcf398d71..4fe9f7f1a3fa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5864,11 +5864,8 @@ void cgroup_post_fork(struct task_struct *child) * the task into the frozen state. */ if (unlikely(cgroup_task_freeze(child))) { - struct cgroup *cgrp; - spin_lock(&child->sighand->siglock); WARN_ON_ONCE(child->frozen); - cgrp = cset->dfl_cgrp; child->jobctl |= JOBCTL_TRAP_FREEZE; spin_unlock(&child->sighand->siglock); -- cgit v1.2.3 From cb2c4cd87874a7975b7b8615866b3a87bae10aab Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Apr 2019 10:59:44 -0700 Subject: cgroup: prevent spurious transition into non-frozen state If freezing of a cgroup races with waking of a task from the frozen state (like waiting in vfork() or in do_signal_stop()), a spurious transition of the cgroup state can happen. The task enters cgroup_leave_frozen(true), the cgroup->nr_frozen_tasks counter decrements, and the cgroup is switched to the unfrozen state. To prevent it, let's reserve cgroup_leave_frozen(true) for terminating processes and use cgroup_leave_frozen(false) otherwise. To avoid busy-looping in the signal handling loop waiting for JOBCTL_TRAP_FREEZE set from the cgroup freezing path, let's do it explicitly in cgroup_leave_frozen(), if the task is going to stay frozen. Suggested-by: Oleg Nesterov Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/freezer.c | 16 +++++----------- kernel/signal.c | 2 +- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3bfbb3c8baf3..c321e768f8d3 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -139,19 +139,13 @@ void cgroup_leave_frozen(bool always_leave) cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(¤t->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(¤t->sighand->siglock); } spin_unlock_irq(&css_set_lock); - - if (unlikely(current->frozen)) { - /* - * If the task remained in the frozen state, - * make sure it won't reach userspace without - * entering the signal handling loop. - */ - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } } /* diff --git a/kernel/signal.c b/kernel/signal.c index 095e0fc57b25..16b72f4f14df 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2514,7 +2514,7 @@ relock: */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); - cgroup_leave_frozen(true); + cgroup_leave_frozen(false); goto relock; } -- cgit v1.2.3 From 96b9c592def5d7203bdad1337d9c92a2183de5cb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Apr 2019 10:59:45 -0700 Subject: cgroup: get rid of cgroup_freezer_frozen_exit() A task should never enter the exit path with the task->frozen bit set. Any frozen task must enter the signal handling loop and the only way to escape is through cgroup_leave_frozen(true), which unconditionally drops the task->frozen bit. So it means that cgroyp_freezer_frozen_exit() has zero chances to be called and has to be removed. Let's put a WARN_ON_ONCE() instead of the cgroup_freezer_frozen_exit() call to catch any potential leak of the task's frozen bit. Suggested-by: Oleg Nesterov Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 5 ++--- kernel/cgroup/freezer.c | 10 ---------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4fe9f7f1a3fa..327f37c9fdfa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5926,9 +5926,8 @@ void cgroup_exit(struct task_struct *tsk) css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; - if (unlikely(cgroup_task_frozen(tsk))) - cgroup_freezer_frozen_exit(tsk); - else if (unlikely(cgroup_task_freeze(tsk))) + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index c321e768f8d3..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -248,16 +248,6 @@ void cgroup_freezer_migrate_task(struct task_struct *task, cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } -void cgroup_freezer_frozen_exit(struct task_struct *task) -{ - struct cgroup *cgrp = task_dfl_cgroup(task); - - lockdep_assert_held(&css_set_lock); - - cgroup_dec_frozen_cnt(cgrp); - cgroup_update_frozen(cgrp); -} - void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; -- cgit v1.2.3 From 1900da520c9fdc3a7cf00d21638f7c8721d5ac7f Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Sun, 21 Apr 2019 19:47:27 +0800 Subject: kernel: cgroup: fix misuse of %x Pointers should be printed with %p or %px rather than cast to unsigned long type and printed with %lx. Change %lx to %p to print the pointers. Signed-off-by: Fuqian Huang Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } -- cgit v1.2.3 From a9e9bcb45b1525ba7aea26ed9441e8632aeeda58 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 28 Apr 2019 17:25:38 -0400 Subject: locking/rwsem: Prevent decrement of reader count before increment During my rwsem testing, it was found that after a down_read(), the reader count may occasionally become 0 or even negative. Consequently, a writer may steal the lock at that time and execute with the reader in parallel thus breaking the mutual exclusion guarantee of the write lock. In other words, both readers and writer can become rwsem owners simultaneously. The current reader wakeup code does it in one pass to clear waiter->task and put them into wake_q before fully incrementing the reader count. Once waiter->task is cleared, the corresponding reader may see it, finish the critical section and do unlock to decrement the count before the count is incremented. This is not a problem if there is only one reader to wake up as the count has been pre-incremented by 1. It is a problem if there are more than one readers to be woken up and writer can steal the lock. The wakeup was actually done in 2 passes before the following v4.9 commit: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once") To fix this problem, the wakeup is now done in two passes again. In the first pass, we collect the readers and count them. The reader count is then fully incremented. In the second pass, the waiter->task is then cleared and they are put into wake_q to be woken up later. Signed-off-by: Waiman Long Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Fixes: 70800c3c0cc5 ("locking/rwsem: Scan the wait_list for readers only once") Link: http://lkml.kernel.org/r/20190428212557.13482-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 46 ++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 6b3ee9948bf1..0b1f77957240 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, { struct rwsem_waiter *waiter, *tmp; long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; /* * Take a peek at the queue head waiter such that we can determine @@ -188,18 +189,43 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * of the queue. We know that woken will be at least 1 as we accounted * for above. Note we increment the 'active part' of the count by the * number of readers before waking any processes up. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. */ - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { - struct task_struct *tsk; - + list_for_each_entry(waiter, &sem->wait_list, list) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; woken++; - tsk = waiter->task; + } + list_cut_before(&wlist, &sem->wait_list, &waiter->list); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + tsk = waiter->task; get_task_struct(tsk); - list_del(&waiter->list); + /* * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -213,16 +239,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add_safe(wake_q, tsk); } - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); } /* -- cgit v1.2.3 From b3e5838252665ee4cfa76b82bdf1198dca81e5be Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 27 Mar 2019 13:04:15 +0100 Subject: clone: add CLONE_PIDFD This patchset makes it possible to retrieve pid file descriptors at process creation time by introducing the new flag CLONE_PIDFD to the clone() system call. Linus originally suggested to implement this as a new flag to clone() instead of making it a separate system call. As spotted by Linus, there is exactly one bit for clone() left. CLONE_PIDFD creates file descriptors based on the anonymous inode implementation in the kernel that will also be used to implement the new mount api. They serve as a simple opaque handle on pids. Logically, this makes it possible to interpret a pidfd differently, narrowing or widening the scope of various operations (e.g. signal sending). Thus, a pidfd cannot just refer to a tgid, but also a tid, or in theory - given appropriate flag arguments in relevant syscalls - a process group or session. A pidfd does not represent a privilege. This does not imply it cannot ever be that way but for now this is not the case. A pidfd comes with additional information in fdinfo if the kernel supports procfs. The fdinfo file contains the pid of the process in the callers pid namespace in the same format as the procfs status file, i.e. "Pid:\t%d". As suggested by Oleg, with CLONE_PIDFD the pidfd is returned in the parent_tidptr argument of clone. This has the advantage that we can give back the associated pid and the pidfd at the same time. To remove worries about missing metadata access this patchset comes with a sample program that illustrates how a combination of CLONE_PIDFD, and pidfd_send_signal() can be used to gain race-free access to process metadata through /proc/. The sample program can easily be translated into a helper that would be suitable for inclusion in libc so that users don't have to worry about writing it themselves. Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner Co-developed-by: Jann Horn Signed-off-by: Jann Horn Reviewed-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Thomas Gleixner Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro --- kernel/fork.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..e45f0acaf451 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1662,6 +1664,58 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +const struct file_operations pidfd_fops = { + .release = pidfd_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1674,13 +1728,14 @@ static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; @@ -1730,6 +1785,31 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -1936,6 +2016,22 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). + */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1996,7 +2092,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2111,6 +2207,9 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + ksys_close(pidfd); bad_fork_free_pid: cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) @@ -2176,7 +2275,7 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task); @@ -2223,7 +2322,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); -- cgit v1.2.3 From 2151ad1b067275730de1b38c7257478cae47d29e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Apr 2019 22:50:25 +0200 Subject: signal: support CLONE_PIDFD with pidfd_send_signal Let pidfd_send_signal() use pidfds retrieved via CLONE_PIDFD. With this patch pidfd_send_signal() becomes independent of procfs. This fullfils the request made when we merged the pidfd_send_signal() patchset. The pidfd_send_signal() syscall is now always available allowing for it to be used by users without procfs mounted or even users without procfs support compiled into the kernel. Signed-off-by: Christian Brauner Co-developed-by: Jann Horn Signed-off-by: Jann Horn Acked-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Thomas Gleixner Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro --- kernel/signal.c | 12 +++++++++--- kernel/sys_ni.c | 3 --- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f98448cf2def..1581140f2d99 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -#ifdef CONFIG_PROC_FS /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid @@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) return copy_siginfo_from_user(kinfo, info); } +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + /** * sys_pidfd_send_signal - send a signal to a process through a task file * descriptor @@ -3586,7 +3593,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, return -EBADF; /* Is this a pidfd? */ - pid = tgid_pidfd_to_pid(f.file); + pid = pidfd_to_pid(f.file); if (IS_ERR(pid)) { ret = PTR_ERR(pid); goto err; @@ -3620,7 +3627,6 @@ err: fdput(f); return ret; } -#endif /* CONFIG_PROC_FS */ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d21f4befaea4..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -167,9 +167,6 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ -/* kernel/signal.c */ -COND_SYSCALL(pidfd_send_signal); - /* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); -- cgit v1.2.3 From 4dd537aca25dd2e0e8aca8b8923930cbe6240003 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:55:41 +0900 Subject: tracing: uprobes: Re-enable $comm support for uprobe events Since commit 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") dropped the $comm support from uprobe events, this re-enables it. For $comm support, uses strlcpy() instead of strncpy_from_user() to copy current task's comm. Because it is in the kernel space, strncpy_from_user() always fails to copy the comm. This also uses strlen() instead of strnlen_user() to measure the length of the comm. Note that this uses -ECOMM as a token value to fetch the comm string. If the user-space pointer points -ECOMM, it will be translated to task->comm. Link: http://lkml.kernel.org/r/155723734162.9149.4042756162201097965.stgit@devnote2 Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Reported-by: Andreas Ziegler Acked-by: Andreas Ziegler Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.h | 1 + kernel/trace/trace_uprobe.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b7737666c1a8..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -124,6 +124,7 @@ struct fetch_insn { /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ #define FETCH_INSN_MAX 16 +#define FETCH_TOKEN_COMM (-ECOMM) /* Fetch type information table */ struct fetch_type { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index cd8750a72768..eb7e06b54741 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (unlikely(!maxlen)) return -ENOMEM; - ret = strncpy_from_user(dst, src, maxlen); + if (addr == FETCH_TOKEN_COMM) + ret = strlcpy(dst, current->comm, maxlen); + else + ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr) int len; void __user *vaddr = (void __force __user *) addr; - len = strnlen_user(vaddr, MAX_STRING_SIZE); + if (addr == FETCH_TOKEN_COMM) + len = strlen(current->comm) + 1; + else + len = strnlen_user(vaddr, MAX_STRING_SIZE); return (len > MAX_STRING_SIZE) ? 0 : len; } @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_IMM: val = code->immediate; break; + case FETCH_OP_COMM: + val = FETCH_TOKEN_COMM; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; -- cgit v1.2.3 From 489fe0096b19b664b8f3bed0fd604d617a229b5a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:55:52 +0900 Subject: tracing: probeevent: Do not accumulate on ret variable Do not accumulate strlen result on "ret" local variable, because it is accumulated on "total" local variable for array case. Link: http://lkml.kernel.org/r/155723735237.9149.3192150444705457531.stgit@devnote2 Fixes: 40b53b771806 ("tracing: probeevent: Add array type support") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4737bb8c07a3..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -88,7 +88,7 @@ stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { if (code->op == FETCH_OP_ST_STRING) { - ret += fetch_store_strlen(val + code->offset); + ret = fetch_store_strlen(val + code->offset); code++; goto array; } else -- cgit v1.2.3 From 3dd1f7f24f8ceec00bbbc364c2ac3c893f0fdc4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 7 May 2019 22:56:02 +0900 Subject: tracing: probeevent: Fix to make the type of $comm string Fix to make the type of $comm "string". If we set the other type to $comm argument, it shows meaningless value or wrong data. Currently probe events allow us to set string array type (e.g. ":string[2]"), or other digit types like x8 on $comm. But since clearly $comm is just a string data, it should not be fetched by other types including array. Link: http://lkml.kernel.org/r/155723736241.9149.14582064184468574539.stgit@devnote2 Cc: Andreas Ziegler Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 4cc2d467d34c..e0d1d5353464 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -533,13 +533,14 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } } } - /* - * The default type of $comm should be "string", and it can't be - * dereferenced. - */ - if (!t && strcmp(arg, "$comm") == 0) + + /* Since $comm can not be dereferred, we can find $comm by strcmp */ + if (strcmp(arg, "$comm") == 0) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + return -EINVAL; parg->type = find_fetch_type("string"); - else + } else parg->type = find_fetch_type(t); if (!parg->type) { trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); -- cgit v1.2.3 From 5c173bedb24dfcc8e412d3c3f111c504e4408dd5 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 1 Nov 2018 11:46:40 -0400 Subject: ring-buffer: Fix mispelling of Calculate It's not "Caculate". Link: http://lkml.kernel.org/r/20181101154640.23162-1-tiny.windzz@gmail.com Signed-off-by: Yangtao Li Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -362,7 +362,7 @@ static void ring_buffer_producer(void) hit--; /* make it non zero */ } - /* Caculate the average time in nanosecs */ + /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); trace_printk("%ld ns per entry\n", avg); } -- cgit v1.2.3 From 0f5e5a3ab7fa1c09370a4d709ad6157457d5b8b6 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Mar 2019 09:17:57 +0100 Subject: tracing: Eliminate const char[] auto variables Automatic const char[] variables cause unnecessary code generation. For example, the this_mod variable leads to 3f04: 48 b8 5f 5f 74 68 69 73 5f 6d movabs $0x6d5f736968745f5f,%rax # __this_m 3f0e: 4c 8d 44 24 02 lea 0x2(%rsp),%r8 3f13: 48 8d 7c 24 10 lea 0x10(%rsp),%rdi 3f18: 48 89 44 24 02 mov %rax,0x2(%rsp) 3f1d: 4c 89 e9 mov %r13,%rcx 3f20: b8 65 00 00 00 mov $0x65,%eax # e 3f25: 48 c7 c2 00 00 00 00 mov $0x0,%rdx 3f28: R_X86_64_32S .rodata.str1.1+0x18d 3f2c: be 48 00 00 00 mov $0x48,%esi 3f31: c7 44 24 0a 6f 64 75 6c movl $0x6c75646f,0xa(%rsp) # odul 3f39: 66 89 44 24 0e mov %ax,0xe(%rsp) i.e., the string gets built on the stack at runtime. Similar code can be found for the other instances I'm replacing here. Putting the string in .rodata reduces the combined .text+.rodata size and saves time and stack space at runtime. The simplest fix, and what I've done for the this_mod case, is to just make the variable static. However, for the "" case where the same string is used twice, that prevents the linker from merging those two literals, so instead use a macro - that also keeps the two instances automatically in sync (instead of only the compile-time strlen expression). Finally, for the two runs of spaces, it turns out that the "build these strings on the stack" is not the worst part of what gcc does - it turns print_func_help_header_irq() into "if (tgid) { /* print_event_info + five seq_printf calls */ } else { /* print event_info + another five seq_printf */}". Taking inspiration from a suggestion from Al Viro, use %.*s to make snprintf either stop after the first two spaces or print the whole string. As a bonus, the seq_printfs now fit on single lines (at least, they are not longer than the existing ones in the function just above), making it easier to see that the ascii art lines up. x86-64 defconfig + CONFIG_FUNCTION_TRACER: $ scripts/stackdelta /tmp/stackusage.{0,1} ./kernel/trace/ftrace.c ftrace_mod_callback 152 136 -16 ./kernel/trace/trace.c trace_default_header 56 32 -24 ./kernel/trace/trace.c tracing_mark_raw_write 96 72 -24 ./kernel/trace/trace.c tracing_mark_write 104 80 -24 bloat-o-meter add/remove: 1/0 grow/shrink: 0/4 up/down: 14/-375 (-361) Function old new delta this_mod - 14 +14 ftrace_mod_callback 577 542 -35 tracing_mark_raw_write 444 374 -70 tracing_mark_write 616 540 -76 trace_default_header 600 406 -194 Link: http://lkml.kernel.org/r/20190320081757.6037-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 34 +++++++++++++--------------------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 433a64f49532..7765a53f1006 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3875,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ - const char this_mod[] = "__this_module"; + static const char this_mod[] = "__this_module"; char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcb9adb44be9..3259019cc66d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3593,25 +3593,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char tgid_space[] = " "; - const char space[] = " "; + const char *space = " "; + int prec = tgid ? 10 : 2; print_event_info(buf, m); - seq_printf(m, "# %s _-----=> irqs-off\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s / _----=> need-resched\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s||| / delay\n", - tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", - tgid ? " TGID " : space); - seq_printf(m, "# | | %s | |||| | |\n", - tgid ? " | " : space); + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -6342,13 +6335,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - const char faulted[] = ""; ssize_t written; int size; int len; /* Used in tracing_mark_raw_write() as well */ -#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ +#define FAULTED_STR "" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -6380,7 +6373,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else @@ -6421,7 +6414,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; - const char faulted[] = ""; unsigned long irq_flags; ssize_t written; int size; @@ -6461,7 +6453,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; -- cgit v1.2.3 From bfcd631eb6de474d8e097fd0f9f840fdf7272a1d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 9 Nov 2018 13:23:12 +0000 Subject: tracing: Fix white space issues in parse_pred() function Trivial fix to clean up an indentation issue, a whole chunk of code has an extra space in the indentation. Link: http://lkml.kernel.org/r/20181109132312.20994-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 180ecb390baa..d3e59312ef40 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1222,30 +1222,30 @@ static int parse_pred(const char *str, void *data, * (perf doesn't use it) and grab everything. */ if (strcmp(field->name, "ip") != 0) { - parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); - goto err_free; - } - pred->fn = filter_pred_none; - - /* - * Quotes are not required, but if they exist then we need - * to read them till we hit a matching one. - */ - if (str[i] == '\'' || str[i] == '"') - q = str[i]; - else - q = 0; - - for (i++; str[i]; i++) { - if (q && str[i] == q) - break; - if (!q && (str[i] == ')' || str[i] == '&' || - str[i] == '|')) - break; - } - /* Skip quotes */ - if (q) - s++; + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. + */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); -- cgit v1.2.3 From 6fc2171c5c03672bae71d04a0f5fa88cc9c3b4e2 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 30 Nov 2018 15:56:22 +0100 Subject: tracing: Allow RCU to run between postponed startup tests When building a allmodconfig kernel for arm64 and boot that in qemu, CONFIG_FTRACE_STARTUP_TEST gets enabled and that takes time so the watchdog expires and prints out a message like this: 'watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]' Depending on what the what test gets called from init_trace_selftests() it stays minutes in the loop. Rework so that function cond_resched() gets called in the init_trace_selftests loop. Link: http://lkml.kernel.org/r/20181130145622.26334-1-anders.roxell@linaro.org Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Anders Roxell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3259019cc66d..4269af5905e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1722,6 +1722,10 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. + */ + cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { -- cgit v1.2.3 From cbe08bcbbe787315c425dde284dcb715cfbf3f39 Mon Sep 17 00:00:00 2001 From: Elazar Leibovich Date: Mon, 31 Dec 2018 13:58:37 +0200 Subject: tracing: Fix partial reading of trace event's id file When reading only part of the id file, the ppos isn't tracked correctly. This is taken care by simple_read_from_buffer. Reading a single byte, and then the next byte would result EOF. While this seems like not a big deal, this breaks abstractions that reads information from files unbuffered. See for example https://github.com/golang/go/issues/29399 This code was mentioned as problematic in commit cd458ba9d5a5 ("tracing: Do not (ab)use trace_seq in event_id_read()") An example C code that show this bug is: #include #include #include #include #include #include int main(int argc, char **argv) { if (argc < 2) return 1; int fd = open(argv[1], O_RDONLY); char c; read(fd, &c, 1); printf("First %c\n", c); read(fd, &c, 1); printf("Second %c\n", c); } Then run with, e.g. sudo ./a.out /sys/kernel/debug/tracing/events/tcp/tcp_set_state/id You'll notice you're getting the first character twice, instead of the first two characters in the id file. Link: http://lkml.kernel.org/r/20181231115837.4932-1-elazar@lightbitslabs.com Cc: Orit Wasserman Cc: Oleg Nesterov Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 23725aeeab10b ("ftrace: provide an id file for each event") Signed-off-by: Elazar Leibovich Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 81c038ed6cee..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1319,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len; - if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV; -- cgit v1.2.3 From 8623b00676f16ed8972008095deca2c8e2b97a37 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 14 Jan 2019 22:34:08 -0600 Subject: tracing: Replace kzalloc with kcalloc Replace kzalloc() function with its 2-factor argument form, kcalloc(). This patch replaces cases of: kzalloc(a * b, gfp) with: kcalloc(a, b, gfp) This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190115043408.GA23456@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0d1d5353464..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -558,7 +558,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, parg->count); } - code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; @@ -637,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op = FETCH_OP_END; /* Shrink down the code buffer */ - parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) ret = -ENOMEM; else -- cgit v1.2.3 From b9416997603ef7e17d4de10b6408f19da2feb72c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Mon, 28 Jan 2019 17:55:53 -0800 Subject: tracing: Fix documentation about disabling options using trace_options To disable a tracing option using the trace_options file, the option name needs to be prefixed with 'no', and not suffixed, as the README states. Fix it. Link: http://lkml.kernel.org/r/154872690031.47356.5739053380942044586.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4269af5905e4..a3a6945a7732 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4755,7 +4755,7 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From fdc6bae940ee9eb869e493990540098b8c0fd6ab Mon Sep 17 00:00:00 2001 From: Miroslav Lichvar Date: Wed, 17 Apr 2019 10:48:33 +0200 Subject: ntp: Allow TAI-UTC offset to be set to zero The ADJ_TAI adjtimex mode sets the TAI-UTC offset of the system clock. It is typically set by NTP/PTP implementations and it is automatically updated by the kernel on leap seconds. The initial value is zero (which applications may interpret as unknown), but this value cannot be set by adjtimex. This limitation seems to go back to the original "nanokernel" implementation by David Mills. Change the ADJ_TAI check to accept zero as a valid TAI-UTC offset in order to allow setting it back to the initial value. Fixes: 153b5d054ac2 ("ntp: support for TAI") Suggested-by: Ondrej Mosnacek Signed-off-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Richard Cochran Cc: Prarit Bhargava Link: https://lkml.kernel.org/r/20190417084833.7401-1-mlichvar@redhat.com --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 92a90014a925..f43d47c8c3b6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -690,7 +690,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant > 0) + if (txc->modes & ADJ_TAI && txc->constant >= 0) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) -- cgit v1.2.3 From f2b31bb598248c04721cb8485e6091a9feb045ac Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 8 May 2019 13:34:20 -0700 Subject: cgroup: never call do_group_exit() with task->frozen bit set I've got two independent reports that cgroup_task_frozen() check in cgroup_exit() has been triggered by lkp libhugetlbfs-test and LTP ptrace01 tests. For example: [ 44.576072] WARNING: CPU: 1 PID: 3028 at kernel/cgroup/cgroup.c:5932 cgroup_exit+0x148/0x160 [ 44.577724] Modules linked in: crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel sr_mod cdrom bochs_drm sg ttm ata_generic pata_acpi ppdev drm_kms_helper snd_pcm syscopyarea aesni_intel snd_timer sysfillrect sysimgblt snd crypto_simd cryptd glue_helper soundcore fb_sys_fops joydev drm serio_raw pcspkr ata_piix libata i2c_piix4 floppy parport_pc parport ip_tables [ 44.583106] CPU: 1 PID: 3028 Comm: ptrace-write-hu Not tainted 5.1.0-rc3-00053-g9262503 #5 [ 44.584600] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 44.586116] RIP: 0010:cgroup_exit+0x148/0x160 [ 44.587135] Code: 0f 84 50 ff ff ff 48 8b 85 c8 0c 00 00 48 8b 78 70 e8 ec 2e 00 00 e9 3b ff ff ff f0 ff 43 60 0f 88 72 21 89 00 e9 48 ff ff ff <0f> 0b e9 1b ff ff ff e8 3c 73 f4 ff 66 90 66 2e 0f 1f 84 00 00 00 [ 44.590113] RSP: 0018:ffffb25702dcfd30 EFLAGS: 00010002 [ 44.591167] RAX: ffff96a7fee32410 RBX: ffff96a7ff1d6000 RCX: dead000000000200 [ 44.592446] RDX: ffff96a7ff1d6080 RSI: ffff96a7fec75290 RDI: ffff96a7fec75290 [ 44.593715] RBP: ffff96a7fec745c0 R08: ffff96a7fec74658 R09: 0000000000000000 [ 44.594985] R10: 0000000000000000 R11: 0000000000000001 R12: ffff96a7fec75101 [ 44.596266] R13: ffff96a7fec745c0 R14: ffff96a7ff3bde30 R15: ffff96a7fec75130 [ 44.597550] FS: 0000000000000000(0000) GS:ffff96a7dd700000(0000) knlGS:0000000000000000 [ 44.598950] CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 [ 44.600098] CR2: 00000000f7a00000 CR3: 000000000d20e000 CR4: 00000000000406e0 [ 44.601417] Call Trace: [ 44.602777] do_exit+0x337/0xc40 [ 44.603677] do_group_exit+0x3a/0xa0 [ 44.604610] get_signal+0x12e/0x8d0 [ 44.605533] ? __switch_to_asm+0x40/0x70 [ 44.606503] do_signal+0x36/0x650 [ 44.607409] ? __switch_to_asm+0x40/0x70 [ 44.608383] ? __schedule+0x267/0x860 [ 44.609329] exit_to_usermode_loop+0x89/0xf0 [ 44.610349] do_fast_syscall_32+0x251/0x2e3 [ 44.611357] entry_SYSENTER_compat+0x7f/0x91 [ 44.612376] ---[ end trace e4ca5cfc4b7f7964 ]--- The problem is caused by the ptrace_signal() call in the for loop in get_signal(). There is a cgroup_enter_frozen() call inside ptrace_signal(), so after exit from ptrace_signal() the task->frozen bit might be set. In this case do_group_exit() can be called with the task->frozen bit set and trigger the warning. This is only place where we can leave the loop with the task->frozen bit set and without setting JOBCTL_TRAP_FREEZE and TIF_SIGPENDING. To resolve this problem, let's move cgroup_leave_frozen(true) call to just after the fatal label. If the task is going to die, the frozen bit must be cleared no matter how we get into this point. Reported-by: kernel test robot Reported-by: Qian Cai Cc: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Roman Gushchin Signed-off-by: Tejun Heo --- kernel/signal.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 16b72f4f14df..8607b11ff936 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2483,10 +2483,6 @@ relock: ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); recalc_sigpending(); - current->jobctl &= ~JOBCTL_TRAP_FREEZE; - spin_unlock_irq(&sighand->siglock); - if (unlikely(cgroup_task_frozen(current))) - cgroup_leave_frozen(true); goto fatal; } @@ -2608,8 +2604,10 @@ relock: continue; } - spin_unlock_irq(&sighand->siglock); fatal: + spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. -- cgit v1.2.3 From c3b7112df86b769927a60a6d7175988ca3d60f09 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 10 May 2019 11:53:46 +0200 Subject: fork: do not release lock that wasn't taken Avoid calling cgroup_threadgroup_change_end() without having called cgroup_threadgroup_change_begin() first. During process creation we need to check whether the cgroup we are in allows us to fork. To perform this check the cgroup needs to guard itself against threadgroup changes and takes a lock. Prior to CLONE_PIDFD the cleanup target "bad_fork_free_pid" would also need to call cgroup_threadgroup_change_end() because said lock had already been taken. However, this is not the case anymore with the addition of CLONE_PIDFD. We are now allocating a pidfd before we check whether the cgroup we're in can fork and thus prior to taking the lock. So when copy_process() fails at the right step it would release a lock we haven't taken. This bug is not even very subtle to be honest. It's just not very clear from the naming of cgroup_threadgroup_change_{begin,end}() that a lock is taken. Here's the relevant splat: entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fec849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffed5a8c EFLAGS: 00000246 ORIG_RAX: 0000000000000078 RAX: ffffffffffffffda RBX: 0000000000003ffc RCX: 0000000000000000 RDX: 00000000200005c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(depth <= 0) WARNING: CPU: 1 PID: 7744 at kernel/locking/lockdep.c:4052 __lock_release kernel/locking/lockdep.c:4052 [inline] WARNING: CPU: 1 PID: 7744 at kernel/locking/lockdep.c:4052 lock_release+0x667/0xa00 kernel/locking/lockdep.c:4321 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 7744 Comm: syz-executor007 Not tainted 5.1.0+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x45 kernel/panic.c:566 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:179 [inline] fixup_bug arch/x86/kernel/traps.c:174 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:972 RIP: 0010:__lock_release kernel/locking/lockdep.c:4052 [inline] RIP: 0010:lock_release+0x667/0xa00 kernel/locking/lockdep.c:4321 Code: 0f 85 a0 03 00 00 8b 35 77 66 08 08 85 f6 75 23 48 c7 c6 a0 55 6b 87 48 c7 c7 40 25 6b 87 4c 89 85 70 ff ff ff e8 b7 a9 eb ff <0f> 0b 4c 8b 85 70 ff ff ff 4c 89 ea 4c 89 e6 4c 89 c7 e8 52 63 ff RSP: 0018:ffff888094117b48 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 1ffff11012822f6f RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff815af236 RDI: ffffed1012822f5b RBP: ffff888094117c00 R08: ffff888092bfc400 R09: fffffbfff113301d R10: fffffbfff113301c R11: ffffffff889980e3 R12: ffffffff8a451df8 R13: ffffffff8142e71f R14: ffffffff8a44cc80 R15: ffff888094117bd8 percpu_up_read.constprop.0+0xcb/0x110 include/linux/percpu-rwsem.h:92 cgroup_threadgroup_change_end include/linux/cgroup-defs.h:712 [inline] copy_process.part.0+0x47ff/0x6710 kernel/fork.c:2222 copy_process kernel/fork.c:1772 [inline] _do_fork+0x25d/0xfd0 kernel/fork.c:2338 __do_compat_sys_x86_clone arch/x86/ia32/sys_ia32.c:240 [inline] __se_compat_sys_x86_clone arch/x86/ia32/sys_ia32.c:236 [inline] __ia32_compat_sys_x86_clone+0xbc/0x140 arch/x86/ia32/sys_ia32.c:236 do_syscall_32_irqs_on arch/x86/entry/common.c:334 [inline] do_fast_syscall_32+0x281/0xd54 arch/x86/entry/common.c:405 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 RIP: 0023:0xf7fec849 Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 14 24 c3 8b 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:00000000ffed5a8c EFLAGS: 00000246 ORIG_RAX: 0000000000000078 RAX: ffffffffffffffda RBX: 0000000000003ffc RCX: 0000000000000000 RDX: 00000000200005c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Kernel Offset: disabled Rebooting in 86400 seconds.. Reported-and-tested-by: syzbot+3286e58549edc479faae@syzkaller.appspotmail.com Fixes: b3e583825266 ("clone: add CLONE_PIDFD") Signed-off-by: Christian Brauner --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5359facf9867..737db1828437 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2102,7 +2102,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_put_pidfd; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space @@ -2217,11 +2217,12 @@ bad_fork_cancel_cgroup: spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_cgroup_threadgroup_change_end: + cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) ksys_close(pidfd); bad_fork_free_pid: - cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: -- cgit v1.2.3 From 56e33afd7757d5da2664fb797f2544ce167176be Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 10 May 2019 23:47:50 +0200 Subject: livepatch: Remove klp_check_compiler_support() The only purpose of klp_check_compiler_support() is to make sure that we are not using ftrace on x86 via mcount (because that's executed only after prologue has already happened, and that's too late for livepatching purposes). Now that mcount is not supported by ftrace any more, there is no need for klp_check_compiler_support() either. Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1905102346100.17054@cbobk.fhfr.pm Reported-by: Linus Torvalds Signed-off-by: Jiri Kosina Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..112a36ed4a09 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1220,14 +1220,6 @@ void klp_module_going(struct module *mod) static int __init klp_init(void) { - int ret; - - ret = klp_check_compiler_support(); - if (ret) { - pr_info("Your compiler is too old; turning off.\n"); - return -EINVAL; - } - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); if (!klp_root_kobj) return -ENOMEM; -- cgit v1.2.3 From af959b18fd447170a10865283ba691af4353cc7f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 11 May 2019 03:03:09 +0200 Subject: bpf: fix out of bounds backwards jmps due to dead code removal systemtap folks reported the following splat recently: [ 7790.862212] WARNING: CPU: 3 PID: 26759 at arch/x86/kernel/kprobes/core.c:1022 kprobe_fault_handler+0xec/0xf0 [...] [ 7790.864113] CPU: 3 PID: 26759 Comm: sshd Not tainted 5.1.0-0.rc7.git1.1.fc31.x86_64 #1 [ 7790.864198] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS[...] [ 7790.864314] RIP: 0010:kprobe_fault_handler+0xec/0xf0 [ 7790.864375] Code: 48 8b 50 [...] [ 7790.864714] RSP: 0018:ffffc06800bdbb48 EFLAGS: 00010082 [ 7790.864812] RAX: ffff9e2b75a16320 RBX: 0000000000000000 RCX: 0000000000000000 [ 7790.865306] RDX: ffffffffffffffff RSI: 000000000000000e RDI: ffffc06800bdbbf8 [ 7790.865514] RBP: ffffc06800bdbbf8 R08: 0000000000000000 R09: 0000000000000000 [ 7790.865960] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc06800bdbbf8 [ 7790.866037] R13: ffff9e2ab56a0418 R14: ffff9e2b6d0bb400 R15: ffff9e2b6d268000 [ 7790.866114] FS: 00007fde49937d80(0000) GS:ffff9e2b75a00000(0000) knlGS:0000000000000000 [ 7790.866193] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7790.866318] CR2: 0000000000000000 CR3: 000000012f312000 CR4: 00000000000006e0 [ 7790.866419] Call Trace: [ 7790.866677] do_user_addr_fault+0x64/0x480 [ 7790.867513] do_page_fault+0x33/0x210 [ 7790.868002] async_page_fault+0x1e/0x30 [ 7790.868071] RIP: 0010: (null) [ 7790.868144] Code: Bad RIP value. [ 7790.868229] RSP: 0018:ffffc06800bdbca8 EFLAGS: 00010282 [ 7790.868362] RAX: ffff9e2b598b60f8 RBX: ffffc06800bdbe48 RCX: 0000000000000004 [ 7790.868629] RDX: 0000000000000004 RSI: ffffc06800bdbc6c RDI: ffff9e2b598b60f0 [ 7790.868834] RBP: ffffc06800bdbcf8 R08: 0000000000000000 R09: 0000000000000004 [ 7790.870432] R10: 00000000ff6f7a03 R11: 0000000000000000 R12: 0000000000000001 [ 7790.871859] R13: ffffc06800bdbcb8 R14: 0000000000000000 R15: ffff9e2acd0a5310 [ 7790.873455] ? vfs_read+0x5/0x170 [ 7790.874639] ? vfs_read+0x1/0x170 [ 7790.875834] ? trace_call_bpf+0xf6/0x260 [ 7790.877044] ? vfs_read+0x1/0x170 [ 7790.878208] ? vfs_read+0x5/0x170 [ 7790.879345] ? kprobe_perf_func+0x233/0x260 [ 7790.880503] ? vfs_read+0x1/0x170 [ 7790.881632] ? vfs_read+0x5/0x170 [ 7790.882751] ? kprobe_ftrace_handler+0x92/0xf0 [ 7790.883926] ? __vfs_read+0x30/0x30 [ 7790.885050] ? ftrace_ops_assist_func+0x94/0x100 [ 7790.886183] ? vfs_read+0x1/0x170 [ 7790.887283] ? vfs_read+0x5/0x170 [ 7790.888348] ? ksys_read+0x5a/0xe0 [ 7790.889389] ? do_syscall_64+0x5c/0xa0 [ 7790.890401] ? entry_SYSCALL_64_after_hwframe+0x49/0xbe After some debugging, turns out that the logic in 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") has a bug that is exposed after 52875a04f4b2 ("bpf: verifier: remove dead code") in that we miss some of the jump offset adjustments after code patching when we remove dead code, more concretely, upon backward jump spanning over the area that is being removed. BPF insns of a case that was hit pre 52875a04f4b2: [...] 676: (85) call bpf_perf_event_output#-47616 677: (05) goto pc-636 678: (62) *(u32 *)(r10 -64) = 0 679: (bf) r7 = r10 680: (07) r7 += -64 681: (05) goto pc-44 682: (05) goto pc-1 683: (05) goto pc-1 BPF insns afterwards: [...] 618: (85) call bpf_perf_event_output#-47616 619: (05) goto pc-638 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 To illustrate the bug, situation looks as follows: ____ 0 | | <-- foo: [...] 1 |____| 2 |____| <-- pos / end_new ^ 3 | | | 4 | | | len 5 |____| | (remove region) 6 | | <-- end_old v 7 | | 8 | | <-- curr (jmp foo) 9 |____| The condition curr >= end_new && curr + off + 1 < end_new in the branch delta adjustments is never hit because curr + off + 1 < end_new is compared as unsigned and therefore curr + off + 1 > end_new in unsigned realm as curr + off + 1 becomes negative since the insns are memmove()'d before the offset adjustments. Correct BPF insns after this fix: [...] 618: (85) call bpf_perf_event_output#-47216 619: (05) goto pc-578 620: (62) *(u32 *)(r10 -64) = 0 621: (bf) r7 = r10 622: (07) r7 += -64 623: (05) goto pc-44 Note that unprivileged case is not affected from this. Fixes: 52875a04f4b2 ("bpf: verifier: remove dead code") Fixes: 2cbd95a5c4fb ("bpf: change parameters of call/branch offset adjustment") Reported-by: Frank Ch. Eigler Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3ba56e73c90e..242a643af82f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -338,7 +338,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; @@ -356,7 +356,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, - s32 end_new, u32 curr, const bool probe_pass) + s32 end_new, s32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; -- cgit v1.2.3 From ecebc5ce59a003163eb608ace38a01d7ffeb0a95 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 22 Mar 2019 18:52:27 -0700 Subject: kdb: Get rid of broken attempt to print CCVERSION in kdb summary If you drop into kdb and type "summary", it prints out a line that says this: ccversion CCVERSION ...and I don't mean that it actually prints out the version of the C compiler. It literally prints out the string "CCVERSION". The version of the C Compiler is already printed at boot up and it doesn't seem useful to replicate this in kdb. Let's just delete it. We can also delete the bit of the Makefile that called the C compiler in an attempt to pass this into kdb. This will remove one extra call to the C compiler at Makefile parse time and (very slightly) speed up builds. Signed-off-by: Douglas Anderson Reviewed-by: Masahiro Yamada Signed-off-by: Daniel Thompson --- kernel/debug/kdb/Makefile | 1 - kernel/debug/kdb/kdb_main.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -6,7 +6,6 @@ # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. # -CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..fc96dbf8d9de 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("machine %s\n", init_uts_ns.name.machine); kdb_printf("nodename %s\n", init_uts_ns.name.nodename); kdb_printf("domainname %s\n", init_uts_ns.name.domainname); - kdb_printf("ccversion %s\n", __stringify(CCVERSION)); now = __ktime_get_real_seconds(); time64_to_tm(now, 0, &tm); -- cgit v1.2.3 From b586627e10f57ee3aa8f0cfab0d6f7dc4ae63760 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 May 2019 15:50:18 +0300 Subject: kdb: do a sanity check on the cpu in kdb_per_cpu() The "whichcpu" comes from argv[3]. The cpu_online() macro looks up the cpu in a bitmap of online cpus, but if the value is too high then it could read beyond the end of the bitmap and possibly Oops. Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") Signed-off-by: Dan Carpenter Reviewed-by: Douglas Anderson Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index fc96dbf8d9de..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2583,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv) diag = kdbgetularg(argv[3], &whichcpu); if (diag) return diag; - if (!cpu_online(whichcpu)) { + if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { kdb_printf("cpu %ld is not online\n", whichcpu); return KDB_BADCPUNUM; } -- cgit v1.2.3 From e2f7fc0ac6957cabff4cecf6c721979b571af208 Mon Sep 17 00:00:00 2001 From: Krzesimir Nowak Date: Wed, 8 May 2019 18:08:58 +0200 Subject: bpf: fix undefined behavior in narrow load handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") made the verifier add AND instructions to clear the unwanted bits with a mask when doing a narrow load. The mask is computed with (1 << size * 8) - 1 where "size" is the size of the narrow load. When doing a 4 byte load of a an 8 byte field the verifier shifts the literal 1 by 32 places to the left. This results in an overflow of a signed integer, which is an undefined behavior. Typically, the computed mask was zero, so the result of the narrow load ended up being zero too. Cast the literal to long long to avoid overflows. Note that narrow load of the 4 byte fields does not have the undefined behavior, because the load size can only be either 1 or 2 bytes, so shifting 1 by 8 or 16 places will not overflow it. And reading 4 bytes would not be a narrow load of a 4 bytes field. Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Reviewed-by: Alban Crequy Reviewed-by: Iago López Galeiras Signed-off-by: Krzesimir Nowak Cc: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b05e8938d5c..95f9354495ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7599,7 +7599,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } } -- cgit v1.2.3 From 2baae3545327632167c0180e9ca1d467416f1919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 May 2019 09:59:16 -0700 Subject: bpf: devmap: fix use-after-free Read in __dev_map_entry_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit synchronize_rcu() is fine when the rcu callbacks only need to free memory (kfree_rcu() or direct kfree() call rcu call backs) __dev_map_entry_free() is a bit more complex, so we need to make sure that call queued __dev_map_entry_free() callbacks have completed. sysbot report: BUG: KASAN: use-after-free in dev_map_flush_old kernel/bpf/devmap.c:365 [inline] BUG: KASAN: use-after-free in __dev_map_entry_free+0x2a8/0x300 kernel/bpf/devmap.c:379 Read of size 8 at addr ffff8801b8da38c8 by task ksoftirqd/1/18 CPU: 1 PID: 18 Comm: ksoftirqd/1 Not tainted 4.17.0+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 dev_map_flush_old kernel/bpf/devmap.c:365 [inline] __dev_map_entry_free+0x2a8/0x300 kernel/bpf/devmap.c:379 __rcu_reclaim kernel/rcu/rcu.h:178 [inline] rcu_do_batch kernel/rcu/tree.c:2558 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2818 [inline] __rcu_process_callbacks kernel/rcu/tree.c:2785 [inline] rcu_process_callbacks+0xe9d/0x1760 kernel/rcu/tree.c:2802 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:284 run_ksoftirqd+0x86/0x100 kernel/softirq.c:645 smpboot_thread_fn+0x417/0x870 kernel/smpboot.c:164 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 Allocated by task 6675: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:513 [inline] kzalloc include/linux/slab.h:706 [inline] dev_map_alloc+0x208/0x7f0 kernel/bpf/devmap.c:102 find_and_alloc_map kernel/bpf/syscall.c:129 [inline] map_create+0x393/0x1010 kernel/bpf/syscall.c:453 __do_sys_bpf kernel/bpf/syscall.c:2351 [inline] __se_sys_bpf kernel/bpf/syscall.c:2328 [inline] __x64_sys_bpf+0x303/0x510 kernel/bpf/syscall.c:2328 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 26: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 dev_map_free+0x4fa/0x670 kernel/bpf/devmap.c:191 bpf_map_free_deferred+0xba/0xf0 kernel/bpf/syscall.c:262 process_one_work+0xc64/0x1b70 kernel/workqueue.c:2153 worker_thread+0x181/0x13a0 kernel/workqueue.c:2296 kthread+0x345/0x410 kernel/kthread.c:240 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412 The buggy address belongs to the object at ffff8801b8da37c0 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 264 bytes inside of 512-byte region [ffff8801b8da37c0, ffff8801b8da39c0) The buggy address belongs to the page: page:ffffea0006e368c0 count:1 mapcount:0 mapping:ffff8801da800940 index:0xffff8801b8da3540 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0007217b88 ffffea0006e30cc8 ffff8801da800940 raw: ffff8801b8da3540 ffff8801b8da3040 0000000100000004 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801b8da3780: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8801b8da3800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb > ffff8801b8da3880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801b8da3900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b8da3980: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Eric Dumazet Reported-by: syzbot+457d3e2ffbcf31aee5c0@syzkaller.appspotmail.com Acked-by: Toke Høiland-Jørgensen Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 191b79948424..1e525d70f833 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -164,6 +164,9 @@ static void dev_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); + /* Make sure prior __dev_map_entry_free() have completed. */ + rcu_barrier(); + /* To ensure all pending flush operations have completed wait for flush * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected -- cgit v1.2.3 From 390e99cfdda1334f45c718cc02cd26eb3135f233 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 13 May 2019 12:04:36 -0700 Subject: bpf: mark bpf_event_notify and bpf_event_init as static Both of them are not declared in the headers and not used outside of bpf_trace.c file. Fixes: a38d1107f937c ("bpf: support raw tracepoints in modules") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b496ffdf5f36..f92d6ad5e080 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1297,7 +1297,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, } #ifdef CONFIG_MODULES -int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +static int bpf_event_notify(struct notifier_block *nb, unsigned long op, + void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; @@ -1336,7 +1337,7 @@ static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; -int __init bpf_event_init(void) +static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; -- cgit v1.2.3 From ca976bfb3154c7bc67c4651ecd144fdf67ccaee7 Mon Sep 17 00:00:00 2001 From: Wenlin Kang Date: Mon, 13 May 2019 16:57:20 +0800 Subject: kdb: Fix bound check compiler warning The strncpy() function may leave the destination string buffer unterminated, better use strscpy() instead. This fixes the following warning with gcc 8.2: kernel/debug/kdb/kdb_io.c: In function 'kdb_getstr': kernel/debug/kdb/kdb_io.c:449:3: warning: 'strncpy' specified bound 256 equals destination size [-Wstringop-truncation] strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Wenlin Kang Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -446,7 +446,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); kdb_printf(kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); -- cgit v1.2.3 From a9e73998f9d705c94a8dca9687633adc0f24a19a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 13 May 2019 17:15:40 -0700 Subject: kernel/sys.c: prctl: fix false positive in validate_prctl_map() While validating new map we require the @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set from executable loaders such as elf handers, still it is pretty valid to have a loadable data section with zero size in file, in such case the start_data is equal to end_data once kernel loader finishes. As a result when we're trying to restore such programs the procedure fails and the kernel returns -EINVAL. From the image dump of a program: | "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0", Thus we need to change validate_prctl_map from strictly less to less or equal operator use. Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov Cc: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); -- cgit v1.2.3 From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 13 May 2019 17:16:41 -0700 Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd Userfaultfd can be misued to make it easier to exploit existing use-after-free (and similar) bugs that might otherwise only make a short window or race condition available. By using userfaultfd to stall a kernel thread, a malicious program can keep some state that it wrote, stable for an extended period, which it can then access using an existing exploit. While it doesn't cause the exploit itself, and while it's not the only thing that can stall a kernel thread when accessing a memory location, it's one of the few that never needs privilege. We can add a flag, allowing userfaultfd to be restricted, so that in general it won't be useable by arbitrary user programs, but in environments that require userfaultfd it can be turned back on. Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users. When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Andrea said: : The only difference between the bpf sysctl and the userfaultfd sysctl : this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability : requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement, : because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE : already if it's doing other kind of tracking on processes runtime, in : addition of userfaultfd. In other words both syscalls works only for : root, when the two sysctl are opt-in set to 1. [dgilbert@redhat.com: changelog additions] [akpm@linux-foundation.org: documentation tweak, per Mike] Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Paolo Bonzini Cc: Hugh Dickins Cc: Luis Chamberlain Cc: Maxime Coquelin Cc: Maya Gokhale Cc: Jerome Glisse Cc: Pavel Emelyanov Cc: Johannes Weiner Cc: Martin Cracauer Cc: Denis Plotnikov Cc: Marty McFadden Cc: Mike Kravetz Cc: Kees Cook Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: "Dr . David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. -- cgit v1.2.3 From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<---------------------------------------------------------------------- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } ---------------------------------------------------------------------->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit v1.2.3 From 7269f999934b289da7972e975b781417b07ef836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Mon, 13 May 2019 17:20:53 -0700 Subject: mm/mmu_notifier: use correct mmu_notifier events for each invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Link: http://lkml.kernel.org/r/20190326164747.24405-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e34b699f3865..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); -- cgit v1.2.3 From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where do we allocate the memmap from by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which contains additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and altmap for alternative memmap allocator. This patch shouldn't introduce any functional change. [akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { -- cgit v1.2.3 From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization and skips the necessity to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); -- cgit v1.2.3 From 640be2d1ffbc1946f1547eb89b5005ed7542de99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:23:23 -0700 Subject: kernel/memremap.c: remove the unused device_private_entry_fault() export This export has been entirely unused since it was added more than 1 1/2 years ago. Link: http://lkml.kernel.org/r/20190429115535.12793-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4e59d29245f4..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) -- cgit v1.2.3 From c6110222c6f49ea68169f353565eb865488a8619 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 May 2019 01:18:55 +0200 Subject: bpf: add map_lookup_elem_sys_only for lookups from syscall side Add a callback map_lookup_elem_sys_only() that map implementations could use over map_lookup_elem() from system call side in case the map implementation needs to handle the latter differently than from the BPF data path. If map_lookup_elem_sys_only() is set, this will be preferred pick for map lookups out of user space. This hook is used in a follow-up fix for LRU map, but once development window opens, we can convert other map types from map_lookup_elem() (here, the one called upon BPF_MAP_LOOKUP_ELEM cmd is meant) over to use the callback to simplify and clean up the latter. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ad3ccf82f31d..cb5440b02e82 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -808,7 +808,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { -- cgit v1.2.3 From 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 May 2019 01:18:56 +0200 Subject: bpf, lru: avoid messing with eviction heuristics upon syscall lookup One of the biggest issues we face right now with picking LRU map over regular hash table is that a map walk out of user space, for example, to just dump the existing entries or to remove certain ones, will completely mess up LRU eviction heuristics and wrong entries such as just created ones will get evicted instead. The reason for this is that we mark an entry as "in use" via bpf_lru_node_set_ref() from system call lookup side as well. Thus upon walk, all entries are being marked, so information of actual least recently used ones are "lost". In case of Cilium where it can be used (besides others) as a BPF based connection tracker, this current behavior causes disruption upon control plane changes that need to walk the map from user space to evict certain entries. Discussion result from bpfconf [0] was that we should simply just remove marking from system call side as no good use case could be found where it's actually needed there. Therefore this patch removes marking for regular LRU and per-CPU flavor. If there ever should be a need in future, the behavior could be selected via map creation flag, but due to mentioned reason we avoid this here. [0] http://vger.kernel.org/bpfconf.html Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH") Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 192d32e77db3..0f2708fde5f7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -527,18 +527,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, + void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { - bpf_lru_node_set_ref(&l->lru_node); + if (mark) + bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, true); +} + +static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, false); +} + static u32 htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -1250,6 +1262,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, @@ -1281,7 +1294,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -1297,8 +1309,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; - if (htab_is_lru(htab)) - bpf_lru_node_set_ref(&l->lru_node); + /* We do not mark LRU map element here in order to not mess up + * eviction heuristics when user space does a map walk. + */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, -- cgit v1.2.3 From acb2ec3dd003b50b6fb5772057a08ec0dc45d42a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 14 May 2019 15:40:43 -0700 Subject: kernel/Makefile: don't assume that kernel/gen_ikh_data.sh is executable If the user downloads and applies patch-5.1.gz using patch(1), the x bit on kernel/gen_ikh_data.sh is not set. /bin/sh: 1: ./kernel/gen_ikh_data.sh: Permission denied Fix this by using CONFIG_SHELL. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 298437bb2c6a..33824f0385b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) -- cgit v1.2.3 From c3f3ce049f7d97cc7ec9c01cb51d9ec74e0f37c2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:46 -0700 Subject: userfaultfd: use RCU to free the task struct when fork fails The task structure is freed while get_mem_cgroup_from_mm() holds rcu_read_lock() and dereferences mm->owner. get_mem_cgroup_from_mm() failing fork() ---- --- task = mm->owner mm->owner = NULL; free(task) if (task) *task; /* use after free */ The fix consists in freeing the task with RCU also in the fork failure case, exactly like it always happens for the regular exit(2) path. That is enough to make the rcu_read_lock hold in get_mem_cgroup_from_mm() (left side above) effective to avoid a use after free when dereferencing the task structure. An alternate possible fix would be to defer the delivery of the userfaultfd contexts to the monitor until after fork() is guaranteed to succeed. Such a change would require more changes because it would create a strict ordering dependency where the uffd methods would need to be called beyond the last potentially failing branch in order to be safe. This solution as opposed only adds the dependency to common code to set mm->owner to NULL and to free the task struct that was pointed by mm->owner with RCU, if fork ends up failing. The userfaultfd methods can still be called anywhere during the fork runtime and the monitor will keep discarding orphaned "mm" coming from failed forks in userland. This race condition couldn't trigger if CONFIG_MEMCG was set =n at build time. [aarcange@redhat.com: improve changelog, reduce #ifdefs per Michal] Link: http://lkml.kernel.org/r/20190429035752.4508-1-aarcange@redhat.com Link: http://lkml.kernel.org/r/20190325225636.11635-2-aarcange@redhat.com Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event") Signed-off-by: Andrea Arcangeli Tested-by: zhong jiang Reported-by: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Oleg Nesterov Cc: Jann Horn Cc: Hugh Dickins Cc: Mike Rapoport Cc: Mike Kravetz Cc: Peter Xu Cc: Jason Gunthorpe Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: zhong jiang Cc: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..b409e792aadc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -955,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -1343,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1726,6 +1736,21 @@ static int pidfd_create(struct pid *pid) return fd; } +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2233,8 +2258,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2265,7 +2292,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(¤t->sighand->siglock); hlist_del_init(&delayed.node); -- cgit v1.2.3 From 987717e5e016a0dd3011d3bb16546672713f94e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 14 May 2019 15:40:50 -0700 Subject: mm: change mm_update_next_owner() to update mm->owner with WRITE_ONCE The RCU reader uses rcu_dereference() inside rcu_read_lock critical sections, so the writer shall use WRITE_ONCE. Just a cleanup, we still rely on gcc to emit atomic writes in other places. Link: http://lkml.kernel.org/r/20190325225636.11635-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Jann Horn Cc: Jason Gunthorpe Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Oleg Nesterov Cc: Peter Xu Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..8361a560cd1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,7 +422,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -462,7 +462,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -483,7 +483,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } -- cgit v1.2.3 From 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:56 -0700 Subject: psi: introduce state_mask to represent stalled psi states Patch series "psi: pressure stall monitors", v6. This is a respin of: https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/ Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high-enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in userspace low memory killer daemon (lmkd) we can detect mounting pressure and kill less important processes before device becomes visibly sluggish. In our memory stress testing psi memory monitors produce roughly 10x less false positives compared to vmpressure signals. Having ability to specify multiple triggers for the same psi metric allows other parts of Android framework to monitor memory state of the device and act accordingly. The new interface is straight-forward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.: /* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000" fd = open("/proc/pressure/memory") write(fd, trigger, sizeof(trigger)) while (poll() >= 0) { ... }; close(fd); When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-6 prepare the psi code for polling support. Patch 7 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files. The patches were developed in collaboration with Johannes Weiner. This patch (of 7): The psi monitoring patches will need to determine the same states as record_times(). To avoid calculating them twice, maintain a state mask that can be consulted cheaply. Do this in a separate patch to keep the churn in the main feature patch at a minimum. This adds 4-byte state_mask member into psi_group_cpu struct which results in its first cacheline-aligned part becoming 52 bytes long. Add explicit values to enumeration element counters that affect psi_group_cpu struct size. Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..22c1505ad290 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -213,17 +213,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) static void get_recent_times(struct psi_group *group, int cpu, u32 *times) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,7 +239,7 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; delta = times[s] - groupc->times_prev[s]; @@ -407,15 +407,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,10 +436,10 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } @@ -448,6 +448,8 @@ static void psi_group_change(struct psi_group *group, int cpu, { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,6 +482,13 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); } -- cgit v1.2.3 From 9289c5e6a78a5a9397df5fa60eb82b105abcfecf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:40:59 -0700 Subject: psi: make psi_enable static psi_enable is not used outside of psi.c, make it static. Link: http://lkml.kernel.org/r/20190319235619.260832-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 22c1505ad290..281702de9772 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,9 +140,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { -- cgit v1.2.3 From bcc78db64168eb6dede056fed2999f75f7ace309 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:02 -0700 Subject: psi: rename psi fields in preparation for psi trigger addition Rename psi_group structure member fields used for calculating psi totals and averages for clear distinction between them and for trigger-related fields that will be added by "psi: introduce psi monitor". [surenb@google.com: v6] Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 281702de9772..4fb4d9913bc8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -165,7 +165,7 @@ static struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +173,9 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); } void __init psi_init(void) @@ -278,7 +278,7 @@ static bool update_stats(struct psi_group *group) int cpu; int s; - mutex_lock(&group->stat_lock); + mutex_lock(&group->avgs_lock); /* * Collect the per-cpu time buckets and average them into a @@ -319,7 +319,7 @@ static bool update_stats(struct psi_group *group) /* avgX= */ now = sched_clock(); - expires = group->next_update; + expires = group->avg_next_update; if (now < expires) goto out; if (now - expires >= psi_period) @@ -332,14 +332,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,22 +359,22 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } out: - mutex_unlock(&group->stat_lock); + mutex_unlock(&group->avgs_lock); return nonidle_total; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; bool nonidle; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); /* * If there is task activity, periodically fold the per-cpu @@ -391,8 +391,9 @@ static void psi_update_work(struct work_struct *work) u64 now; now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; + if (group->avg_next_update > now) + delay = nsecs_to_jiffies( + group->avg_next_update - now) + 1; schedule_delayed_work(dwork, delay); } } @@ -546,13 +547,13 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -649,7 +650,7 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); } -- cgit v1.2.3 From 7fc70a3999366560ad1d4f2389a78360300c2c6a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:06 -0700 Subject: psi: split update_stats into parts Split update_stats into collect_percpu_times and update_averages for collect_percpu_times to be reused later inside psi monitor. Link: http://lkml.kernel.org/r/20190319235619.260832-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 57 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4fb4d9913bc8..ace5ed97b186 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -269,17 +269,13 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static bool collect_percpu_times(struct psi_group *group) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; int cpu; int s; - mutex_lock(&group->avgs_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -317,11 +313,18 @@ static bool update_stats(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + return nonidle_total; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; + /* avgX= */ - now = sched_clock(); expires = group->avg_next_update; - if (now < expires) - goto out; if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); @@ -332,7 +335,7 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. */ - group->avg_next_update = expires + ((1 + missed_periods) * psi_period); + avg_next_update = expires + ((1 + missed_periods) * psi_period); period = now - (group->avg_last_update + (missed_periods * psi_period)); group->avg_last_update = now; @@ -362,9 +365,8 @@ static bool update_stats(struct psi_group *group) group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->avgs_lock); - return nonidle_total; + + return avg_next_update; } static void psi_avgs_work(struct work_struct *work) @@ -372,10 +374,16 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; bool nonidle; + u64 now; dwork = to_delayed_work(work); group = container_of(dwork, struct psi_group, avgs_work); + mutex_lock(&group->avgs_lock); + + now = sched_clock(); + + nonidle = collect_percpu_times(group); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,19 +391,15 @@ static void psi_avgs_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; - - now = sched_clock(); - if (group->avg_next_update > now) - delay = nsecs_to_jiffies( - group->avg_next_update - now) + 1; - schedule_delayed_work(dwork, delay); + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); } + + mutex_unlock(&group->avgs_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -707,11 +711,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; -- cgit v1.2.3 From 333f3017c5a893b000b2b4a3529814ce93fa83d7 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:09 -0700 Subject: psi: track changed states Introduce changed_states parameter into collect_percpu_times to track the states changed since the last update. This will be needed to detect whether polled states activated in the monitor patch. Link: http://lkml.kernel.org/r/20190319235619.260832-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ace5ed97b186..1b99eeffaa25 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -210,7 +210,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); u64 now, state_start; @@ -218,6 +219,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) unsigned int seq; u32 state_mask; + *pchanged_states = 0; + /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); @@ -246,6 +249,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) groupc->times_prev[s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,10 +274,11 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool collect_percpu_times(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; + u32 changed_states = 0; int cpu; int s; @@ -287,8 +293,11 @@ static bool collect_percpu_times(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -313,7 +322,8 @@ static bool collect_percpu_times(struct psi_group *group) for (s = 0; s < NR_PSI_STATES - 1; s++) group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); - return nonidle_total; + if (pchanged_states) + *pchanged_states = changed_states; } static u64 update_averages(struct psi_group *group, u64 now) @@ -373,6 +383,7 @@ static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; u64 now; @@ -383,7 +394,8 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - nonidle = collect_percpu_times(group); + collect_percpu_times(group, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -719,7 +731,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group); + collect_percpu_times(group, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); -- cgit v1.2.3 From 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:12 -0700 Subject: include/: refactor headers to allow kthread.h inclusion in psi_types.h kthread.h can't be included in psi_types.h because it creates a circular inclusion with kthread.h eventually including psi_types.h and complaining on kthread structures not being defined because they are defined further in the kthread.h. Resolve this by removing psi_types.h inclusion from the headers included from kthread.h. Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..be4e8795561a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 0e94682b73bfa6c44c98af7a26771c9c08c055d5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 14 May 2019 15:41:15 -0700 Subject: psi: introduce psi monitor Psi monitor aims to provide a low-latency short-term pressure detection mechanism configurable by users. It allows users to monitor psi metrics growth and trigger events whenever a metric raises above user-defined threshold within user-defined time window. Time window and threshold are both expressed in usecs. Multiple psi resources with different thresholds and window sizes can be monitored concurrently. Psi monitors activate when system enters stall state for the monitored psi metric and deactivate upon exit from the stall state. While system is in the stall state psi signal growth is monitored at a rate of 10 times per tracking window. Min window size is 500ms, therefore the min monitoring interval is 50ms. Max window size is 10s with monitoring interval of 1s. When activated psi monitor stays active for at least the duration of one tracking window to avoid repeated activations/deactivations when psi signal is bouncing. Notifications to the users are rate-limited to one per tracking window. Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Johannes Weiner Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 71 ++++++- kernel/sched/psi.c | 494 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 547 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 327f37c9fdfa..1140357d46f4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); } -#endif + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) { @@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = { .name = "io.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1b99eeffaa25..e88918e0bb6d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner * + * Polling support by Suren Baghdasaryan + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include #include #include +#include #include #include #include +#include +#include +#include #include #include "sched.h" @@ -156,6 +163,11 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; @@ -176,6 +188,17 @@ static void group_init(struct psi_group *group) group->avg_next_update = sched_clock() + psi_period; INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,7 +233,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times, +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); @@ -245,8 +269,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times, if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; if (delta) @@ -274,7 +298,9 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; unsigned long nonidle_total = 0; @@ -295,7 +321,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) u32 nonidle; u32 cpu_changed_states; - get_recent_times(group, cpu, times, + get_recent_times(group, cpu, aggregator, times, &cpu_changed_states); changed_states |= cpu_changed_states; @@ -320,7 +346,8 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); if (pchanged_states) *pchanged_states = changed_states; @@ -352,7 +379,7 @@ static u64 update_averages(struct psi_group *group, u64 now) for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->avg_total[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -394,7 +421,7 @@ static void psi_avgs_work(struct work_struct *work) now = sched_clock(); - collect_percpu_times(group, &changed_states); + collect_percpu_times(group, PSI_AVGS, &changed_states); nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu @@ -414,6 +441,187 @@ static void psi_avgs_work(struct work_struct *work) mutex_unlock(&group->avgs_lock); } +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. + */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. + */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); + + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; + } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -460,8 +668,8 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; @@ -507,6 +715,8 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -567,7 +777,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); } @@ -668,6 +882,8 @@ void psi_cgroup_free(struct cgroup *cgroup) cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -731,7 +947,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); - collect_percpu_times(group, NULL); + collect_percpu_times(group, PSI_AVGS, NULL); if (now >= group->avg_next_update) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); @@ -743,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", @@ -786,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. + */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) -- cgit v1.2.3 From df5ba5be7425e1df296d40c5f37a39d98ec666a2 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Tue, 14 May 2019 15:41:18 -0700 Subject: kernel/sched/psi.c: expose pressure metrics on root cgroup Pressure metrics are already recorded and exposed in procfs for the entire system, but any tool which monitors cgroup pressure has to special case the root cgroup to read from procfs. This patch exposes the already recorded pressure metrics on the root cgroup. Link: http://lkml.kernel.org/r/20190510174938.3361741-1-dschatzberg@fb.com Signed-off-by: Dan Schatzberg Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 18 ++++++++++++------ kernel/sched/psi.c | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1140357d46f4..217cec4e22c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3540,15 +3540,24 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); } static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, @@ -4801,7 +4810,6 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4809,7 +4817,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4817,7 +4824,6 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index e88918e0bb6d..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -173,7 +173,7 @@ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -- cgit v1.2.3 From 831246570d34692e0550da952d0655bdcc985419 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 14 May 2019 15:42:28 -0700 Subject: kernel/notifier.c: double register detection By design notifiers can be registerd once only, 2nd register attempt called by mistake silently corrupts notifiers list. A few years ago I investigated described problem, the host was power cycled because of notifier list corruption. I've prepared this patch and applied it to the OpenVZ kernel and sent this patch but nobody commented on it. Later it helped us to detect a similar problem in the OpenVz kernel. Mistakes with notifier registration can happen for example during subsystem initialization from different namespaces, or because of a lost unregister in the roll-back path on initialization failures. The proposed check cannot prevent the described problem, however it allows us to detect its reason quickly without coredump analysis. Link: http://lkml.kernel.org/r/04127e71-4782-9bbb-fe5a-7c01e93a99b0@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/notifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..bfc95b3e4235 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); -- cgit v1.2.3 From 0cc75888dad112395df0ac755915ceb79811831c Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:31 -0700 Subject: kernel/latencytop.c: remove unnecessary checks for latencytop_enabled 1. In latencytop source codes, we only have such calling chain: account_scheduler_latency(struct task_struct *task, int usecs, int inter) { if (unlikely(latencytop_enabled)) /* the outtermost check */ __account_scheduler_latency(task, usecs, inter); } __account_scheduler_latency account_global_scheduler_latency if (!latencytop_enabled) So, the inner check for latencytop_enabled is not necessary at all. 2. In clear_all_latency_tracing and now is called clear_tsk_latency_tracing the check for latencytop_enabled is redundant and buggy to some extent. We have no reason to refuse clearing the /proc/$pid/latency if latencytop_enabled is set to 0, considering that if we use latencytop manually by echo 0 > /proc/sys/kernel/latencytop, then we want to clear /proc/$pid/latency and failed. Also we don't have such check in brother function clear_global_latency_tracing. Notes: These changes are only visible to users who set CONFIG_LATENCYTOP and won't change user tool latencytop's behavior. Link: http://lkml.kernel.org/r/20190226114602.16902-2-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..bbde5614da71 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -71,9 +71,6 @@ void clear_all_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; -- cgit v1.2.3 From e02c9b0d65a7493180db45320f82482c6ba8ea57 Mon Sep 17 00:00:00 2001 From: Lin Feng Date: Tue, 14 May 2019 15:42:34 -0700 Subject: kernel/latencytop.c: rename clear_all_latency_tracing to clear_tsk_latency_tracing The name clear_all_latency_tracing is misleading, in fact which only clear per task's latency_record[], and we do have another function named clear_global_latency_tracing which clear the global latency_record[] buffer. Link: http://lkml.kernel.org/r/20190226114602.16902-1-linf@wangsu.com Signed-off-by: Lin Feng Cc: Alexey Dobriyan Cc: Fabian Frederick Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/latencytop.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b409e792aadc..b4cba953040a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2093,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index bbde5614da71..871734ea2f04 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -67,7 +67,7 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; -- cgit v1.2.3 From 6c4e121fda519e0da5c6755a60fdef8cd39634ae Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 14 May 2019 15:42:37 -0700 Subject: kernel/user.c: clean up some leftover code The out_unlock label is misleading; no unlocking happens after it, so just return NULL directly. Also, nothing between the kmem_cache_zalloc() that creates new and the two key_put() can initialize new->uid_keyring or new->session_keyring, so those calls are no-ops. Link: http://lkml.kernel.org/r/20190424200404.9114-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Andrew Morton Cc: "Peter Zijlstra (Intel)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..88b834f0eebc 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) -- cgit v1.2.3 From b028fb612849add771679c1b99a50d99264c9632 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 14 May 2019 15:44:35 -0700 Subject: kernel/signal.c: annotate implicit fall through There is a plan to build the kernel with -Wimplicit-fallthrough and this place in the code produced a warning (W=1). This commit remove the following warning: kernel/signal.c:795:13: warning: this statement may fall through [-Wimplicit-fallthrough=] Link: http://lkml.kernel.org/r/20190114203505.17875-1-malat@debian.org Signed-off-by: Mathieu Malaterre Acked-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 62f9aea4a15a..c4dd66436fc5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } -- cgit v1.2.3 From 475dae385497dde3f25271ce77b526a1e54a472a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 14 May 2019 15:44:52 -0700 Subject: kernel/sysctl.c: switch to bitmap_zalloc() Switch to bitmap_zalloc() to show clearly what we are allocating. Besides that it returns pointer of bitmap type instead of opaque void *. Link: http://lkml.kernel.org/r/20190304094037.57756-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Kees Cook Reviewed-by: Andrew Morton Cc: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba158f61aab4..d82f9161adb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3178,9 +3178,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3271,7 +3269,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } -- cgit v1.2.3 From e260ad01f0aa9e96b5386d5cd7184afd949dc457 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 14 May 2019 15:44:55 -0700 Subject: sysctl: return -EINVAL if val violates minmax Currently when userspace gives us a values that overflow e.g. file-max and other callers of __do_proc_doulongvec_minmax() we simply ignore the new value and leave the current value untouched. This can be problematic as it gives the illusion that the limit has indeed be bumped when in fact it failed. This commit makes sure to return EINVAL when an overflow is detected. Please note that this is a userspace facing change. Link: http://lkml.kernel.org/r/20190210203943.8227-4-christian@brauner.io Signed-off-by: Christian Brauner Acked-by: Luis Chamberlain Cc: Kees Cook Cc: Alexey Dobriyan Cc: Al Viro Cc: Dominik Brodowski Cc: "Eric W. Biederman" Cc: Joe Lawrence Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d82f9161adb8..f7bd1aead3bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2886,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; -- cgit v1.2.3 From 3116ad38f51c98c81175151bd7358858a92a6031 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 14 May 2019 15:45:13 -0700 Subject: kernel/sysctl.c: fix proc_do_large_bitmap for large input buffers Today, proc_do_large_bitmap() truncates a large write input buffer to PAGE_SIZE - 1, which may result in misparsed numbers at the (truncated) end of the buffer. Further, it fails to notify the caller that the buffer was truncated, so it doesn't get called iteratively to finish the entire input buffer. Tell the caller if there's more work to do by adding the skipped amount back to left/*lenp before returning. To fix the misparsing, reset the position if we have completely consumed a truncated buffer (or if just one char is left, which may be a "-" in a range), and ask the caller to come back for more. Link: http://lkml.kernel.org/r/20190320222831.8243-7-mcgrof@kernel.org Signed-off-by: Eric Sandeen Signed-off-by: Luis Chamberlain Acked-by: Kees Cook Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f7bd1aead3bf..943c89178e3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3172,9 +3172,13 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) @@ -3189,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3209,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. + */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3227,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; -- cgit v1.2.3 From 1fd402df4586bcc239298081449ce58a78211626 Mon Sep 17 00:00:00 2001 From: Timmy Li Date: Tue, 14 May 2019 15:45:16 -0700 Subject: kernel/pid.c: remove unneeded hash header file Hash functions are not needed since idr is used now. Let's remove hash header file for cleanup. Link: http://lkml.kernel.org/r/20190430053319.95913-1-scuttimmy@gmail.com Signed-off-by: Timmy Li Cc: "Eric W. Biederman" Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleg Nesterov Cc: Mike Rapoport Cc: KJ Tsanaktsidis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..89548d35eefb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 826eba0d77bc74c4d1c611374b76abfe251e8538 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 14 May 2019 15:45:25 -0700 Subject: gcov: clang: move common GCC code into gcc_base.c Patch series "gcov: add Clang support", v4. This patch (of 3): base.c contains a few callbacks specific to GCC's gcov implementation. Move these into their own module in preparation for Clang support. Link: http://lkml.kernel.org/r/20190318025411.98014-2-trong@android.com Signed-off-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Tri Vo Tested-by: Trilok Soni Tested-by: Prasad Sodagudi Tested-by: Tri Vo Reviewed-by: Peter Oberparleiter Cc: Daniel Mentz Cc: Petri Gynther Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Makefile | 4 +-- kernel/gcov/base.c | 84 ++---------------------------------------------- kernel/gcov/gcc_base.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcov.h | 3 ++ 4 files changed, 93 insertions(+), 84 deletions(-) create mode 100644 kernel/gcov/gcc_base.c (limited to 'kernel') diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..45431ed679d1 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,5 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..799d42072727 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..0ecf1d664ec3 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -83,4 +83,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ -- cgit v1.2.3 From e178a5beb36960902379040ee0b667fb0a8eee93 Mon Sep 17 00:00:00 2001 From: Greg Hackmann Date: Tue, 14 May 2019 15:45:31 -0700 Subject: gcov: clang support LLVM uses profiling data that's deliberately similar to GCC, but has a very different way of exporting that data. LLVM calls llvm_gcov_init() once per module, and provides a couple of callbacks that we can use to ask for more data. We care about the "writeout" callback, which in turn calls back into compiler-rt/this module to dump all the gathered coverage data to disk: llvm_gcda_start_file() llvm_gcda_emit_function() llvm_gcda_emit_arcs() llvm_gcda_emit_function() llvm_gcda_emit_arcs() [... repeats for each function ...] llvm_gcda_summary_info() llvm_gcda_end_file() This design is much more stateless and unstructured than gcc's, and is intended to run at process exit. This forces us to keep some local state about which module we're dealing with at the moment. On the other hand, it also means we don't depend as much on how LLVM represents profiling data internally. See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more details on how this works, particularly GCOVProfiler::emitProfileArcs(), GCOVProfiler::insertCounterWriteout(), and GCOVProfiler::insertFlush(). [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190417225328.208129-1-trong@android.com Signed-off-by: Greg Hackmann Signed-off-by: Nick Desaulniers Signed-off-by: Tri Vo Co-developed-by: Nick Desaulniers Co-developed-by: Tri Vo Tested-by: Trilok Soni Tested-by: Prasad Sodagudi Tested-by: Tri Vo Tested-by: Daniel Mentz Tested-by: Petri Gynther Reviewed-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 3 +- kernel/gcov/Makefile | 1 + kernel/gcov/base.c | 2 +- kernel/gcov/clang.c | 581 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/gcov/gcc_3_4.c | 12 ++ kernel/gcov/gcc_4_7.c | 12 ++ kernel/gcov/gcov.h | 2 + 7 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 kernel/gcov/clang.c (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..f71c1adcff31 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +63,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 45431ed679d1..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -4,3 +4,4 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 799d42072727..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -64,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(¤t_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, ¤t_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(¤t_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 0ecf1d664ec3..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include #include /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { -- cgit v1.2.3 From c39ea0b9dd24bf1bf2baa5cdbfa1905f3065347b Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 14 May 2019 15:45:34 -0700 Subject: panic: avoid the extra noise dmesg When kernel panic happens, it will first print the panic call stack, then the ending msg like: [ 35.743249] ---[ end Kernel panic - not syncing: Fatal exception [ 35.749975] ------------[ cut here ]------------ The above message are very useful for debugging. But if system is configured to not reboot on panic, say the "panic_timeout" parameter equals 0, it will likely print out many noisy message like WARN() call stack for each and every CPU except the panic one, messages like below: WARNING: CPU: 1 PID: 280 at kernel/sched/core.c:1198 set_task_cpu+0x183/0x190 Call Trace: try_to_wake_up default_wake_function autoremove_wake_function __wake_up_common __wake_up_common_lock __wake_up wake_up_klogd_work_func irq_work_run_list irq_work_tick update_process_times tick_sched_timer __hrtimer_run_queues hrtimer_interrupt smp_apic_timer_interrupt apic_timer_interrupt For people working in console mode, the screen will first show the panic call stack, but immediately overridden by these noisy extra messages, which makes debugging much more difficult, as the original context gets lost on screen. Also these noisy messages will confuse some users, as I have seen many bug reporters posted the noisy message into bugzilla, instead of the real panic call stack and context. Adding a flag "suppress_printk" which gets set in panic() to avoid those noisy messages, without changing current kernel behavior that both panic blinking and sysrq magic key can work as is, suggested by Petr Mladek. To verify this, make sure kernel is not configured to reboot on panic and in console # echo c > /proc/sysrq-trigger to see if console only prints out the panic call stack. Link: http://lkml.kernel.org/r/1551430186-24169-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Kees Cook Cc: Borislav Petkov Cc: Andi Kleen Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 +++ kernel/printk/printk.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..a6145050a8da 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -321,6 +321,9 @@ void panic(const char *fmt, ...) disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..17102fd4c136 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. + */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; -- cgit v1.2.3 From b287a25a7148a89d977c819c1f7d6584f875b682 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 14 May 2019 15:45:37 -0700 Subject: panic/reboot: allow specifying reboot_mode for panic only Allow specifying reboot_mode for panic only. This is needed on systems where ramoops is used to store panic logs, and user wants to use warm reset to preserve those, while still having cold reset on normal reboots. Link: http://lkml.kernel.org/r/20190322004735.27702-1-aaro.koskinen@iki.fi Signed-off-by: Aaro Koskinen Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ kernel/reboot.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a6145050a8da..8779d64bace0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -306,6 +306,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..b9e79e8c7226 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +529,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +563,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': -- cgit v1.2.3 From 89963adcdb430e047f4c03ac3ed6ce9aa42a595c Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Wed, 15 May 2019 15:23:52 +1000 Subject: kernel/compat.c: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch aims to suppress 3 missing-break-in-switch false positives on some architectures. Acked-by: Arnd Bergmann Cc: Deepa Dinamani Cc: Gustavo A. R. Silva Cc: Kees Cook Cc: Jann Horn Signed-off-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- kernel/compat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d8a36c6ad7c9..b5f7063c0db6 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -346,8 +346,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + /* fall through */ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + /* fall through */ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + /* fall through */ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else -- cgit v1.2.3 From 05b289263772b0698589abc47771264a685cd365 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 16 May 2019 10:38:21 -0700 Subject: signal: unconditionally leave the frozen state in ptrace_stop() Alex Xu reported a regression in strace, caused by the introduction of the cgroup v2 freezer. The regression can be reproduced by stracing the following simple program: #include int main() { write(1, "a", 1); return 0; } An attempt to run strace ./a.out leads to the infinite loop: [ pre-main omitted ] write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) write(1, "a", 1) = ? ERESTARTSYS (To be restarted if SA_RESTART is set) [ repeats forever ] The problem occurs because the traced task leaves ptrace_stop() (and the signal handling loop) with the frozen bit set. So let's call cgroup_leave_frozen(true) unconditionally after sleeping in ptrace_stop(). With this patch applied, strace works as expected: [ pre-main omitted ] write(1, "a", 1) = 1 exit_group(0) = ? +++ exited with 0 +++ Reported-by: Alex Xu Fixes: 76f969e8948d ("cgroup: cgroup v2 freezer") Signed-off-by: Roman Gushchin Acked-by: Oleg Nesterov Cc: Tejun Heo Signed-off-by: Tejun Heo --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c4dd66436fc5..a1eb44dc9ff5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2113,6 +2113,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_enable_no_resched(); cgroup_enter_frozen(); freezable_schedule(); + cgroup_leave_frozen(true); } else { /* * By the time we got the lock, our tracer went away. -- cgit v1.2.3 From e547ff3f803e779a3898f1f48447b29f43c54085 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Tue, 14 May 2019 19:42:57 -0700 Subject: bpf: relax inode permission check for retrieving bpf program For iptable module to load a bpf program from a pinned location, it only retrieve a loaded program and cannot change the program content so requiring a write permission for it might not be necessary. Also when adding or removing an unrelated iptable rule, it might need to flush and reload the xt_bpf related rules as well and triggers the inode permission check. It might be better to remove the write premission check for the inode so we won't need to grant write access to all the processes that flush and restore iptables rules. Signed-off-by: Chenbo Feng Signed-off-by: Alexei Starovoitov --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bc53e5b20ddc..84a80b02db99 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -518,7 +518,7 @@ out: static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + int ret = inode_permission(inode, MAY_READ); if (ret) return ERR_PTR(ret); -- cgit v1.2.3 From de6da1e8bcf0dd2058b950b127491821207679dc Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 17 May 2019 14:31:50 -0700 Subject: panic: add an option to replay all the printk message in buffer Currently on panic, kernel will lower the loglevel and print out pending printk msg only with console_flush_on_panic(). Add an option for users to configure the "panic_print" to replay all dmesg in buffer, some of which they may have never seen due to the loglevel setting, which will help panic debugging . [feng.tang@intel.com: keep the original console_flush_on_panic() inside panic()] Link: http://lkml.kernel.org/r/1556199137-14163-1-git-send-email-feng.tang@intel.com [feng.tang@intel.com: use logbuf lock to protect the console log index] Link: http://lkml.kernel.org/r/1556269868-22654-1-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1556095872-36838-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Petr Mladek Cc: Aaro Koskinen Cc: Petr Mladek Cc: Steven Rostedt Cc: Sergey Senozhatsky Cc: Kees Cook Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 +++++- kernel/printk/printk.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8779d64bace0..b4543a31a495 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(void) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -277,7 +281,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 17102fd4c136..a6e06fe38e41 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2535,10 +2535,11 @@ void console_unblank(void) /** * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail @@ -2549,6 +2550,15 @@ void console_flush_on_panic(void) */ console_trylock(); console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { + unsigned long flags; + + logbuf_lock_irqsave(flags); + console_seq = log_first_seq; + console_idx = log_first_idx; + logbuf_unlock_irqrestore(flags); + } console_unlock(); } -- cgit v1.2.3 From 7eaf51a2e094229b75cc0c315f1cbbe2f3960058 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 May 2019 14:51:17 -0400 Subject: stacktrace: Unbreak stack_trace_save_tsk_reliable() Miroslav reported that the livepatch self-tests were failing, specifically a case in which the consistency model ensures that a current executing function is not allowed to be patched, "TEST: busy target module". Recent renovations of stack_trace_save_tsk_reliable() left it returning only an -ERRNO success indication in some configuration combinations: klp_check_stack() ret = stack_trace_save_tsk_reliable() #ifdef CONFIG_ARCH_STACKWALK && CONFIG_HAVE_RELIABLE_STACKTRACE stack_trace_save_tsk_reliable() ret = arch_stack_walk_reliable() return 0 return -EINVAL ... return ret; ... if (ret < 0) /* stack_trace_save_tsk_reliable error */ nr_entries = ret; << 0 Previously (and currently for !CONFIG_ARCH_STACKWALK && CONFIG_HAVE_RELIABLE_STACKTRACE) stack_trace_save_tsk_reliable() returned the number of entries that it consumed in the passed storage array. In the case of the above config and trace, be sure to return the stacktrace_cookie.len on stack_trace_save_tsk_reliable() success. Fixes: 25e39e32b0a3f ("livepatch: Simplify stack trace retrieval") Reported-by: Miroslav Benes Signed-off-by: Joe Lawrence Signed-off-by: Thomas Gleixner Reviewed-by: Kamalesh Babulal Acked-by: Josh Poimboeuf Cc: live-patching@vger.kernel.org Cc: jikos@kernel.org Cc: pmladek@suse.com Link: https://lkml.kernel.org/r/20190517185117.24642-1-joe.lawrence@redhat.com --- kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..90d3e0bf0302 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -206,7 +206,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, ret = arch_stack_walk_reliable(consume_entry, &c, tsk); put_task_stack(tsk); - return ret; + return ret ? ret : c.len; } #endif -- cgit v1.2.3 From 457c89965399115e5cd8bf38f9c597293405703d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 13:08:55 +0100 Subject: treewide: Add SPDX license identifier for missed files Add SPDX license identifiers to all files which: - Have no license information of any form - Have EXPORT_.*_SYMBOL_GPL inside which was used in the initial scan/conversion to ignore the file These files fall under the project license, GPL v2 only. The resulting SPDX license identifier is: GPL-2.0-only Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/tnum.c | 1 + kernel/cgroup/cgroup-v1.c | 1 + kernel/cgroup/rstat.c | 1 + kernel/context_tracking.c | 1 + kernel/crash_dump.c | 1 + kernel/dma/swiotlb.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/freezer.c | 1 + kernel/hung_task.c | 1 + kernel/irq_work.c | 1 + kernel/jump_label.c | 1 + kernel/kallsyms.c | 1 + kernel/kthread.c | 1 + kernel/locking/lockdep.c | 1 + kernel/locking/mutex.c | 1 + kernel/locking/percpu-rwsem.c | 1 + kernel/locking/rtmutex.c | 1 + kernel/notifier.c | 1 + kernel/panic.c | 1 + kernel/pid.c | 1 + kernel/pid_namespace.c | 1 + kernel/power/qos.c | 1 + kernel/printk/printk.c | 1 + kernel/profile.c | 1 + kernel/ptrace.c | 1 + kernel/reboot.c | 1 + kernel/resource.c | 1 + kernel/sched/clock.c | 1 + kernel/sched/core.c | 1 + kernel/sched/cputime.c | 1 + kernel/sched/idle.c | 1 + kernel/sched/isolation.c | 1 + kernel/sched/wait.c | 1 + kernel/sched/wait_bit.c | 1 + kernel/signal.c | 1 + kernel/smp.c | 1 + kernel/smpboot.c | 1 + kernel/stacktrace.c | 1 + kernel/sysctl.c | 1 + kernel/umh.c | 1 + kernel/up.c | 1 + kernel/user-return-notifier.c | 1 + kernel/user.c | 1 + kernel/workqueue.c | 1 + 45 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..ca52b9642943 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 68ca5de7ec27..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bb95a35e8c2d..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9ad37b9e44a7..be01a4d627c9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Context tracking: Probe on high level context boundaries such as kernel * and userspace. This includes syscalls and exceptions entry/exit. diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index b64e238b553b..9c23ae074b40 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6f7619c1f877..13f0cb080a4d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * diff --git a/kernel/exit.c b/kernel/exit.c index 8361a560cd1d..1803efb2922f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * diff --git a/kernel/fork.c b/kernel/fork.c index b4cba953040a..b2b87d450b80 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * diff --git a/kernel/freezer.c b/kernel/freezer.c index b162b74611e4..c0738424bb43 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f108a95882c6..14a625c16cb3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Detect Hung Task * diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 73288914ed5e..d42acaf81886 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * diff --git a/kernel/jump_label.c b/kernel/jump_label.c index de6efdecc70d..0bfa10f4410c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * jump label support * diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 14934afa9e68..95a260f9214b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * diff --git a/kernel/kthread.c b/kernel/kthread.c index be4e8795561a..621467c33fef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d06190fa5082..c47788fa85f9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/lockdep.c * diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db578783dd36..0c601ae072b3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/locking/mutex.c * diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..b6a9cc62099a 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 978d63a8261c..38fbf9fa7f1b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RT-Mutexes: simple blocking mutual exclusion locks with PI support * diff --git a/kernel/notifier.c b/kernel/notifier.c index bfc95b3e4235..d9f5081d578d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index b4543a31a495..4d9f55bf7d38 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/panic.c * diff --git a/kernel/pid.c b/kernel/pid.c index 89548d35eefb..e5cad0c7d5dd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..f54bc7cb6c2d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d22131afc1e..33e3febaba53 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a6e06fe38e41..1888f6a3b694 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * diff --git a/kernel/profile.c b/kernel/profile.c index 9c08a2c7cb1d..af7c94bf5fa1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. Manages a direct-mapped profile hit count buffer, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..5710d07e67cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/ptrace.c * diff --git a/kernel/reboot.c b/kernel/reboot.c index b9e79e8c7226..c4d472b7f1b4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/reboot.c * diff --git a/kernel/resource.c b/kernel/resource.c index 8c15f846e8ef..158f04ec1d4f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/resource.c * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e3e3b979f9bd..1152259a4ca0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sched_clock() for unstable CPU clocks * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 102dfcf0a29a..874c427742a9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/core.c * diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba4a143bdcf3..2305ce89a26c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple CPU accounting cgroup controller */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5516bae0c1b..80940939b733 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic entry points for the idle threads and * implementation of the idle task scheduling class. diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 687302051a27..123ea07a3f3b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Housekeeping management. Manage the targets for routine code that can run on * any CPU: unbound workqueues, timers, kthreads and any offloadable work. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6eb1f8efd221..fa0f9adfb752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic waiting primitives. * diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c67c6d24adc2..45eba18a2898 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/signal.c b/kernel/signal.c index a1eb44dc9ff5..d7b9d14ac80d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * diff --git a/kernel/smp.c b/kernel/smp.c index f4cf1b0bb3b8..d155374632eb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c230c2dd48e1..2efe1e206167 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Common SMP CPU bringup/teardown functions */ diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..5667f1da3ede 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/stacktrace.c * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 943c89178e3d..7d1008be6173 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface * diff --git a/kernel/umh.c b/kernel/umh.c index d937cbad903a..7f255b5a8845 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ diff --git a/kernel/up.c b/kernel/up.c index ff536f9cc8a2..483c9962c999 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Uniprocessor-only support functions. The counterpart to kernel/smp.c */ diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 9586b670a5b2..870ecd7c63ed 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/user.c b/kernel/user.c index 88b834f0eebc..78b17e36e705 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". * diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9657315405de..95aea04ff722 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * -- cgit v1.2.3 From ec8f24b7faaf3d4799a7c3f4c1b87f6b02778ad1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 13:07:45 +0100 Subject: treewide: Add SPDX license identifier - Makefile/Kconfig Add SPDX license identifiers to all Make/Kconfig files which: - Have no license information of any form These files fall under the project license, GPL v2 only. The resulting SPDX license identifier is: GPL-2.0-only Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/Kconfig.freezer | 1 + kernel/Kconfig.hz | 1 + kernel/Kconfig.locks | 1 + kernel/Kconfig.preempt | 1 + kernel/debug/Makefile | 1 + kernel/dma/Kconfig | 1 + kernel/gcov/Kconfig | 1 + kernel/irq/Kconfig | 1 + kernel/livepatch/Kconfig | 1 + kernel/livepatch/Makefile | 1 + kernel/power/Kconfig | 1 + kernel/printk/Makefile | 1 + kernel/rcu/Kconfig | 1 + kernel/rcu/Kconfig.debug | 1 + kernel/time/Kconfig | 1 + kernel/trace/Kconfig | 1 + 16 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer index a3bb4cb52539..68646feefb3d 100644 --- a/kernel/Kconfig.freezer +++ b/kernel/Kconfig.freezer @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only config FREEZER def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a846757..38ef6d06888e 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer Interrupt Frequency Configuration # diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index bf770d7556f7..e0852dc333ac 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # The ARCH_INLINE foo is necessary because select ignores "depends on" # diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0fee5fe6c899..dc0b682ec2d9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only choice prompt "Preemption Model" diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index a85edc339985..332ee6c6ec2c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the linux kernel debugger # diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 83d711f8d665..70f8f8d9200e 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAS_DMA bool diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index f71c1adcff31..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "GCOV-based kernel profiling" config GCOV_KERNEL diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8fee06625c37..f92d9a687372 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "IRQ subsystem" # Options selectable by the architecture code diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index ec4565122e65..54102deb50ba 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAVE_LIVEPATCH bool help diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index b36ceda6488e..cf9b5bcdb952 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9bbaaab14b36..ff8592ddedee 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4a2ffc39eb95..4d052fc6bcde 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 37301430970e..480edf328b51 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related configuration options # diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..5ec3ea4028e2 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related debugging configuration options # diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e2c038d6c13c..fcc42353f125 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer subsystem related configuration options # diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d965cef6c77..564e5fdb025f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Architectures that offer an FUNCTION_TRACER implementation should # select HAVE_FUNCTION_TRACER: -- cgit v1.2.3 From d6cd1e9b9ff4f5e2d6b7085ad8601f86177fd300 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:39 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 9 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not you can access it online at http www gnu org licenses gpl 2 0 html extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Steve Winslow Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154041.430943677@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/locking/test-ww_mutex.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 65a3b7e55b9f..3e82f449b4ff 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Module-based API test facility for ww_mutexes - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. */ #include -- cgit v1.2.3 From 1ccea77e2a2687cae171b7987eb44730ec8c6d5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:43 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 13 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not see http www gnu org licenses this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details [based] [from] [clk] [highbank] [c] you should have received a copy of the gnu general public license along with this program if not see http www gnu org licenses extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 355 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Steve Winslow Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154041.837383322@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/livepatch/core.c | 14 +------------- kernel/livepatch/patch.c | 14 +------------- kernel/livepatch/shadow.c | 14 +------------- kernel/livepatch/transition.c | 14 +------------- kernel/printk/internal.h | 14 +------------- kernel/printk/printk_safe.c | 14 +------------- 6 files changed, 6 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 91cd519756d3..2398832947c6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * core.c - Kernel Live Patching Core * * Copyright (C) 2014 Seth Jennings * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 99cb3ad05eb4..bd43537702bd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * patch.c - livepatch patching functions * * Copyright (C) 2014 Seth Jennings * Copyright (C) 2014 SUSE * Copyright (C) 2015 Josh Poimboeuf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 83958c814439..e5c9fb295ba9 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * shadow.c - Shadow Variables * * Copyright (C) 2014 Josh Poimboeuf * Copyright (C) 2014 Seth Jennings * Copyright (C) 2017 Joe Lawrence - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ /** diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index c53370d596be..abb2a4a2cbb2 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * transition.c - Kernel Live Patching transition functions * * Copyright (C) 2015-2016 Josh Poimboeuf - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 0f1898820cba..c8e6ab689d42 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * internal.h - printk internal definitions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 0913b4d385de..b4045e782743 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #include -- cgit v1.2.3 From 7170066ecd289cd8560695b6f86ba8dc723b6505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 May 2019 15:51:55 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 25 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it would be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 6 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Steve Winslow Reviewed-by: Kate Stewart Reviewed-by: Jilayne Lovejoy Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190519154043.007767574@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/delayacct.c | 11 +---------- kernel/test_kprobes.c | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2a12b988c717..27725754ac99 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* delayacct.c - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #include diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 7bca480151b0..76c997fdbc9d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * test_kprobes.c - simple sanity test for *probes * * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define pr_fmt(fmt) "Kprobe smoke test: " fmt -- cgit v1.2.3 From 55267c88c003a3648567beae7c90512d3e2ab15e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:50 -0500 Subject: tracing: Prevent hist_field_var_ref() from accessing NULL tracing_map_elts hist_field_var_ref() is an implementation of hist_field_fn_t(), which can be called with a null tracing_map_elt elt param when assembling a key in event_hist_trigger(). In the case of hist_field_var_ref() this doesn't make sense, because a variable can only be resolved by looking it up using an already assembled key i.e. a variable can't be used to assemble a key since the key is required in order to access the variable. Upper layers should prevent the user from constructing a key using a variable in the first place, but in case one slips through, it shouldn't cause a NULL pointer dereference. Also if one does slip through, we want to know about it, so emit a one-time warning in that case. Link: http://lkml.kernel.org/r/64ec8dc15c14d305295b64cdfcc6b2b9dd14753f.1555597045.git.tom.zanussi@linux.intel.com Reported-by: Vincent Bernat Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7fca3457c705..06e7b9f66de6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1854,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field, struct hist_elt_data *elt_data; u64 var_val = 0; + if (WARN_ON_ONCE(!elt)) + return var_val; + elt_data = elt->private_data; var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; -- cgit v1.2.3 From c8d94a1878342fdffedaaf15201d951dfc147065 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:51 -0500 Subject: tracing: Check keys for variable references in expressions too There's an existing check for variable references in keys, but it doesn't go far enough. It checks whether a key field is a variable reference but doesn't check whether it's an expression containing variable references, which can cause the same problems for callers. Use the existing field_has_hist_vars() function rather than a direct top-level flag check to catch all possible variable references. Link: http://lkml.kernel.org/r/e8c3d3d53db5ca90ceea5a46e5413103a6902fc7.1555597045.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reported-by: Vincent Bernat Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 06e7b9f66de6..2b76f9520bd0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -59,7 +59,7 @@ C(NO_CLOSING_PAREN, "No closing paren found"), \ C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ - C(INVALID_REF_KEY, "Using variable references as keys not supported"), \ + C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ C(VAR_NOT_FOUND, "Couldn't find variable"), \ C(FIELD_NOT_FOUND, "Couldn't find field"), @@ -4506,7 +4506,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { + if (field_has_hist_vars(hist_field, 0)) { hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; -- cgit v1.2.3 From 9b2ca371b1505a547217b244f903ad3fb86fa5b4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 18 Apr 2019 10:18:52 -0500 Subject: tracing: Add a check_val() check before updating cond_snapshot() track_val Without this check a snapshot is taken whenever a bucket's max is hit, rather than only when the global max is hit, as it should be. Before: In this example, we do a first run of the workload (cyclictest), examine the output, note the max ('triggering value') (347), then do a second run and note the max again. In this case, the max in the second run (39) is below the max in the first run, but since we haven't cleared the histogram, the first max is still in the histogram and is higher than any other max, so it should still be the max for the snapshot. It isn't however - the value should still be 347 after the second run. # echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_waking/trigger # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_prio,next_comm,prev_pid,prev_prio,prev_comm):onmax($wakeup_lat).snapshot() if next_comm=="cyclictest"' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2143 } hitcount: 199 max: 44 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2145 } hitcount: 1325 max: 38 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2144 } hitcount: 1982 max: 347 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 347 triggered by event with key: { next_pid: 2144 } # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2143 } hitcount: 199 max: 44 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2148 } hitcount: 199 max: 16 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 { next_pid: 2145 } hitcount: 1325 max: 38 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2150 } hitcount: 1326 max: 39 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2144 } hitcount: 1982 max: 347 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2149 } hitcount: 1983 max: 130 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/0 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 39 triggered by event with key: { next_pid: 2150 } After: In this example, we do a first run of the workload (cyclictest), examine the output, note the max ('triggering value') (375), then do a second run and note the max again. In this case, the max in the second run is still 375, the highest in any bucket, as it should be. # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2072 } hitcount: 200 max: 28 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 { next_pid: 2074 } hitcount: 1323 max: 375 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2073 } hitcount: 1980 max: 153 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 375 triggered by event with key: { next_pid: 2074 } # cyclictest -p 80 -n -s -t 2 -D 2 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist { next_pid: 2101 } hitcount: 199 max: 49 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2072 } hitcount: 200 max: 28 next_prio: 120 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/5 { next_pid: 2074 } hitcount: 1323 max: 375 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/2 { next_pid: 2103 } hitcount: 1325 max: 74 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/4 { next_pid: 2073 } hitcount: 1980 max: 153 next_prio: 19 next_comm: cyclictest prev_pid: 0 prev_prio: 120 prev_comm: swapper/6 { next_pid: 2102 } hitcount: 1981 max: 84 next_prio: 19 next_comm: cyclictest prev_pid: 12 prev_prio: 120 prev_comm: kworker/0:1 Snapshot taken (see tracing/snapshot). Details: triggering value { onmax($wakeup_lat) }: 375 triggered by event with key: { next_pid: 2074 } Link: http://lkml.kernel.org/r/95958351329f129c07504b4d1769c47a97b70d65.1555597045.git.tom.zanussi@linux.intel.com Cc: stable@vger.kernel.org Fixes: a3785b7eca8fd ("tracing: Add hist trigger snapshot() action") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 2b76f9520bd0..ca6b0dff60c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3585,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) struct track_data *track_data = tr->cond_snapshot->cond_data; struct hist_elt_data *elt_data, *track_elt_data; struct snapshot_context *context = cond_data; + struct action_data *action; u64 track_val; if (!track_data) return false; + action = track_data->action_data; + track_val = get_track_val(track_data->hist_data, context->elt, track_data->action_data); + if (!action->track_data.check_val(track_data->track_val, track_val)) + return false; + track_data->track_val = track_val; memcpy(track_data->key, context->key, track_data->key_len); -- cgit v1.2.3 From 4eebe38a37f9397ffecd4bd3afbdf36838a97969 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Thu, 28 Mar 2019 03:49:46 +0530 Subject: kernel/trace/trace.h: Remove duplicate header of trace_seq.h Remove duplicate header which is included twice. Link: http://lkml.kernel.org/r/1553725186-41442-1-git-send-email-jagdsh.linux@gmail.com Reviewed-by: Mukesh Ojha Signed-off-by: Jagadeesh Pagadala Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1974ce818ddb..82c70b63d375 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #ifdef CONFIG_FTRACE_SYSCALLS -- cgit v1.2.3 From 1b038c6e05ff70a1e66e3e571c2e6106bdb75f53 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Fri, 17 May 2019 13:52:31 +0200 Subject: perf/ring_buffer: Fix exposing a temporarily decreased data_head In perf_output_put_handle(), an IRQ/NMI can happen in below location and write records to the same ring buffer: ... local_dec_and_test(&rb->nest) ... <-- an IRQ/NMI can happen here rb->user_page->data_head = head; ... In this case, a value A is written to data_head in the IRQ, then a value B is written to data_head after the IRQ. And A > B. As a result, data_head is temporarily decreased from A to B. And a reader may see data_head < data_tail if it read the buffer frequently enough, which creates unexpected behaviors. This can be fixed by moving dec(&rb->nest) to after updating data_head, which prevents the IRQ/NMI above from updating data_head. [ Split up by peterz. ] Signed-off-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mark.rutland@arm.com Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.224478157@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 674b35383491..009467a60578 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -51,11 +51,18 @@ again: head = local_read(&rb->head); /* - * IRQ/NMI can happen here, which means we can miss a head update. + * IRQ/NMI can happen here and advance @rb->head, causing our + * load above to be stale. */ - if (!local_dec_and_test(&rb->nest)) + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + if (local_read(&rb->nest) > 1) { + local_dec(&rb->nest); goto out; + } /* * Since the mmap() consumer (userspace) can run on a different CPU: @@ -87,9 +94,18 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. + * We must publish the head before decrementing the nest count, + * otherwise an IRQ/NMI can publish a more recent head value and our + * write will (temporarily) publish a stale value. + */ + barrier(); + local_set(&rb->nest, 0); + + /* + * Ensure we decrement @rb->nest before we validate the @rb->head. + * Otherwise we cannot be sure we caught the 'last' nested update. */ + barrier(); if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); goto again; -- cgit v1.2.3 From 3f9fbe9bd86c534eba2faf5d840fd44c6049f50e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:32 +0200 Subject: perf/ring_buffer: Add ordering to rb->nest increment Similar to how decrementing rb->next too early can cause data_head to (temporarily) be observed to go backward, so too can this happen when we increment too late. This barrier() ensures the rb->head load happens after the increment, both the one in the 'goto again' path, as the one from perf_output_get_handle() -- albeit very unlikely to matter for the latter. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.309516009@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 009467a60578..4b5f8d932400 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -48,6 +48,15 @@ static void perf_output_put_handle(struct perf_output_handle *handle) unsigned long head; again: + /* + * In order to avoid publishing a head value that goes backwards, + * we must ensure the load of @rb->head happens after we've + * incremented @rb->nest. + * + * Otherwise we can observe a @rb->head value before one published + * by an IRQ/NMI happening between the load and the increment. + */ + barrier(); head = local_read(&rb->head); /* -- cgit v1.2.3 From 4d839dd9e4356bbacf3eb0ab13a549b83b008c21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:33 +0200 Subject: perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data We must use {READ,WRITE}_ONCE() on rb->user_page data such that concurrent usage will see whole values. A few key sites were missing this. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Fixes: 7b732a750477 ("perf_counter: new output ABI - part 1") Link: http://lkml.kernel.org/r/20190517115418.394192145@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4b5f8d932400..7a0c73e4b3eb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -100,7 +100,7 @@ again: * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, @@ -496,7 +496,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; @@ -528,7 +528,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; -- cgit v1.2.3 From 5322ea58a06da2e69c5ef36a9b4d4b9255edd423 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 May 2019 13:52:34 +0200 Subject: perf/ring-buffer: Use regular variables for nesting While the IRQ/NMI will nest, the nest-count will be invariant over the actual exception, since it will decrement equal to increment. This means we can -- carefully -- use a regular variable since the typical LOAD-STORE race doesn't exist (similar to preempt_count). This optimizes the ring-buffer for all LOAD-STORE architectures, since they need to use atomic ops to implement local_t. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mark.rutland@arm.com Cc: namhyung@kernel.org Cc: yabinc@google.com Link: http://lkml.kernel.org/r/20190517115418.481392777@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 41 ++++++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -24,7 +24,7 @@ struct ring_buffer { atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ - local_t nest; /* nested writers */ + unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ @@ -41,7 +41,7 @@ struct ring_buffer { /* AUX area */ long aux_head; - local_t aux_nest; + unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 7a0c73e4b3eb..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) struct ring_buffer *rb = handle->rb; preempt_disable(); - local_inc(&rb->nest); + + /* + * Avoid an explicit LOAD/STORE such that architectures with memops + * can use them. + */ + (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } @@ -46,6 +51,17 @@ static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; + unsigned int nest; + + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + nest = READ_ONCE(rb->nest); + if (nest > 1) { + WRITE_ONCE(rb->nest, nest - 1); + goto out; + } again: /* @@ -64,15 +80,6 @@ again: * load above to be stale. */ - /* - * If this isn't the outermost nesting, we don't have to update - * @rb->user_page->data_head. - */ - if (local_read(&rb->nest) > 1) { - local_dec(&rb->nest); - goto out; - } - /* * Since the mmap() consumer (userspace) can run on a different CPU: * @@ -108,7 +115,7 @@ again: * write will (temporarily) publish a stale value. */ barrier(); - local_set(&rb->nest, 0); + WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. @@ -116,7 +123,7 @@ again: */ barrier(); if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); + WRITE_ONCE(rb->nest, 1); goto again; } @@ -355,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; + unsigned int nest; if (output_event->parent) output_event = output_event->parent; @@ -385,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; + nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ - if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + if (WARN_ON_ONCE(nest)) goto err_put; + WRITE_ONCE(rb->aux_nest, nest + 1); + aux_head = rb->aux_head; handle->rb = rb; @@ -419,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!handle->size) { /* A, matches D */ event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } @@ -508,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) handle->event = NULL; - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); -- cgit v1.2.3 From b4d0d230ccfb5d1a9ea85da64aa584df7c148ee9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 May 2019 19:08:01 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 36 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public licence as published by the free software foundation either version 2 of the licence or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 114 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190520170857.552531963@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cred.c | 6 +----- kernel/module-internal.h | 6 +----- kernel/module_signing.c | 6 +----- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..e74ffdc98a92 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include #include diff --git a/kernel/module-internal.h b/kernel/module-internal.h index d354341f8cc0..33783abc377b 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Module internals * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b9a926fd86b..b10fb1986ca9 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Module signature checker * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include -- cgit v1.2.3 From 6ff3f917e06625f9612f0dbcda10bef45b099b00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 May 2019 19:08:03 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 38 Based on 1 normalized pattern(s): this file is released under the gplv2 and any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190520170857.732920462@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/stop_machine.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 7231fb5953fc..2b5a6754646f 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/stop_machine.c * @@ -5,8 +6,6 @@ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo - * - * This file is released under the GPLv2 and any later version. */ #include #include -- cgit v1.2.3 From f7b101d33046a837c2aa4526cef28a3c785d7af2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 15 May 2019 17:35:51 -0400 Subject: kheaders: Move from proc to sysfs The kheaders archive consisting of the kernel headers used for compiling bpf programs is in /proc. However there is concern that moving it here will make it permanent. Let us move it to /sys/kernel as discussed [1]. [1] https://lore.kernel.org/patchwork/patch/1067310/#1265969 Suggested-by: Steven Rostedt Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/Makefile | 4 +-- kernel/gen_ikh_data.sh | 89 -------------------------------------------------- kernel/gen_kheaders.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 40 +++++++++-------------- 4 files changed, 107 insertions(+), 115 deletions(-) delete mode 100755 kernel/gen_ikh_data.sh create mode 100755 kernel/gen_kheaders.sh (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 33824f0385b3..a8d923b5481b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o +obj-$(CONFIG_IKHEADERS) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz -cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@ +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh deleted file mode 100755 index 591a94f7b387..000000000000 --- a/kernel/gen_ikh_data.sh +++ /dev/null @@ -1,89 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# This script generates an archive consisting of kernel headers -# for CONFIG_IKHEADERS_PROC. -set -e -spath="$(dirname "$(readlink -f "$0")")" -kroot="$spath/.." -outdir="$(pwd)" -tarfile=$1 -cpio_dir=$outdir/$tarfile.tmp - -# Script filename relative to the kernel source root -# We add it to the archive because it is small and any changes -# to this script will also cause a rebuild of the archive. -sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" - -src_file_list=" -include/ -arch/$SRCARCH/include/ -$sfile -" - -obj_file_list=" -include/ -arch/$SRCARCH/include/ -" - -# Support incremental builds by skipping archive generation -# if timestamps of files being archived are not changed. - -# This block is useful for debugging the incremental builds. -# Uncomment it for debugging. -# iter=1 -# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; -# else; iter=$(($(cat /tmp/iter) + 1)); fi -# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter -# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter - -# include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. -pushd $kroot > /dev/null -src_files_md5="$(find $src_file_list -type f | - grep -v "include/generated/compile.h" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $obj_file_list -type f | - grep -v "include/generated/compile.h" | - xargs ls -lR | md5sum | cut -d ' ' -f1)" - -if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi -if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then - exit -fi - -if [ "${quiet}" != "silent_" ]; then - echo " GEN $tarfile" -fi - -rm -rf $cpio_dir -mkdir $cpio_dir - -pushd $kroot > /dev/null -for f in $src_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null - -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. -for f in $obj_file_list; - do find "$f" ! -name "*.cmd" ! -name ".*"; -done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 - -# Remove comments except SDPX lines -find $cpio_dir -type f -print0 | - xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' - -tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null - -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 -echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 - -rm -rf $cpio_dir diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh new file mode 100755 index 000000000000..581b83534587 --- /dev/null +++ b/kernel/gen_kheaders.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 70ae6052920d..8f69772af77b 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c @@ -8,9 +8,8 @@ #include #include -#include +#include #include -#include /* * Define kernel_headers_data and kernel_headers_data_end, within which the @@ -31,39 +30,32 @@ extern char kernel_headers_data; extern char kernel_headers_data_end; static ssize_t -ikheaders_read_current(struct file *file, char __user *buf, - size_t len, loff_t *offset) +ikheaders_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) { - return simple_read_from_buffer(buf, len, offset, - &kernel_headers_data, - &kernel_headers_data_end - - &kernel_headers_data); + memcpy(buf, &kernel_headers_data + off, len); + return len; } -static const struct file_operations ikheaders_file_ops = { - .read = ikheaders_read_current, - .llseek = default_llseek, +static struct bin_attribute kheaders_attr __ro_after_init = { + .attr = { + .name = "kheaders.tar.xz", + .mode = 0444, + }, + .read = &ikheaders_read, }; static int __init ikheaders_init(void) { - struct proc_dir_entry *entry; - - /* create the current headers file */ - entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, - &ikheaders_file_ops); - if (!entry) - return -ENOMEM; - - proc_set_size(entry, - &kernel_headers_data_end - - &kernel_headers_data); - return 0; + kheaders_attr.size = (&kernel_headers_data_end - + &kernel_headers_data); + return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); } static void __exit ikheaders_cleanup(void) { - remove_proc_entry("kheaders.tar.xz", NULL); + sysfs_remove_bin_file(kernel_kobj, &kheaders_attr); } module_init(ikheaders_init); -- cgit v1.2.3 From 1457dc9ed8da871fbbc0a2ebdaed0405eeeed0cf Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 15 May 2019 17:35:52 -0400 Subject: kheaders: Do not regenerate archive if config is not changed Linus reported an issue that doing an allmodconfig was causing the kheaders archive to be regenerated even though the config is the same. This patch fixes the issue by ignoring the config-related header files for "knowing when to regenerate based on timestamps". Instead, if the CONFIG_X_Y option really changes, then we there are the include/config/X/Y.h which will already tells us "if a config really changed". So we don't really need these files for regeneration detection anyway, and ignoring them fixes Linus's issue. Reported-by: Linus Torvalds Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/gen_kheaders.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 581b83534587..9a34e1d9bd7f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -31,9 +31,8 @@ arch/$SRCARCH/include/ # This block is useful for debugging the incremental builds. # Uncomment it for debugging. -# iter=1 -# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; -# else; iter=$(($(cat /tmp/iter) + 1)); fi +# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; +# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi # find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter # find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter @@ -43,10 +42,18 @@ arch/$SRCARCH/include/ pushd $kroot > /dev/null src_files_md5="$(find $src_file_list -type f | grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + grep -v "include/config/auto.conf" | + grep -v "include/config/auto.conf.cmd" | + grep -v "include/config/tristate.conf" | xargs ls -lR | md5sum | cut -d ' ' -f1)" popd > /dev/null obj_files_md5="$(find $obj_file_list -type f | grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + grep -v "include/config/auto.conf" | + grep -v "include/config/auto.conf.cmd" | + grep -v "include/config/tristate.conf" | xargs ls -lR | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi @@ -82,7 +89,7 @@ find $cpio_dir -type f -print0 | tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 +echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit v1.2.3 From 51816e9e113934281b44f1a352852ef7631e75ea Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 24 May 2019 15:42:22 -0400 Subject: locking/lock_events: Use this_cpu_add() when necessary The kernel test robot has reported that the use of __this_cpu_add() causes bug messages like: BUG: using __this_cpu_add() in preemptible [00000000] code: ... Given the imprecise nature of the count and the possibility of resetting the count and doing the measurement again, this is not really a big problem to use the unprotected __this_cpu_*() functions. To make the preemption checking code happy, the this_cpu_*() functions will be used if CONFIG_DEBUG_PREEMPT is defined. The imprecise nature of the locking counts are also documented with the suggestion that we should run the measurement a few times with the counts reset in between to get a better picture of what is going on under the hood. Fixes: a8654596f0371 ("locking/rwsem: Enable lock event counting") Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Linus Torvalds --- kernel/locking/lock_events.h | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index feb1acc54611..46b71af8eef2 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -30,13 +30,51 @@ enum lock_events { */ DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); +/* + * The purpose of the lock event counting subsystem is to provide a low + * overhead way to record the number of specific locking events by using + * percpu counters. It is the percpu sum that matters, not specifically + * how many of them happens in each cpu. + * + * It is possible that the same percpu counter may be modified in both + * the process and interrupt contexts. For architectures that perform + * percpu operation with multiple instructions, it is possible to lose + * count if a process context percpu update is interrupted in the middle + * and the same counter is updated in the interrupt context. Therefore, + * the generated percpu sum may not be precise. The error, if any, should + * be small and insignificant. + * + * For those architectures that do multi-instruction percpu operation, + * preemption in the middle and moving the task to another cpu may cause + * a larger error in the count. Again, this will be few and far between. + * Given the imprecise nature of the count and the possibility of resetting + * the count and doing the measurement again, this is not really a big + * problem. + * + * To get a better picture of what is happening under the hood, it is + * suggested that a few measurements should be taken with the counts + * reset in between to stamp out outliner because of these possible + * error conditions. + * + * To minimize overhead, we use __this_cpu_*() in all cases except when + * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() + * will be used to avoid the appearance of unwanted BUG messages. + */ +#ifdef CONFIG_DEBUG_PREEMPT +#define lockevent_percpu_inc(x) this_cpu_inc(x) +#define lockevent_percpu_add(x, v) this_cpu_add(x, v) +#else +#define lockevent_percpu_inc(x) __this_cpu_inc(x) +#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) +#endif + /* * Increment the PV qspinlock statistical counters */ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - __this_cpu_inc(lockevents[event]); + lockevent_percpu_inc(lockevents[event]); } #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) @@ -44,7 +82,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - __this_cpu_add(lockevents[event], inc); + lockevent_percpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -- cgit v1.2.3 From 0c97bf863efce63d6ab7971dad811601e6171d2f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 23 May 2019 14:45:35 +0200 Subject: tracing: Silence GCC 9 array bounds warning Starting with GCC 9, -Warray-bounds detects cases when memset is called starting on a member of a struct but the size to be cleared ends up writing over further members. Such a call happens in the trace code to clear, at once, all members after and including `seq` on struct trace_iterator: In function 'memset', inlined from 'ftrace_dump' at kernel/trace/trace.c:8914:3: ./include/linux/string.h:344:9: warning: '__builtin_memset' offset [8505, 8560] from the object at 'iter' is out of the bounds of referenced subobject 'seq' with type 'struct trace_seq' at offset 4368 [-Warray-bounds] 344 | return __builtin_memset(p, c, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ In order to avoid GCC complaining about it, we compute the address ourselves by adding the offsetof distance instead of referring directly to the member. Since there are two places doing this clear (trace.c and trace_kdb.c), take the chance to move the workaround into a single place in the internal header. Link: http://lkml.kernel.org/r/20190523124535.GA12931@gmail.com Signed-off-by: Miguel Ojeda [ Removed unnecessary parenthesis around "iter" ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +----- kernel/trace/trace.h | 18 ++++++++++++++++++ kernel/trace/trace_kdb.c | 6 +----- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c92b3d9ea30..1c80521fd436 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8910,12 +8910,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 82c70b63d375..005f08629b8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1966,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } extern struct trace_iterator *tracepoint_print_iter; +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. + */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ + const size_t offset = offsetof(struct trace_iterator, seq); + + /* + * Keep gcc from complaining about overwriting more than just one + * member in the structure. + */ + memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); + + iter->pos = -1; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 6c1ae6b752d1..cca65044c14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -37,12 +37,8 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (skip_entries) kdb_printf("(skipping %d entries)\n", skip_entries); - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { -- cgit v1.2.3 From bb1869012d7b78d1474808cb4c8bd8b272645876 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 16 May 2019 12:43:19 +0200 Subject: ACPI: PM: Call pm_set_suspend_via_firmware() during hibernation On systems with ACPI platform firmware the last stage of hibernation is analogous to system suspend to S3 (suspend-to-RAM), so it should be handled analogously. In particular, pm_suspend_via_firmware() should return 'true' in that stage to let the callers of it know that control will be passed to the platform firmware going forward, so pm_set_suspend_via_firmware() needs to be called then in analogy with acpi_suspend_begin(). However, the platform hibernation ->begin() callback is invoked during the "freeze" transition (before creating a snapshot image of system memory) as well as during the "hibernate" transition which is the last stage of it and pm_set_suspend_via_firmware() should be invoked by that callback in the latter stage only. In order to implement that redefine the hibernation ->begin() callback to take a pm_message_t argument to indicate which stage of hibernation is taking place and rework acpi_hibernation_begin() and acpi_hibernation_begin_old() to take it into account as needed. Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..97522630b1b6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -129,7 +129,7 @@ static int hibernation_test(int level) { return 0; } static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; + hibernation_ops->begin(PMSG_FREEZE) : 0; } /** @@ -542,7 +542,7 @@ int hibernation_platform_enter(void) * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ - error = hibernation_ops->begin(); + error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; -- cgit v1.2.3 From dfb4a6f2191a80c8b790117d0ff592fd712d3296 Mon Sep 17 00:00:00 2001 From: Tomas Bortoli Date: Tue, 28 May 2019 17:43:38 +0200 Subject: tracing: Avoid memory leak in predicate_parse() In case of errors, predicate_parse() goes to the out_free label to free memory and to return an error code. However, predicate_parse() does not free the predicates of the temporary prog_stack array, thence leaking them. Link: http://lkml.kernel.org/r/20190528154338.29976-1-tomasbortoli@gmail.com Cc: stable@vger.kernel.org Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: syzbot+6b8e0fb820e570c59e19@syzkaller.appspotmail.com Signed-off-by: Tomas Bortoli [ Added protection around freeing prog_stack[i].pred ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d3e59312ef40..5079d1db3754 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -428,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); + prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -579,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, out_free: kfree(op_stack); kfree(inverts); - kfree(prog_stack); + if (prog_stack) { + for (i = 0; prog_stack[i].pred; i++) + kfree(prog_stack[i].pred); + kfree(prog_stack); + } return ERR_PTR(ret); } -- cgit v1.2.3 From f6e2aa91a46d2bc79fce9b93a988dbe7655c90c0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 28 May 2019 18:46:37 -0500 Subject: signal/ptrace: Don't leak unitialized kernel memory with PTRACE_PEEK_SIGINFO Recently syzbot in conjunction with KMSAN reported that ptrace_peek_siginfo can copy an uninitialized siginfo to userspace. Inspecting ptrace_peek_siginfo confirms this. The problem is that off when initialized from args.off can be initialized to a negaive value. At which point the "if (off >= 0)" test to see if off became negative fails because off started off negative. Prevent the core problem by adding a variable found that is only true if a siginfo is found and copied to a temporary in preparation for being copied to userspace. Prevent args.off from being truncated when being assigned to off by testing that off is <= the maximum possible value of off. Convert off to an unsigned long so that we should not have to truncate args.off, we have well defined overflow behavior so if we add another check we won't risk fighting undefined compiler behavior, and so that we have a type whose maximum value is easy to test for. Cc: Andrei Vagin Cc: stable@vger.kernel.org Reported-by: syzbot+0d602a1b0d8c95bdf299@syzkaller.appspotmail.com Fixes: 84c751bd4aeb ("ptrace: add ability to retrieve signals without removing from a queue (v4)") Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..02c6528ead5c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -704,6 +704,10 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (arg.nr < 0) return -EINVAL; + /* Ensure arg.off fits in an unsigned long */ + if (arg.off > ULONG_MAX) + return 0; + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) pending = &child->signal->shared_pending; else @@ -711,18 +715,20 @@ static int ptrace_peek_siginfo(struct task_struct *child, for (i = 0; i < arg.nr; ) { kernel_siginfo_t info; - s32 off = arg.off + i; + unsigned long off = arg.off + i; + bool found = false; spin_lock_irq(&child->sighand->siglock); list_for_each_entry(q, &pending->list, list) { if (!off--) { + found = true; copy_siginfo(&info, &q->info); break; } } spin_unlock_irq(&child->sighand->siglock); - if (off >= 0) /* beyond the end of the list */ + if (!found) /* beyond the end of the list */ break; #ifdef CONFIG_COMPAT -- cgit v1.2.3 From 2874c5fd284268364ece81a7bd936f3c8168e567 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:01 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 3029 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070032.746973796@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643af82f..7c473f208a10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * @@ -12,11 +13,6 @@ * Alexei Starovoitov * Daniel Borkmann * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ -- cgit v1.2.3 From 1a59d1b8e05ea6ab45f7e18897de1ef0e6bc3da6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:05 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 156 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not write to the free software foundation inc 59 temple place suite 330 boston ma 02111 1307 usa extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1334 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070033.113240726@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/audit.c | 15 +-------------- kernel/audit.h | 15 +-------------- kernel/audit_watch.c | 15 +-------------- kernel/auditfilter.c | 15 +-------------- kernel/extable.c | 14 +------------- kernel/futex.c | 15 +-------------- kernel/kprobes.c | 15 +-------------- kernel/module.c | 14 +------------- kernel/params.c | 14 +------------- kernel/tracepoint.c | 15 +-------------- 10 files changed, 10 insertions(+), 137 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b96bf69183f4..486c968214d9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c @@ -5,20 +6,6 @@ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Written by Rickard E. (Rik) Faith * * Goals: 1) Integrate fully with Security Modules. diff --git a/kernel/audit.h b/kernel/audit.h index 2071725a999f..6c076d4982da 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -1,22 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit -- definition of audit_context structure and supporting types * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index b50c574223fa..1f31c2f1e6fc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_watch.c -- watching inodes * * Copyright 2003-2009 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 303fb04770ce..9f8e190e3bea 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* auditfilter.c -- filtering of audit events * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/extable.c b/kernel/extable.c index 6a5b61ebc66c..e23cce6e6092 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Rewritten by Rusty Russell, on the backs of many others... Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/futex.c b/kernel/futex.c index 2268b97d5439..4b5b468c58b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 @@ -29,20 +30,6 @@ * * "The futexes are also cursed." * "But they come in a choice of three flavours!" - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b1ea30a5540e..445337c107e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) * kernel/kprobes.c * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S Kernel diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..80c7c09584cf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (C) 2002 Richard Henderson Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..cf448785d058 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Helpers for initial module or kernel cmdline parsing Copyright (C) 2001 Rusty Russell. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..df3ade14ccbd 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include -- cgit v1.2.3 From c942fddf8793b2013be8c901b47d0a8dc02bf99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:06 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 157 Based on 3 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version [author] [kishon] [vijay] [abraham] [i] [kishon]@[ti] [com] this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation either version 2 of the license or at your option any later version [author] [graeme] [gregory] [gg]@[slimlogic] [co] [uk] [author] [kishon] [vijay] [abraham] [i] [kishon]@[ti] [com] [based] [on] [twl6030]_[usb] [c] [author] [hema] [hk] [hemahk]@[ti] [com] this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-or-later has been chosen to replace the boilerplate/reference in 1105 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Reviewed-by: Kate Stewart Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070033.202006027@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/audit_fsnotify.c | 11 +---------- kernel/locking/qrwlock.c | 11 +---------- kernel/locking/qspinlock.c | 11 +---------- kernel/locking/qspinlock_stat.h | 10 +--------- kernel/sched/membarrier.c | 11 +---------- kernel/taskstats.c | 12 +----------- kernel/tsacct.c | 13 +------------ 7 files changed, 7 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index b5737b826951..f0d243318452 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_fsnotify.c -- tracking inodes * * Copyright 2003-2009,2014-2015 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index c7471c3fb798..fe9ca92faa2a 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued read/write locks * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. * * Authors: Waiman Long diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index e14b32c69639..2473f10c6956 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued spinlock * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014,2018 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 54152670ff24..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -1,13 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. * * Authors: Waiman Long */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3cd8a3a795d2..aa8d75804108 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2010-2017 Mathieu Desnoyers * * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include "sched.h" diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5f852b8f59f7..13a0f2e6ebc2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -1,19 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * taskstats.c - Export per-task statistics to userland * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..7be3e7530841 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include -- cgit v1.2.3 From 468e15fdc2ec7048ab1ae93e200559151c84647e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 May 2019 08:55:17 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 170 Based on 1 normalized pattern(s): this file is release under the gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 1 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Richard Fontana Reviewed-by: Kate Stewart Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190527070034.216732358@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 46ba853656f6..35859da8bd4f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers - * - * This file is release under the GPLv2 - * */ #include -- cgit v1.2.3 From 25763b3c864cf517d686661012d184ee47a49b4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2019 10:10:09 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 206 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 107 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Richard Fontana Reviewed-by: Steve Winslow Reviewed-by: Alexios Zavras Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190528171438.615055994@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/bpf_lru_list.c | 5 +---- kernel/bpf/bpf_lru_list.h | 5 +---- kernel/bpf/map_in_map.c | 5 +---- kernel/bpf/map_in_map.h | 5 +---- kernel/bpf/percpu_freelist.c | 5 +---- kernel/bpf/percpu_freelist.h | 5 +---- kernel/bpf/stackmap.c | 5 +---- 7 files changed, 7 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e6ef4401a138..1b6b9349cb85 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 7d4f89b7cb84..f02504640e18 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __MAP_IN_MAP_H__ #define __MAP_IN_MAP_H__ diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include "percpu_freelist.h" diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __PERCPU_FREELIST_H__ #define __PERCPU_FREELIST_H__ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..d38e49f943a1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include #include -- cgit v1.2.3 From 6b115bf58e6f013ca75e7115aabcbd56c20ff31d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:57 -0700 Subject: cgroup: Call cgroup_release() before __exit_signal() cgroup_release() calls cgroup_subsys->release() which is used by the pids controller to uncharge its pid. We want to use it to manage iteration of dying tasks which requires putting it before __unhash_process(). Move cgroup_release() above __exit_signal(). While this makes it uncharge before the pid is freed, pid is RCU freed anyway and the window is very narrow. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1803efb2922f..a75b6a7f458a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -195,6 +195,7 @@ repeat: rcu_read_unlock(); proc_flush_task(p); + cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -220,7 +221,6 @@ repeat: } write_unlock_irq(&tasklist_lock); - cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); -- cgit v1.2.3 From b636fd38dc40113f853337a7d2a6885ad23b8811 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Implement css_task_iter_skip() When a task is moved out of a cset, task iterators pointing to the task are advanced using the normal css_task_iter_advance() call. This is fine but we'll be tracking dying tasks on csets and thus moving tasks from cset->tasks to (to be added) cset->dying_tasks. When we remove a task from cset->tasks, if we advance the iterators, they may move over to the next cset before we had the chance to add the task back on the dying list, which can allow the task to escape iteration. This patch separates out skipping from advancing. Skipping only moves the affected iterators to the next pointer rather than fully advancing it and the following advancing will recognize that the cursor has already been moved forward and do the rest of advancing. This ensures that when a task moves from one list to another in its cset, as long as it moves in the right direction, it's always visible to iteration. This doesn't cause any visible behavior changes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/cgroup/cgroup.c | 60 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..035aee466bbf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -843,6 +844,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -868,22 +884,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -4430,10 +4433,19 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } -static void css_task_iter_advance(struct css_task_iter *it) +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) { - struct list_head *next; + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} +static void css_task_iter_advance(struct css_task_iter *it) +{ lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4442,15 +4454,15 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. */ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); -- cgit v1.2.3 From c03cd7738a83b13739f00546166969342c8ff014 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Include dying leaders with live threads in PROCS iterations CSS_TASK_ITER_PROCS currently iterates live group leaders; however, this means that a process with dying leader and live threads will be skipped. IOW, cgroup.procs might be empty while cgroup.threads isn't, which is confusing to say the least. Fix it by making cset track dying tasks and include dying leaders with live threads in PROCS iteration. Signed-off-by: Tejun Heo Reported-and-tested-by: Topi Miettinen Cc: Oleg Nesterov --- kernel/cgroup/cgroup.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 035aee466bbf..a7df319c2e9a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -739,6 +739,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -1213,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -4399,15 +4401,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && !list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4446,6 +4451,8 @@ static void css_task_iter_skip(struct css_task_iter *it, static void css_task_iter_advance(struct css_task_iter *it) { + struct task_struct *task; + lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4462,17 +4469,32 @@ repeat: if (it->task_pos == it->tasks_head) it->task_pos = it->mg_tasks_head->next; if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && - !thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -6009,6 +6031,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; WARN_ON_ONCE(cgroup_task_frozen(tsk)); @@ -6034,6 +6057,13 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); + + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) -- cgit v1.2.3 From 8856ae4df3e9b5295ea2da7ad3b00796386454ec Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 31 May 2019 22:30:12 -0700 Subject: kernel/fork.c: make max_threads symbol static Fix build warning, kernel/fork.c:125:5: warning: symbol 'max_threads' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20190516015118.140561-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reported-by: Hulk Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b2b87d450b80..75675b9bf6df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -123,7 +123,7 @@ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ -int max_threads; /* tunable limit on nr_threads */ +static int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; -- cgit v1.2.3 From 11bbd8b416f8abf40900dc5041152892f873d915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 31 May 2019 22:30:16 -0700 Subject: prctl_set_mm: refactor checks from validate_prctl_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Despite comment of validate_prctl_map claims there are no capability checks, it is not completely true since commit 4d28df6152aa ("prctl: Allow local CAP_SYS_ADMIN changing exe_file"). Extract the check out of the function and make the function perform purely arithmetic checks. This patch should not change any behavior, it is mere refactoring for following patch. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190502125203.24014-2-mkoutny@suse.com Signed-off-by: Michal Koutný Reviewed-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Cc: Kirill Tkhai Cc: Laurent Dufour Cc: Mateusz Guzik Cc: Michal Hocko Cc: Yang Shi Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 51 +++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index bdbfe8d37418..775bf8d18d03 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1882,13 +1882,14 @@ exit_err: } /* + * Check arithmetic relations of passed addresses. + * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. */ -static int validate_prctl_map(struct prctl_mm_map *prctl_map) +static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; - struct mm_struct *mm = current->mm; int error = -EINVAL, i; static const unsigned char offsets[] = { @@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) prctl_map->start_data)) goto out; - /* - * Someone is trying to cheat the auxv vector. - */ - if (prctl_map->auxv_size) { - if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) - goto out; - } - - /* - * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. - */ - if (prctl_map->exe_fd != (u32)-1) { - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) - goto out; - } - error = 0; out: return error; @@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { + /* + * Someone is trying to cheat the auxv vector. + */ + if (!prctl_map.auxv || + prctl_map.auxv_size > sizeof(mm->saved_auxv)) + return -EINVAL; + memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, @@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } if (prctl_map.exe_fd != (u32)-1) { + /* + * Make sure the caller has the rights to + * change /proc/pid/exe link: only local sys admin should + * be allowed to. + */ + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EINVAL; + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; @@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; - struct prctl_mm_map prctl_map; + struct prctl_mm_map prctl_map = { + .auxv = NULL, + .auxv_size = 0, + .exe_fd = -1, + }; struct vm_area_struct *vma; int error; @@ -2139,9 +2141,6 @@ static int prctl_set_mm(int opt, unsigned long addr, prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; - prctl_map.auxv = NULL; - prctl_map.auxv_size = 0; - prctl_map.exe_fd = -1; switch (opt) { case PR_SET_MM_START_CODE: @@ -2181,7 +2180,7 @@ static int prctl_set_mm(int opt, unsigned long addr, goto out; } - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) goto out; -- cgit v1.2.3 From bc81426f5beef7da863d3365bc9d45e820448745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 31 May 2019 22:30:19 -0700 Subject: prctl_set_mm: downgrade mmap_sem to read lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit a3b609ef9f8b ("proc read mm's {arg,env}_{start,end} with mmap semaphore taken.") added synchronization of reading argument/environment boundaries under mmap_sem. Later commit 88aa7cc688d4 ("mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct") avoided the coarse use of mmap_sem in similar situations. But there still remained two places that (mis)use mmap_sem. get_cmdline should also use arg_lock instead of mmap_sem when it reads the boundaries. The second place that should use arg_lock is in prctl_set_mm. By protecting the boundaries fields with the arg_lock, we can downgrade mmap_sem to reader lock (analogous to what we already do in prctl_set_mm_map). [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190502125203.24014-3-mkoutny@suse.com Fixes: 88aa7cc688d4 ("mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct") Signed-off-by: Michal Koutný Signed-off-by: Laurent Dufour Co-developed-by: Laurent Dufour Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Yang Shi Cc: Mateusz Guzik Cc: Kirill Tkhai Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 775bf8d18d03..2969304c29fe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2127,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates of arg boundaries, we need + * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * validation. + */ + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); + spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; @@ -2217,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_write(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); + up_read(&mm->mmap_sem); return error; } -- cgit v1.2.3 From 9852ae3fe5293264f01c49f2571ef7688f7823ce Mon Sep 17 00:00:00 2001 From: Chris Down Date: Fri, 31 May 2019 22:30:22 -0700 Subject: mm, memcg: consider subtrees in memory.events memory.stat and other files already consider subtrees in their output, and we should too in order to not present an inconsistent interface. The current situation is fairly confusing, because people interacting with cgroups expect hierarchical behaviour in the vein of memory.stat, cgroup.events, and other files. For example, this causes confusion when debugging reclaim events under low, as currently these always read "0" at non-leaf memcg nodes, which frequently causes people to misdiagnose breach behaviour. The same confusion applies to other counters in this file when debugging issues. Aggregation is done at write time instead of at read-time since these counters aren't hot (unlike memory.stat which is per-page, so it does it at read time), and it makes sense to bundle this with the file notifications. After this patch, events are propagated up the hierarchy: [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 0 oom 0 oom_kill 0 [root@ktst ~]# systemd-run -p MemoryMax=1 true Running as unit: run-r251162a189fb4562b9dabfdc9b0422f5.service [root@ktst ~]# cat /sys/fs/cgroup/system.slice/memory.events low 0 high 0 max 7 oom 1 oom_kill 1 As this is a change in behaviour, this can be reverted to the old behaviour by mounting with the `memory_localevents' flag set. However, we use the new behaviour by default as there's a lack of evidence that there are any current users of memory.events that would find this change undesirable. akpm: this is a behaviour change, so Cc:stable. THis is so that forthcoming distros which use cgroup v2 are more likely to pick up the revised behaviour. Link: http://lkml.kernel.org/r/20190208224419.GA24772@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Dennis Zhou Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1810,11 +1810,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_memory_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_param_specs[] = { - fsparam_flag ("nsdelegate", Opt_nsdelegate), + fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("memory_localevents", Opt_memory_localevents), {} }; @@ -1837,6 +1839,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_memory_localevents: + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1848,6 +1853,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; } } @@ -1855,6 +1865,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + seq_puts(seq, ",memory_localevents"); return 0; } @@ -6325,7 +6337,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); -- cgit v1.2.3 From 98af37d624ed8c83f1953b1b6b2f6866011fc064 Mon Sep 17 00:00:00 2001 From: Zhenliang Wei Date: Fri, 31 May 2019 22:30:52 -0700 Subject: kernel/signal.c: trace_signal_deliver when signal_group_exit In the fixes commit, removing SIGKILL from each thread signal mask and executing "goto fatal" directly will skip the call to "trace_signal_deliver". At this point, the delivery tracking of the SIGKILL signal will be inaccurate. Therefore, we need to add trace_signal_deliver before "goto fatal" after executing sigdelset. Note: SEND_SIG_NOINFO matches the fact that SIGKILL doesn't have any info. Link: http://lkml.kernel.org/r/20190425025812.91424-1-weizhenliang@huawei.com Fixes: cf43a757fd4944 ("signal: Restore the stop PTRACE_EVENT_EXIT") Signed-off-by: Zhenliang Wei Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Cc: Eric W. Biederman Cc: Ivan Delalande Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Deepa Dinamani Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d7b9d14ac80d..328a01e1a2f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2485,6 +2485,8 @@ relock: if (signal_group_exit(signal)) { ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); recalc_sigpending(); goto fatal; } -- cgit v1.2.3 From a61373476127edac8bcc5ee9d68a74dc1b864f53 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 May 2019 12:45:18 +0200 Subject: PM: sleep: Add kerneldoc comments to some functions Add kerneldoc comments to pm_suspend_via_firmware(), pm_resume_via_firmware() and pm_suspend_via_s2idle() to explain what they do. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..43d869db6c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -62,6 +62,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); +/** + * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. + */ bool pm_suspend_via_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; -- cgit v1.2.3 From ec527c318036a65a083ef68d8ba95789d2212246 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 30 May 2019 00:09:39 +0200 Subject: x86/power: Fix 'nosmt' vs hibernation triple fault during resume As explained in 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") we always, no matter what, have to bring up x86 HT siblings during boot at least once in order to avoid first MCE bringing the system to its knees. That means that whenever 'nosmt' is supplied on the kernel command-line, all the HT siblings are as a result sitting in mwait or cpudile after going through the online-offline cycle at least once. This causes a serious issue though when a kernel, which saw 'nosmt' on its commandline, is going to perform resume from hibernation: if the resume from the hibernated image is successful, cr3 is flipped in order to point to the address space of the kernel that is being resumed, which in turn means that all the HT siblings are all of a sudden mwaiting on address which is no longer valid. That results in triple fault shortly after cr3 is switched, and machine reboots. Fix this by always waking up all the SMT siblings before initiating the 'restore from hibernation' process; this guarantees that all the HT siblings will be properly carried over to the resumed kernel waiting in resume_play_dead(), and acted upon accordingly afterwards, based on the target kernel configuration. Symmetricaly, the resumed kernel has to push the SMT siblings to mwait again in case it has SMT disabled; this means it has to online all the siblings when resuming (so that they come out of hlt) and offline them again to let them reach mwait. Cc: 4.19+ # v4.19+ Debugged-by: Thomas Gleixner Fixes: 0cc3cd21657b ("cpu/hotplug: Boot HT siblings at least once") Signed-off-by: Jiri Kosina Acked-by: Pavel Machek Reviewed-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 4 ++-- kernel/power/hibernate.c | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f2ef10460698..077fde6fb953 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2061,7 +2061,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2093,7 +2093,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static int cpuhp_smt_enable(void) +int cpuhp_smt_enable(void) { int cpu, ret = 0; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..b65635753e8e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -257,6 +257,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, (kps % 1000) / 10); } +__weak int arch_resume_nosmt(void) +{ + return 0; +} + /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -324,6 +329,10 @@ static int create_image(int platform_mode) Enable_cpus: suspend_enable_secondary_cpus(); + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); + Platform_finish: platform_finish(platform_mode); -- cgit v1.2.3 From c732327f04a3818f35fa97d07b1d64d31b691d78 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Jun 2019 15:18:43 +0200 Subject: signal: improve comments Improve the comments for pidfd_send_signal(). First, the comment still referred to a file descriptor for a process as a "task file descriptor" which stems from way back at the beginning of the discussion. Replace this with "pidfd" for consistency. Second, the wording for the explanation of the arguments to the syscall was a bit inconsistent, e.g. some used the past tense some used present tense. Make the wording more consistent. Signed-off-by: Christian Brauner --- kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 328a01e1a2f0..d622eac9d169 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3621,12 +3621,11 @@ static struct pid *pidfd_to_pid(const struct file *file) } /** - * sys_pidfd_send_signal - send a signal to a process through a task file - * descriptor - * @pidfd: the file descriptor of the process - * @sig: signal to be sent - * @info: the signal info - * @flags: future flags to be passed + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd: file descriptor of the process + * @sig: signal to send + * @info: signal info + * @flags: future flags * * The syscall currently only signals via PIDTYPE_PID which covers * kill(, . It does not signal threads or process -- cgit v1.2.3 From 9c92ab61914157664a2fbdf926df0eb937838e45 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:17:56 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 282 Based on 1 normalized pattern(s): this software is licensed under the terms of the gnu general public license version 2 as published by the free software foundation and may be copied distributed and modified under those terms this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 285 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141900.642774971@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/cpu_pm.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 67b02e138a47..cbca6879ab7d 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Google, Inc. * * Author: * Colin Cross - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include -- cgit v1.2.3 From 5b497af42fab12cadc0e29bcb7052cf9963603f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 07:18:09 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 295 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of version 2 of the gnu general public license as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 64 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190529141901.894819585@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/arraymap.c | 10 +--------- kernel/bpf/devmap.c | 10 +--------- kernel/bpf/disasm.c | 10 +--------- kernel/bpf/disasm.h | 10 +--------- kernel/bpf/hashtab.c | 10 +--------- kernel/bpf/helpers.c | 10 +--------- kernel/bpf/syscall.c | 10 +--------- kernel/bpf/verifier.c | 10 +--------- 8 files changed, 8 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 584636c9e2eb..262a321f58a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..15dbc15c5b0c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ /* Devmaps primary use is as a backend map for XDP BPF helper call diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d9ce383c0f9c..b44d8c447afd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e1324a834a24..e546b18d27da 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __BPF_DISASM_H__ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0f2708fde5f7..583df5cb302d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4266ffde07ca..5e28718928ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..ef63d26622f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d15cc4fafa89 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include #include -- cgit v1.2.3 From 4505153954fdb1465d2b178288a9bf646f2a2166 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 May 2019 16:57:47 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 333 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation this program is distributed in the hope that it will be useful but without any warranty without even the implied warranty of merchantability or fitness for a particular purpose see the gnu general public license for more details you should have received a copy of the gnu general public license along with this program if not write to the free software foundation inc 59 temple place suite 330 boston ma 02111 1307 usa extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 136 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190530000436.384967451@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/dma/debug.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index badd77670d00..099002d84f46 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 Advanced Micro Devices, Inc. * * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "DMA-API: " fmt -- cgit v1.2.3 From ddc64d0ac97814fcc42ed90a2ea0c69658806c67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 31 May 2019 01:09:24 -0700 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 363 Based on 1 normalized pattern(s): released under terms in gpl version 2 see copying extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531081035.689962394@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cpumap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index cf727d77c6c6..8ebd0fa826f8 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. */ /* The 'cpumap' is primarily used as a backend map for XDP BPF helper -- cgit v1.2.3 From 55716d26439f5c4008b0bcb7f17d1f7c0d8fbcfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:42 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 428 Based on 1 normalized pattern(s): this file is released under the gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 68 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190114.292346262@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 3 +-- kernel/power/main.c | 4 +--- kernel/power/snapshot.c | 4 +--- kernel/power/suspend.c | 3 +-- kernel/power/suspend_test.c | 3 +-- kernel/power/swap.c | 4 +--- kernel/power/user.c | 4 +--- 7 files changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 97522630b1b6..8fc054e5c501 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * @@ -6,8 +7,6 @@ * Copyright (c) 2004 Pavel Machek * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/main.c b/kernel/power/main.c index 4f43e724f6eb..bdbd605c4215 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * */ #include diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bc9558ab1e5b..83105874f255 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/snapshot.c * @@ -5,9 +6,6 @@ * * Copyright (C) 1998-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..3c57e206f3e8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend.c - Suspend to RAM and standby functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 6a897e8b2a88..60564b58de07 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. * * Copyright (c) 2009 Pavel Machek - * - * This file is released under the GPLv2. */ #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7f6c1a288d3..e1912ad13bdc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * @@ -7,9 +8,6 @@ * Copyright (C) 1998,2001-2005 Pavel Machek * Copyright (C) 2006 Rafael J. Wysocki * Copyright (C) 2010-2012 Bojan Smojver - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt diff --git a/kernel/power/user.c b/kernel/power/user.c index cb24e840a3e6..77438954cc2b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - * */ #include -- cgit v1.2.3 From 767a67b0b35520348dc3b28dcba06454b0f9023d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:44 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 430 Based on 1 normalized pattern(s): distribute under gplv2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 8 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190114.475576622@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/softirq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c3382378d94..a6b81c6b6bff 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/softirq.c * * Copyright (C) 1992 Linus Torvalds * - * Distribute under GPLv2. - * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ -- cgit v1.2.3 From 3e45610181bcc2c68d343a62c1d028890160ef79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:50 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 436 Based on 1 normalized pattern(s): distributed under the terms of the gnu gpl version 2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 2 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Armijn Hemel Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.032570679@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/locking/semaphore.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 561acdd39960..d9dd94defc0a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Intel Corporation * Author: Matthew Wilcox * - * Distributed under the terms of the GNU GPL, version 2 - * * This file implements counting semaphores. * A counting semaphore may be acquired 'n' times before sleeping. * See mutex.c for single-acquisition sleeping locks which enforce -- cgit v1.2.3 From b886d83c5b621abc84ff9616f14c529be3f6b147 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 1 Jun 2019 10:08:55 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 441 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation version 2 of the license extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 315 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Allison Randal Reviewed-by: Armijn Hemel Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.503150771@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/async.c | 6 +----- kernel/backtracetest.c | 6 +----- kernel/latencytop.c | 6 +----- kernel/nsproxy.c | 6 +----- kernel/sched/cpudeadline.c | 6 +----- kernel/sched/cpupri.c | 6 +----- kernel/ucount.c | 7 +------ kernel/user_namespace.c | 7 +------ kernel/utsname.c | 6 +----- kernel/utsname_sysctl.c | 6 +----- 10 files changed, 10 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 12c332e4e13e..4f9c1d614016 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * async.c: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a563c8fdad0d..a2a97fa3071b 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple stack backtrace regression test module * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 871734ea2f04..e3acead004e6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * latencytop.c: Latency display infrastructure * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f6c5d330059a..c815f58e6bc0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2006 IBM Corporation * * Author: Serge Hallyn * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * * Jun 2006 - namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..ec4e4a9aab5f 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpudl.c * * Global CPU deadline management * * Author: Juri Lelli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..9c6480e6d62d 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpupri.c * @@ -20,11 +21,6 @@ * searches). For tasks with affinity restrictions, the algorithm has a * worst case complexity of O(min(102, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" diff --git a/kernel/ucount.c b/kernel/ucount.c index f48d1b6376a4..feb128c7b5d9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..0eff45ce7703 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include #include diff --git a/kernel/utsname.c b/kernel/utsname.c index dcd6be1996fe..f0e491193009 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004 IBM Corporation * * Author: Serge Hallyn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 258033d62cb3..3732c888a949 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include -- cgit v1.2.3 From cee0c33c546a93957a52ae9ab6bebadbee765ec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 5 Jun 2019 09:54:34 -0700 Subject: cgroup: css_task_iter_skip()'d iterators must be advanced before accessed b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") introduced css_task_iter_skip() which is used to fix task iterations skipping dying threadgroup leaders with live threads. Skipping is implemented as a subportion of full advancing but css_task_iter_next() forgot to fully advance a skipped iterator before determining the next task to visit causing it to return invalid task pointers. Fix it by making css_task_iter_next() fully advance the iterator if it has been skipped since the previous iteration. Signed-off-by: Tejun Heo Reported-by: syzbot Link: http://lkml.kernel.org/r/00000000000097025d058a7fd785@google.com Fixes: b636fd38dc40 ("cgroup: Implement css_task_iter_skip()") --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a7df319c2e9a..9538a12d42d6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4550,6 +4550,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); + /* @it may be half-advanced by skips, finish advancing */ + if (it->flags & CSS_TASK_ITER_SKIPPED) + css_task_iter_advance(it); + if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); -- cgit v1.2.3 From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Jun 2019 01:48:57 +0200 Subject: bpf: fix unconnected udp hooks Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently to applications as also stated in original motivation in 7828f20e3779 ("Merge branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter two hooks into Cilium to enable host based load-balancing with Kubernetes, I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes typically sets up DNS as a service and is thus subject to load-balancing. Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API is currently insufficient and thus not usable as-is for standard applications shipped with most distros. To break down the issue we ran into with a simple example: # cat /etc/resolv.conf nameserver 147.75.207.207 nameserver 147.75.207.208 For the purpose of a simple test, we set up above IPs as service IPs and transparently redirect traffic to a different DNS backend server for that node: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 The attached BPF program is basically selecting one of the backends if the service IP/port matches on the cgroup hook. DNS breaks here, because the hooks are not transparent enough to applications which have built-in msg_name address checks: # nslookup 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ;; connection timed out; no servers could be reached # dig 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; connection timed out; no servers could be reached For comparison, if none of the service IPs is used, and we tell nslookup to use 8.8.8.8 directly it works just fine, of course: # nslookup 1.1.1.1 8.8.8.8 1.1.1.1.in-addr.arpa name = one.one.one.one. In order to fix this and thus act more transparent to the application, this needs reverse translation on recvmsg() side. A minimal fix for this API is to add similar recvmsg() hooks behind the BPF cgroups static key such that the program can track state and replace the current sockaddr_in{,6} with the original service IP. From BPF side, this basically tracks the service tuple plus socket cookie in an LRU map where the reverse NAT can then be retrieved via map value as one example. Side-note: the BPF cgroups static key should be converted to a per-hook static key in future. Same example after this fix: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 Lookups work fine now: # nslookup 1.1.1.1 1.1.1.1.in-addr.arpa name = one.one.one.one. Authoritative answers can be found from: # dig 1.1.1.1 ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550 ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 ;; OPT PSEUDOSECTION: ; EDNS: version: 0, flags:; udp: 512 ;; QUESTION SECTION: ;1.1.1.1. IN A ;; AUTHORITY SECTION: . 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400 ;; Query time: 17 msec ;; SERVER: 147.75.207.207#53(147.75.207.207) ;; WHEN: Tue May 21 12:59:38 UTC 2019 ;; MSG SIZE rcvd: 111 And from an actual packet level it shows that we're using the back end server when talking via 147.75.207.20{7,8} front end: # tcpdump -i any udp [...] 12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) [...] In order to be flexible and to have same semantics as in sendmsg BPF programs, we only allow return codes in [1,1] range. In the sendmsg case the program is called if msg->msg_name is present which can be the case in both, connected and unconnected UDP. The former only relies on the sockaddr_in{,6} passed via connect(2) if passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar way to call into the BPF program whenever a non-NULL msg->msg_name was passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note that for TCP case, the msg->msg_name is ignored in the regular recvmsg path and therefore not relevant. For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE, the hook is not called. This is intentional as it aligns with the same semantics as in case of TCP cgroup BPF hooks right now. This might be better addressed in future through a different bpf_attach_type such that this case can be distinguished from the regular recvmsg paths, for example. Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Acked-by: Martynas Pumputis Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..e8ba3a153691 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1581,6 +1581,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: return 0; default: return -EINVAL; @@ -1875,6 +1877,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1960,6 +1964,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -2011,6 +2017,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d2c8a6677ac4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5361,9 +5361,12 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: @@ -5380,16 +5383,17 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } return 0; -- cgit v1.2.3 From 54b7b868e826b294687c439b68ec55fe20cafe5b Mon Sep 17 00:00:00 2001 From: Angelo Ruocco Date: Tue, 21 May 2019 10:01:54 +0200 Subject: cgroup: let a symlink too be created with a cftype file This commit enables a cftype to have a symlink (of any name) that points to the file associated with the cftype. Signed-off-by: Angelo Ruocco Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 426a0026225c..155048b0eca2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) +static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf, bool write_link_name) { struct cgroup_subsys *ss = cft->ss; @@ -1471,13 +1471,26 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - cft->name); + write_link_name ? cft->link_name : cft->name); } else { - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, write_link_name ? cft->link_name : cft->name, + CGROUP_FILE_NAME_MAX); } return buf; } +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, false); +} + +static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, true); +} + /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1636,6 +1649,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); + if (cft->flags & CFTYPE_SYMLINKED) + kernfs_remove_by_name(cgrp->kn, + cgroup_link_name(cgrp, cft, name)); } /** @@ -3821,6 +3837,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; + struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3851,6 +3868,14 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } + if (cft->flags & CFTYPE_SYMLINKED) { + kn_link = kernfs_create_link(cgrp->kn, + cgroup_link_name(cgrp, cft, name), + kn); + if (IS_ERR(kn_link)) + return PTR_ERR(kn_link); + } + return 0; } -- cgit v1.2.3 From cf8929885de318c0bf73438c9e5dde59d6536f7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jun 2019 03:35:41 -0600 Subject: cgroup/bfq: revert bfq.weight symlink change There's some discussion on how to do this the best, and Tejun prefers that BFQ just create the file itself instead of having cgroups support a symlink feature. Hence revert commit 54b7b868e826 and 19e9da9e86c4 for 5.2, and this can be done properly for 5.3. Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 155048b0eca2..426a0026225c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf, bool write_link_name) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { struct cgroup_subsys *ss = cft->ss; @@ -1471,26 +1471,13 @@ static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - write_link_name ? cft->link_name : cft->name); + cft->name); } else { - strscpy(buf, write_link_name ? cft->link_name : cft->name, - CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); } return buf; } -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - return cgroup_fill_name(cgrp, cft, buf, false); -} - -static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) -{ - return cgroup_fill_name(cgrp, cft, buf, true); -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1649,9 +1636,6 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); - if (cft->flags & CFTYPE_SYMLINKED) - kernfs_remove_by_name(cgrp->kn, - cgroup_link_name(cgrp, cft, name)); } /** @@ -3837,7 +3821,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; - struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3868,14 +3851,6 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } - if (cft->flags & CFTYPE_SYMLINKED) { - kn_link = kernfs_create_link(cgrp->kn, - cgroup_link_name(cgrp, cft, name), - kn); - if (IS_ERR(kn_link)) - return PTR_ERR(kn_link); - } - return 0; } -- cgit v1.2.3 From c596687a008b579c503afb7a64fcacc7270fae9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Jun 2019 09:08:27 -0700 Subject: cgroup: Fix css_task_iter_advance_css_set() cset skip condition While adding handling for dying task group leaders c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") added an inverted cset skip condition to css_task_iter_advance_css_set(). It should skip cset if it's completely empty but was incorrectly testing for the inverse condition for the dying_tasks list. Fix it. Signed-off-by: Tejun Heo Fixes: c03cd7738a83 ("cgroup: Include dying leaders with live threads in PROCS iterations") Reported-by: syzbot+d4bba5ccd4f9a2a68681@syzkaller.appspotmail.com --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9538a12d42d6..6420ff87d72c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4401,7 +4401,7 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset) && !list_empty(&cset->dying_tasks)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; -- cgit v1.2.3 From da2577fdd0932ea4eefe73903f1130ee366767d2 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Sat, 8 Jun 2019 12:54:19 -0700 Subject: bpf: lpm_trie: check left child of last leftmost node for NULL If the leftmost parent node of the tree has does not have a child on the left side, then trie_get_next_key (and bpftool map dump) will not look at the child on the right. This leads to the traversal missing elements. Lookup is not affected. Update selftest to handle this case. Reproducer: bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \ value 1 entries 256 name test_lpm flags 1 bpftool map update pinned /sys/fs/bpf/lpm key 8 0 0 0 0 0 value 1 bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0 0 128 value 2 bpftool map dump pinned /sys/fs/bpf/lpm Returns only 1 element. (2 expected) Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE") Signed-off-by: Jonathan Lemon Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..864e2a496376 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -716,9 +716,14 @@ find_leftmost: * have exact two children, so this function will never return NULL. */ for (node = search_root; node;) { - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + if (node->flags & LPM_TREE_NODE_FLAG_IM) { + node = rcu_dereference(node->child[0]); + } else { next_node = node; - node = rcu_dereference(node->child[0]); + node = rcu_dereference(node->child[0]); + if (!node) + node = rcu_dereference(next_node->child[1]); + } } do_copy: next_key->prefixlen = next_node->prefixlen; -- cgit v1.2.3 From f6581f5b55141a95657ef5742cf6a6bfa20a109f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 May 2019 13:31:57 +0200 Subject: ptrace: restore smp_rmb() in __ptrace_may_access() Restore the read memory barrier in __ptrace_may_access() that was deleted a couple years ago. Also add comments on this barrier and the one it pairs with to explain why they're there (as far as I understand). Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks") Cc: stable@vger.kernel.org Acked-by: Kees Cook Acked-by: Oleg Nesterov Signed-off-by: Jann Horn Signed-off-by: Eric W. Biederman --- kernel/cred.c | 9 +++++++++ kernel/ptrace.c | 10 ++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..07e069d00696 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -450,6 +450,15 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + /* + * If a task drops privileges and becomes nondumpable, + * the dumpability change must become visible before + * the credential change; otherwise, a __ptrace_may_access() + * racing with this change may be able to attach to a task it + * shouldn't be able to attach to (as if the task had dropped + * privileges without becoming nondumpable). + * Pairs with a read barrier in __ptrace_may_access(). + */ smp_wmb(); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 02c6528ead5c..c9b4646ad375 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -323,6 +323,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); + /* + * If a task drops privileges and becomes nondumpable (through a syscall + * like setresuid()) while we are trying to access it, we must ensure + * that the dumpability is read after the credentials; otherwise, + * we may be able to attach to a task that we shouldn't be able to + * attach to (as if the task had dropped privileges without becoming + * nondumpable). + * Pairs with a write barrier in commit_creds(). + */ + smp_rmb(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && -- cgit v1.2.3 From d477f8c202d1f0d4791ab1263ca7657bbe5cf79e Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Wed, 12 Jun 2019 11:50:48 -0400 Subject: cpuset: restore sanity to cpuset_cpus_allowed_fallback() In the case that a process is constrained by taskset(1) (i.e. sched_setaffinity(2)) to a subset of available cpus, and all of those are subsequently offlined, the scheduler will set tsk->cpus_allowed to the current value of task_cs(tsk)->effective_cpus. This is done via a call to do_set_cpus_allowed() in the context of cpuset_cpus_allowed_fallback() made by the scheduler when this case is detected. This is the only call made to cpuset_cpus_allowed_fallback() in the latest mainline kernel. However, this is not sane behavior. I will demonstrate this on a system running the latest upstream kernel with the following initial configuration: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,fffffff Cpus_allowed_list: 0-63 (Where cpus 32-63 are provided via smt.) If we limit our current shell process to cpu2 only and then offline it and reonline it: # taskset -p 4 $$ pid 2272's current affinity mask: ffffffffffffffff pid 2272's new affinity mask: 4 # echo off > /sys/devices/system/cpu/cpu2/online # dmesg | tail -3 [ 2195.866089] process 2272 (bash) no longer affine to cpu2 [ 2195.872700] IRQ 114: no longer affine to CPU2 [ 2195.879128] smpboot: CPU 2 is now offline # echo on > /sys/devices/system/cpu/cpu2/online # dmesg | tail -1 [ 2617.043572] smpboot: Booting Node 0 Processor 2 APIC 0x4 We see that our current process now has an affinity mask containing every cpu available on the system _except_ the one we originally constrained it to: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,fffffffb Cpus_allowed_list: 0-1,3-63 This is not sane behavior, as the scheduler can now not only place the process on previously forbidden cpus, it can't even schedule it on the cpu it was originally constrained to! Other cases result in even more exotic affinity masks. Take for instance a process with an affinity mask containing only cpus provided by smt at the moment that smt is toggled, in a configuration such as the following: # taskset -p f000000000 $$ # grep -i cpu /proc/$$/status Cpus_allowed: 000000f0,00000000 Cpus_allowed_list: 36-39 A double toggle of smt results in the following behavior: # echo off > /sys/devices/system/cpu/smt/control # echo on > /sys/devices/system/cpu/smt/control # grep -i cpus /proc/$$/status Cpus_allowed: ffffff00,ffffffff Cpus_allowed_list: 0-31,40-63 This is even less sane than the previous case, as the new affinity mask excludes all smt-provided cpus with ids less than those that were previously in the affinity mask, as well as those that were actually in the mask. With this patch applied, both of these cases end in the following state: # grep -i cpu /proc/$$/status Cpus_allowed: ffffffff,ffffffff Cpus_allowed_list: 0-63 The original policy is discarded. Though not ideal, it is the simplest way to restore sanity to this fallback case without reinventing the cpuset wheel that rolls down the kernel just fine in cgroup v2. A user who wishes for the previous affinity mask to be restored in this fallback case can use that mechanism instead. This patch modifies scheduler behavior by instead resetting the mask to task_cs(tsk)->cpus_allowed by default, and cpu_possible mask in legacy mode. I tested the cases above on both modes. Note that the scheduler uses this fallback mechanism if and only if _every_ other valid avenue has been traveled, and it is the last resort before calling BUG(). Suggested-by: Waiman Long Suggested-by: Phil Auld Signed-off-by: Joel Savitz Acked-by: Phil Auld Acked-by: Waiman Long Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..515525ff1cfd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + **/ + void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + do_set_cpus_allowed(tsk, is_in_v2_mode() ? + task_cs(tsk)->cpus_allowed : cpu_possible_mask); rcu_read_unlock(); /* -- cgit v1.2.3 From 2e3f139e8ecebf177fe01299285a56855e93fb84 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:21 -0700 Subject: mm/devm_memremap_pages: introduce devm_memunmap_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new devm_release_action() facility to allow devm_memremap_pages_release() to be manually triggered. Link: http://lkml.kernel.org/r/155727337088.292046.5774214552136776763.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Jérôme Glisse" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 1490e63f69a9..715b434bd316 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -271,6 +271,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) } EXPORT_SYMBOL_GPL(devm_memremap_pages); +void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) +{ + devm_release_action(dev, devm_memremap_pages_release, pgmap); +} +EXPORT_SYMBOL_GPL(devm_memunmap_pages); + unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ -- cgit v1.2.3 From 50f44ee7248ad2f7984ef081974a6ecd09724b3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:33 -0700 Subject: mm/devm_memremap_pages: fix final page put race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Logan noticed that devm_memremap_pages_release() kills the percpu_ref drops all the page references that were acquired at init and then immediately proceeds to unplug, arch_remove_memory(), the backing pages for the pagemap. If for some reason device shutdown actually collides with a busy / elevated-ref-count page then arch_remove_memory() should be deferred until after that reference is dropped. As it stands the "wait for last page ref drop" happens *after* devm_memremap_pages_release() returns, which is obviously too late and can lead to crashes. Fix this situation by assigning the responsibility to wait for the percpu_ref to go idle to devm_memremap_pages() with a new ->cleanup() callback. Implement the new cleanup callback for all devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma. Link: http://lkml.kernel.org/r/155727339156.292046.5432007428235387859.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Dan Williams Reported-by: Logan Gunthorpe Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: "Jérôme Glisse" Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 715b434bd316..6e1970719dc2 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data) pgmap->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); + pgmap->cleanup(pgmap->ref); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data) * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true * - * 3/ pgmap->ref must be 'live' on entry and will be killed at - * devm_memremap_pages_release() time, or if this routine fails. + * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped + * at devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - if (!pgmap->ref || !pgmap->kill) + if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); return ERR_PTR(-EINVAL); + } align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } is_ram = region_intersects(align_start, align_size, @@ -267,6 +272,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap_array_delete(res); err_array: pgmap->kill(pgmap->ref); + pgmap->cleanup(pgmap->ref); + return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); -- cgit v1.2.3 From e3ff9c3678b4d80e22d2557b68726174578eaf52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Jun 2019 21:40:45 +0200 Subject: timekeeping: Repair ktime_get_coarse*() granularity Jason reported that the coarse ktime based time getters advance only once per second and not once per tick as advertised. The code reads only the monotonic base time, which advances once per second. The nanoseconds are accumulated on every tick in xtime_nsec up to a second and the regular time getters take this nanoseconds offset into account, but the ktime_get_coarse*() implementation fails to do so. Add the accumulated xtime_nsec value to the monotonic base time to get the proper per tick advancing coarse tinme. Fixes: b9ff604cff11 ("timekeeping: Add ktime_get_coarse_with_offset") Reported-by: Jason A. Donenfeld Signed-off-by: Thomas Gleixner Tested-by: Jason A. Donenfeld Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Clemens Ladisch Cc: Sultan Alsawaf Cc: Waiman Long Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1906132136280.1791@nanos.tec.linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 85f5912d8f70..44b726bab4bd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; + u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); - return base; - + return base + nsecs; } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); -- cgit v1.2.3 From becf33f694dc50656766e0fde8883437d5c8d4b4 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Mon, 10 Jun 2019 13:00:16 +0900 Subject: tracing: Fix out-of-range read in trace_stack_print() Puts range check before dereferencing the pointer. Reproducer: # echo stacktrace > trace_options # echo 1 > events/enable # cat trace > /dev/null KASAN report: ================================================================== BUG: KASAN: use-after-free in trace_stack_print+0x26b/0x2c0 Read of size 8 at addr ffff888069d20000 by task cat/1953 CPU: 0 PID: 1953 Comm: cat Not tainted 5.2.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 Call Trace: dump_stack+0x8a/0xce print_address_description+0x60/0x224 ? trace_stack_print+0x26b/0x2c0 ? trace_stack_print+0x26b/0x2c0 __kasan_report.cold+0x1a/0x3e ? trace_stack_print+0x26b/0x2c0 kasan_report+0xe/0x20 trace_stack_print+0x26b/0x2c0 print_trace_line+0x6ea/0x14d0 ? tracing_buffers_read+0x700/0x700 ? trace_find_next_entry_inc+0x158/0x1d0 s_show+0xea/0x310 seq_read+0xaa7/0x10e0 ? seq_escape+0x230/0x230 __vfs_read+0x7c/0x100 vfs_read+0x16c/0x3a0 ksys_read+0x121/0x240 ? kernel_write+0x110/0x110 ? perf_trace_sys_enter+0x8a0/0x8a0 ? syscall_slow_exit_work+0xa9/0x410 do_syscall_64+0xb7/0x390 ? prepare_exit_to_usermode+0x165/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f867681f910 Code: b6 fe ff ff 48 8d 3d 0f be 08 00 48 83 ec 08 e8 06 db 01 00 66 0f 1f 44 00 00 83 3d f9 2d 2c 00 00 75 10 b8 00 00 00 00 04 RSP: 002b:00007ffdabf23488 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f867681f910 RDX: 0000000000020000 RSI: 00007f8676cde000 RDI: 0000000000000003 RBP: 00007f8676cde000 R08: ffffffffffffffff R09: 0000000000000000 R10: 0000000000000871 R11: 0000000000000246 R12: 00007f8676cde000 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000000ec0 Allocated by task 1214: save_stack+0x1b/0x80 __kasan_kmalloc.constprop.0+0xc2/0xd0 kmem_cache_alloc+0xaf/0x1a0 getname_flags+0xd2/0x5b0 do_sys_open+0x277/0x5a0 do_syscall_64+0xb7/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 1214: save_stack+0x1b/0x80 __kasan_slab_free+0x12c/0x170 kmem_cache_free+0x8a/0x1c0 putname+0xe1/0x120 do_sys_open+0x2c5/0x5a0 do_syscall_64+0xb7/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888069d20000 which belongs to the cache names_cache of size 4096 The buggy address is located 0 bytes inside of 4096-byte region [ffff888069d20000, ffff888069d21000) The buggy address belongs to the page: page:ffffea0001a74800 refcount:1 mapcount:0 mapping:ffff88806ccd1380 index:0x0 compound_mapcount: 0 flags: 0x100000000010200(slab|head) raw: 0100000000010200 dead000000000100 dead000000000200 ffff88806ccd1380 raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888069d1ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888069d1ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888069d20000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888069d20080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888069d20100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Link: http://lkml.kernel.org/r/20190610040016.5598-1-devel@etsukata.com Fixes: 4285f2fcef80 ("tracing: Remove the ULONG_MAX stack trace hackery") Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 54373d93e251..ba751f993c3b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "\n"); - for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { + for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) { if (trace_seq_has_overflowed(s)) break; -- cgit v1.2.3 From cbdaeaf050b730ea02e9ab4ff844ce54d85dbe1d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 5 Jun 2019 13:11:58 +0200 Subject: tracing: avoid build warning with HAVE_NOP_MCOUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Selecting HAVE_NOP_MCOUNT enables -mnop-mcount (if gcc supports it) and sets CC_USING_NOP_MCOUNT. Reuse __is_defined (which is suitable for testing CC_USING_* defines) to avoid conditional compilation and fix the following gcc 9 warning on s390: kernel/trace/ftrace.c:2514:1: warning: ‘ftrace_code_disable’ defined but not used [-Wunused-function] Link: http://lkml.kernel.org/r/patch.git-1a82d13f33ac.your-ad-here.call-01559732716-ext-6629@work.hours Fixes: 2f4df0017baed ("tracing: Add -mcount-nop option support") Signed-off-by: Vasily Gorbik Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..e77a6c92620f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2935,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; -#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. */ - if (!ftrace_code_disable(mod, p)) + if (!__is_defined(CC_USING_NOP_MCOUNT) && + !ftrace_code_disable(mod, p)) break; -#endif update_cnt++; } -- cgit v1.2.3 From ff585c5b9a27e64084c84e2ddf24fd00bf8dcfc1 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 14 Jun 2019 23:32:10 +0800 Subject: tracing: Make two symbols static Fix sparse warnings: kernel/trace/trace.c:6927:24: warning: symbol 'get_tracing_log_err' was not declared. Should it be static? kernel/trace/trace.c:8196:15: warning: symbol 'trace_instance_dir' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20190614153210.24424-1-yuehaibing@huawei.com Acked-by: Tom Zanussi Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1c80521fd436..83e08b78dbee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6923,7 +6923,7 @@ struct tracing_log_err { static DEFINE_MUTEX(tracing_err_log_lock); -struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) { struct tracing_log_err *err; @@ -8192,7 +8192,7 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; -struct dentry *trace_instance_dir; +static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); -- cgit v1.2.3 From f01098c74b5219f3969d4750eeed1a36bfc038e3 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 14 Jun 2019 16:40:25 +0900 Subject: tracing/uprobe: Fix NULL pointer dereference in trace_uprobe_create() Just like the case of commit 8b05a3a7503c ("tracing/kprobes: Fix NULL pointer dereference in trace_kprobe_create()"), writing an incorrectly formatted string to uprobe_events can trigger NULL pointer dereference. Reporeducer: # echo r > /sys/kernel/debug/tracing/uprobe_events dmesg: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000079d12067 P4D 8000000079d12067 PUD 7b7ab067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1903 Comm: bash Not tainted 5.2.0-rc3+ #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 RIP: 0010:strchr+0x0/0x30 Code: c0 eb 0d 84 c9 74 18 48 83 c0 01 48 39 d0 74 0f 0f b6 0c 07 3a 0c 06 74 ea 19 c0 83 c8 01 c3 31 c0 c3 0f 1f 84 00 00 00 00 00 <0f> b6 07 89 f2 40 38 f0 75 0e eb 13 0f b6 47 01 48 83 c RSP: 0018:ffffb55fc0403d10 EFLAGS: 00010293 RAX: ffff993ffb793400 RBX: 0000000000000000 RCX: ffffffffa4852625 RDX: 0000000000000000 RSI: 000000000000002f RDI: 0000000000000000 RBP: ffffb55fc0403dd0 R08: ffff993ffb793400 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff993ff9cc1668 R14: 0000000000000001 R15: 0000000000000000 FS: 00007f30c5147700(0000) GS:ffff993ffda00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b628000 CR4: 00000000000006f0 Call Trace: trace_uprobe_create+0xe6/0xb10 ? __kmalloc_track_caller+0xe6/0x1c0 ? __kmalloc+0xf0/0x1d0 ? trace_uprobe_create+0xb10/0xb10 create_or_delete_trace_uprobe+0x35/0x90 ? trace_uprobe_create+0xb10/0xb10 trace_run_command+0x9c/0xb0 trace_parse_run_command+0xf9/0x1eb ? probes_open+0x80/0x80 __vfs_write+0x43/0x90 vfs_write+0x14a/0x2a0 ksys_write+0xa2/0x170 do_syscall_64+0x7f/0x200 entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: http://lkml.kernel.org/r/20190614074026.8045-1-devel@etsukata.com Cc: stable@vger.kernel.org Fixes: 0597c49c69d5 ("tracing/uprobes: Use dyn_event framework for uprobe events") Reviewed-by: Srikar Dronamraju Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index eb7e06b54741..a88c692e3b8a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -443,10 +443,17 @@ static int trace_uprobe_create(int argc, const char **argv) ret = 0; ref_ctr_offset = 0; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') + switch (argv[0][0]) { + case 'r': is_return = true; - else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + + if (argc < 2) return -ECANCELED; if (argv[0][1] == ':') -- cgit v1.2.3 From a4158345ec5acb44cc0a9ef4381e0784c1bc7722 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Fri, 14 Jun 2019 16:40:26 +0900 Subject: tracing/uprobe: Fix obsolete comment on trace_uprobe_create() Commit 0597c49c69d5 ("tracing/uprobes: Use dyn_event framework for uprobe events") cleaned up the usage of trace_uprobe_create(), and the function has been no longer used for removing uprobe/uretprobe. Link: http://lkml.kernel.org/r/20190614074026.8045-2-devel@etsukata.com Reviewed-by: Srikar Dronamraju Signed-off-by: Eiichi Tsukata Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a88c692e3b8a..b55906c77ce0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -426,8 +426,6 @@ end: /* * Argument syntax: * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] - * - * - Remove uprobe: -:[GRP/]EVENT */ static int trace_uprobe_create(int argc, const char **argv) { -- cgit v1.2.3 From 9f255b632bf12c4dd7fc31caee89aa991ef75176 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 13 Jun 2019 20:07:22 -0500 Subject: module: Fix livepatch/ftrace module text permissions race It's possible for livepatch and ftrace to be toggling a module's text permissions at the same time, resulting in the following panic: BUG: unable to handle page fault for address: ffffffffc005b1d9 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 3ea0c067 P4D 3ea0c067 PUD 3ea0e067 PMD 3cc13067 PTE 3b8a1061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 1 PID: 453 Comm: insmod Tainted: G O K 5.2.0-rc1-a188339ca5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-20181126_142135-anatol 04/01/2014 RIP: 0010:apply_relocate_add+0xbe/0x14c Code: fa 0b 74 21 48 83 fa 18 74 38 48 83 fa 0a 75 40 eb 08 48 83 38 00 74 33 eb 53 83 38 00 75 4e 89 08 89 c8 eb 0a 83 38 00 75 43 <89> 08 48 63 c1 48 39 c8 74 2e eb 48 83 38 00 75 32 48 29 c1 89 08 RSP: 0018:ffffb223c00dbb10 EFLAGS: 00010246 RAX: ffffffffc005b1d9 RBX: 0000000000000000 RCX: ffffffff8b200060 RDX: 000000000000000b RSI: 0000004b0000000b RDI: ffff96bdfcd33000 RBP: ffffb223c00dbb38 R08: ffffffffc005d040 R09: ffffffffc005c1f0 R10: ffff96bdfcd33c40 R11: ffff96bdfcd33b80 R12: 0000000000000018 R13: ffffffffc005c1f0 R14: ffffffffc005e708 R15: ffffffff8b2fbc74 FS: 00007f5f447beba8(0000) GS:ffff96bdff900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffc005b1d9 CR3: 000000003cedc002 CR4: 0000000000360ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: klp_init_object_loaded+0x10f/0x219 ? preempt_latency_start+0x21/0x57 klp_enable_patch+0x662/0x809 ? virt_to_head_page+0x3a/0x3c ? kfree+0x8c/0x126 patch_init+0x2ed/0x1000 [livepatch_test02] ? 0xffffffffc0060000 do_one_initcall+0x9f/0x1c5 ? kmem_cache_alloc_trace+0xc4/0xd4 ? do_init_module+0x27/0x210 do_init_module+0x5f/0x210 load_module+0x1c41/0x2290 ? fsnotify_path+0x3b/0x42 ? strstarts+0x2b/0x2b ? kernel_read+0x58/0x65 __do_sys_finit_module+0x9f/0xc3 ? __do_sys_finit_module+0x9f/0xc3 __x64_sys_finit_module+0x1a/0x1c do_syscall_64+0x52/0x61 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The above panic occurs when loading two modules at the same time with ftrace enabled, where at least one of the modules is a livepatch module: CPU0 CPU1 klp_enable_patch() klp_init_object_loaded() module_disable_ro() ftrace_module_enable() ftrace_arch_code_modify_post_process() set_all_modules_text_ro() klp_write_object_relocations() apply_relocate_add() *patches read-only code* - BOOM A similar race exists when toggling ftrace while loading a livepatch module. Fix it by ensuring that the livepatch and ftrace code patching operations -- and their respective permissions changes -- are protected by the text_mutex. Link: http://lkml.kernel.org/r/ab43d56ab909469ac5d2520c5d944ad6d4abd476.1560474114.git.jpoimboe@redhat.com Reported-by: Johannes Erdfelt Fixes: 444d13ff10fb ("modules: add ro_after_init support") Acked-by: Jessica Yu Reviewed-by: Petr Mladek Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/core.c | 6 ++++++ kernel/trace/ftrace.c | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 91cd519756d3..2d17e6e364b5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "core.h" #include "patch.h" @@ -730,16 +731,21 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); if (ret) { module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); return ret; } arch_klp_init_object_loaded(patch, obj); module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e77a6c92620f..a89700590485 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -2610,10 +2611,12 @@ static void ftrace_run_update_code(int command) { int ret; + mutex_lock(&text_mutex); + ret = ftrace_arch_code_modify_prepare(); FTRACE_WARN_ON(ret); if (ret) - return; + goto out_unlock; /* * By default we use stop_machine() to modify the code. @@ -2625,6 +2628,9 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); + +out_unlock: + mutex_unlock(&text_mutex); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -5775,6 +5781,7 @@ void ftrace_module_enable(struct module *mod) struct ftrace_page *pg; mutex_lock(&ftrace_lock); + mutex_lock(&text_mutex); if (ftrace_disabled) goto out_unlock; @@ -5836,6 +5843,7 @@ void ftrace_module_enable(struct module *mod) ftrace_arch_code_modify_post_process(); out_unlock: + mutex_unlock(&text_mutex); mutex_unlock(&ftrace_lock); process_cached_mods(mod->name); -- cgit v1.2.3 From 04e03d9a616c19a47178eaca835358610e63a1dd Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 6 Jun 2019 11:17:54 +0800 Subject: ftrace: Fix NULL pointer dereference in free_ftrace_func_mapper() The mapper may be NULL when called from register_ftrace_function_probe() with probe->data == NULL. This issue can be reproduced as follow (it may be covered by compiler optimization sometime): / # cat /sys/kernel/debug/tracing/set_ftrace_filter #### all functions enabled #### / # echo foo_bar:dump > /sys/kernel/debug/tracing/set_ftrace_filter [ 206.949100] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 206.952402] Mem abort info: [ 206.952819] ESR = 0x96000006 [ 206.955326] Exception class = DABT (current EL), IL = 32 bits [ 206.955844] SET = 0, FnV = 0 [ 206.956272] EA = 0, S1PTW = 0 [ 206.956652] Data abort info: [ 206.957320] ISV = 0, ISS = 0x00000006 [ 206.959271] CM = 0, WnR = 0 [ 206.959938] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000419f3a000 [ 206.960483] [0000000000000000] pgd=0000000411a87003, pud=0000000411a83003, pmd=0000000000000000 [ 206.964953] Internal error: Oops: 96000006 [#1] SMP [ 206.971122] Dumping ftrace buffer: [ 206.973677] (ftrace buffer empty) [ 206.975258] Modules linked in: [ 206.976631] Process sh (pid: 281, stack limit = 0x(____ptrval____)) [ 206.978449] CPU: 10 PID: 281 Comm: sh Not tainted 5.2.0-rc1+ #17 [ 206.978955] Hardware name: linux,dummy-virt (DT) [ 206.979883] pstate: 60000005 (nZCv daif -PAN -UAO) [ 206.980499] pc : free_ftrace_func_mapper+0x2c/0x118 [ 206.980874] lr : ftrace_count_free+0x68/0x80 [ 206.982539] sp : ffff0000182f3ab0 [ 206.983102] x29: ffff0000182f3ab0 x28: ffff8003d0ec1700 [ 206.983632] x27: ffff000013054b40 x26: 0000000000000001 [ 206.984000] x25: ffff00001385f000 x24: 0000000000000000 [ 206.984394] x23: ffff000013453000 x22: ffff000013054000 [ 206.984775] x21: 0000000000000000 x20: ffff00001385fe28 [ 206.986575] x19: ffff000013872c30 x18: 0000000000000000 [ 206.987111] x17: 0000000000000000 x16: 0000000000000000 [ 206.987491] x15: ffffffffffffffb0 x14: 0000000000000000 [ 206.987850] x13: 000000000017430e x12: 0000000000000580 [ 206.988251] x11: 0000000000000000 x10: cccccccccccccccc [ 206.988740] x9 : 0000000000000000 x8 : ffff000013917550 [ 206.990198] x7 : ffff000012fac2e8 x6 : ffff000012fac000 [ 206.991008] x5 : ffff0000103da588 x4 : 0000000000000001 [ 206.991395] x3 : 0000000000000001 x2 : ffff000013872a28 [ 206.991771] x1 : 0000000000000000 x0 : 0000000000000000 [ 206.992557] Call trace: [ 206.993101] free_ftrace_func_mapper+0x2c/0x118 [ 206.994827] ftrace_count_free+0x68/0x80 [ 206.995238] release_probe+0xfc/0x1d0 [ 206.995555] register_ftrace_function_probe+0x4a8/0x868 [ 206.995923] ftrace_trace_probe_callback.isra.4+0xb8/0x180 [ 206.996330] ftrace_dump_callback+0x50/0x70 [ 206.996663] ftrace_regex_write.isra.29+0x290/0x3a8 [ 206.997157] ftrace_filter_write+0x44/0x60 [ 206.998971] __vfs_write+0x64/0xf0 [ 206.999285] vfs_write+0x14c/0x2f0 [ 206.999591] ksys_write+0xbc/0x1b0 [ 206.999888] __arm64_sys_write+0x3c/0x58 [ 207.000246] el0_svc_common.constprop.0+0x408/0x5f0 [ 207.000607] el0_svc_handler+0x144/0x1c8 [ 207.000916] el0_svc+0x8/0xc [ 207.003699] Code: aa0003f8 a9025bf5 aa0103f5 f946ea80 (f9400303) [ 207.008388] ---[ end trace 7b6d11b5f542bdf1 ]--- [ 207.010126] Kernel panic - not syncing: Fatal exception [ 207.011322] SMP: stopping secondary CPUs [ 207.013956] Dumping ftrace buffer: [ 207.014595] (ftrace buffer empty) [ 207.015632] Kernel Offset: disabled [ 207.017187] CPU features: 0x002,20006008 [ 207.017985] Memory Limit: none [ 207.019825] ---[ end Kernel panic - not syncing: Fatal exception ]--- Link: http://lkml.kernel.org/r/20190606031754.10798-1-liwei391@huawei.com Signed-off-by: Wei Li Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a89700590485..38277af44f5c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4226,10 +4226,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, struct ftrace_func_entry *entry; struct ftrace_func_map *map; struct hlist_head *hhd; - int size = 1 << mapper->hash.size_bits; - int i; + int size, i; + + if (!mapper) + return; if (free_func && mapper->hash.count) { + size = 1 << mapper->hash.size_bits; for (i = 0; i < size; i++) { hhd = &mapper->hash.buckets[i]; hlist_for_each_entry(entry, hhd, hlist) { -- cgit v1.2.3 From d4dd153d551634683fccf8881f606fa9f3dfa1ef Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:13 +0900 Subject: bpf, devmap: Fix premature entry free on destroying map dev_map_free() waits for flush_needed bitmap to be empty in order to ensure all flush operations have completed before freeing its entries. However the corresponding clear_bit() was called before using the entries, so the entries could be used after free. All access to the entries needs to be done before clearing the bit. It seems commit a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") accidentally changed the clear_bit() and memory access order. Note that the problem happens only in __dev_map_flush(), not in dev_map_flush_old(). dev_map_flush_old() is called only after nulling out the corresponding netdev_map entry, so dev_map_free() never frees the entry thus no such race happens there. Fixes: a5e2da6e9787 ("bpf: netdev is never null in __dev_map_flush") Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1e525d70f833..e001fb1a96b1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -291,10 +291,10 @@ void __dev_map_flush(struct bpf_map *map) if (unlikely(!dev)) continue; - __clear_bit(bit, bitmap); - bq = this_cpu_ptr(dev->bulkq); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); + + __clear_bit(bit, bitmap); } } -- cgit v1.2.3 From edabf4d9dd905acd60048ea1579943801e3a4876 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:14 +0900 Subject: bpf, devmap: Add missing bulk queue free dev_map_free() forgot to free bulk queue when freeing its entries. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e001fb1a96b1..a126d95d12de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -186,6 +186,7 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } -- cgit v1.2.3 From 86723c8640633bee4b4588d3c7784ee7a0032f65 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 14 Jun 2019 17:20:15 +0900 Subject: bpf, devmap: Add missing RCU read lock on flush .ndo_xdp_xmit() assumes it is called under RCU. For example virtio_net uses RCU to detect it has setup the resources for tx. The assumption accidentally broke when introducing bulk queue in devmap. Fixes: 5d053f9da431 ("bpf: devmap prepare xdp frames for bulking") Reported-by: David Ahern Signed-off-by: Toshiaki Makita Signed-off-by: Daniel Borkmann --- kernel/bpf/devmap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a126d95d12de..1defea4b2755 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -282,6 +282,7 @@ void __dev_map_flush(struct bpf_map *map) unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); u32 bit; + rcu_read_lock(); for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; @@ -297,6 +298,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); } + rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -389,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) int cpu; + rcu_read_lock(); for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); @@ -396,6 +399,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) bq = per_cpu_ptr(dev->bulkq, cpu); bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); } + rcu_read_unlock(); } } -- cgit v1.2.3 From a8e11e5c5611a9f70470aebeb2c1dd6132f609d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2019 16:22:18 -0700 Subject: sysctl: define proc_do_static_key() Convert proc_dointvec_minmax_bpf_stats() into a more generic helper, since we are going to use jump labels more often. Note that sysctl_bpf_stats_enabled is removed, since it is no longer needed/used. Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 1 - kernel/sysctl.c | 44 +++++++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7c473f208a10..080e2bb644cc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2097,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -int sysctl_bpf_stats_enabled __read_mostly; /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d1008be6173..1beca96fb625 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -230,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -#ifdef CONFIG_BPF_SYSCALL -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1253,12 +1248,10 @@ static struct ctl_table kern_table[] = { }, { .procname = "bpf_stats_enabled", - .data = &sysctl_bpf_stats_enabled, - .maxlen = sizeof(sysctl_bpf_stats_enabled), + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), .mode = 0644, - .proc_handler = proc_dointvec_minmax_bpf_stats, - .extra1 = &zero, - .extra2 = &one, + .proc_handler = proc_do_static_key, }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) @@ -3374,26 +3367,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) -static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - int ret, bpf_stats = *(int *)table->data; - struct ctl_table tmp = *table; + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &one, + }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - tmp.data = &bpf_stats; + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { - *(int *)table->data = bpf_stats; - if (bpf_stats) - static_branch_enable(&bpf_stats_enabled_key); + if (val) + static_key_enable(key); else - static_branch_disable(&bpf_stats_enabled_key); + static_key_disable(key); } + mutex_unlock(&static_key_mutex); return ret; } #endif -- cgit v1.2.3 From 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Tue, 11 Jun 2019 14:53:04 -0700 Subject: bpf: fix nested bpf tracepoints with per-cpu data BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as they do not increment bpf_prog_active while executing. This enables three levels of nesting, to support - a kprobe or raw tp or perf event, - another one of the above that irq context happens to call, and - another one in nmi context (at most one of which may be a kprobe or perf event). Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") Signed-off-by: Matt Mullins Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 100 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f92d6ad5e080..1c9a4745e596 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); - static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_sample_data *sd) @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, return perf_event_output(event, sd, regs); } +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; + struct perf_sample_data *sd; + int err; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, sd); + +out: + this_cpu_dec(bpf_trace_nest_level); + return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. */ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { @@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) regs, (unsigned long) map, - flags, 0, 0); + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { @@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, - (unsigned long) size, flags, 0); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { -- cgit v1.2.3 From 40b0b3f8fb2d8f55d13ceed41593d46689a6b496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jun 2019 07:44:46 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 230 Based on 2 normalized pattern(s): this source code is licensed under the gnu general public license version 2 see the file copying for more details this source code is licensed under general public license version 2 see extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 52 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Reviewed-by: Alexios Zavras Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190602204653.449021192@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/crash_core.c | 4 +--- kernel/kexec.c | 4 +--- kernel/kexec_core.c | 4 +--- kernel/kexec_file.c | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 093c9f917ed0..9f1557b98468 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * crash.c - kernel crash support code. * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include diff --git a/kernel/kexec.c b/kernel/kexec.c index 68559808fdfa..1b018f1a6e0d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fd5c95ff9251..d5870723b8ad 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 072b6ee55e3f..ef7b951a8087 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec: kexec_file_load system call * * Copyright (C) 2014 Red Hat Inc. * Authors: * Vivek Goyal - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -- cgit v1.2.3 From 8092f73c51567470bd79472c6eb25d2e1841fac3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Jun 2019 07:45:04 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 248 Based on 1 normalized pattern(s): this file is released under the gpl v2 extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 3 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Alexios Zavras Reviewed-by: Allison Randal Reviewed-by: Armijn Hemel Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190602204655.103854853@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/power/poweroff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7ef6866b521d..6d475281c730 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 */ #include -- cgit v1.2.3 From f85d208658468b1a298f31daddb05a7684969cd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:10:45 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 451 Based on 1 normalized pattern(s): this file is subject to the terms and conditions of version 2 of the gnu general public license see the file copying in the main directory of the linux distribution for more details extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 5 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081200.872755311@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/cgroup.c | 5 +---- kernel/bpf/lpm_trie.c | 5 +---- kernel/cgroup/pids.c | 5 +---- kernel/cgroup/rdma.c | 5 +---- 4 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fcde0f7b2585..92a7d0cf8d13 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Functions to manage eBPF programs attached to cgroups * * Copyright (c) 2016 Daniel Mack - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..1647a3f763a7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Longest prefix match list implementation * * Copyright (c) 2016,2017 Daniel Mack * Copyright (c) 2016 David Herrmann - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index c9960baaa14f..8e513a573fe9 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * @@ -25,10 +26,6 @@ * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 1d75ae7f1cb7..ae042c347c64 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * @@ -5,10 +6,6 @@ * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include -- cgit v1.2.3 From d2912cb15bdda8ba4a5dd73396ad62641af2f520 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jun 2019 10:11:33 +0200 Subject: treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 500 Based on 2 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation this program is free software you can redistribute it and or modify it under the terms of the gnu general public license version 2 as published by the free software foundation # extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 4122 file(s). Signed-off-by: Thomas Gleixner Reviewed-by: Enrico Weigelt Reviewed-by: Kate Stewart Reviewed-by: Allison Randal Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190604081206.933168790@linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/inode.c | 5 +---- kernel/compat.c | 5 +---- kernel/sched/debug.c | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 84a80b02db99..cc0d0cf114e3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. @@ -5,10 +6,6 @@ * Authors: * * Daniel Borkmann - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. */ #include diff --git a/kernel/compat.c b/kernel/compat.c index b5f7063c0db6..a2bc1d6ceb57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/compat.c * @@ -5,10 +6,6 @@ * on 64 bit kernels. * * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 678bfb9bd87f..14c6a8716ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/debug.c * * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include "sched.h" -- cgit v1.2.3